train with full libri
This commit is contained in:
parent 8985440ce1
commit 3b42f0347f
313  egs/librispeech/ASR/conformer_ctc/code_indices.py  Executable file
@@ -0,0 +1,313 @@
#!/usr/bin/env python3
# Copyright 2021 Xiaomi Corp. (author: Liyong Guo)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import logging
from pathlib import Path
from quantization import Quantizer

import torch
from asr_datamodule import LibriSpeechAsrDataModule
from conformer import Conformer
from lhotse.features.io import NumpyHdf5Writer
from lhotse import CutSet

from icefall.checkpoint import load_checkpoint
from icefall.env import get_env_info
from icefall.lexicon import Lexicon
from icefall.utils import (
    AttributeDict,
    setup_logger,
)


def get_parser():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    parser.add_argument(
        "--epoch",
        type=int,
        default=34,
        help="It specifies the checkpoint to use for decoding. "
        "Note: Epoch counts from 0.",
    )
    parser.add_argument(
        "--avg",
        type=int,
        default=1,
        help="Number of checkpoints to average. Automatically select "
        "consecutive checkpoints before the checkpoint specified by "
        "'--epoch'.",
    )

    parser.add_argument(
        "--lang-dir",
        type=str,
        default="data/lang_bpe_500",
        help="The lang dir",
    )

    parser.add_argument(
        "--exp-dir",
        type=str,
        default="conformer_ctc/exp",
        help="The experiment dir",
    )

    parser.add_argument(
        "--data-dir",
        type=Path,
        default="./data/",
        help="The data dir",
    )

    parser.add_argument(
        "--mem-dir",
        type=Path,
        default="conformer_ctc/exp/mem",
        help="The dir containing memory embeddings and quantizers",
    )

    parser.add_argument(
        "--quantizer-id",
        type=str,
        default=None,
        help="quantizer_id",
    )

    parser.add_argument(
        "--bytes-per-frame",
        type=int,
        default=4,
        help="The number of bytes used to quantize each memory embedding",
    )

    parser.add_argument(
        "--memory-embedding-dim",
        type=int,
        default=512,
        help="dim of the memory embeddings used to train the quantizer",
    )

    parser.add_argument(
        "--pretrained-model",
        type=Path,
        default=None,
        help="use a pretrained model, e.g. a model downloaded from the model zoo",
    )

    parser.add_argument(
        "--model-id",
        type=str,
        default=None,
        help="a short string identifying which model the embeddings come from, "
        "e.g. icefall or wav2vec2",
    )

    parser.add_argument(
        "--mem-layer",
        type=int,
        default=None,
        help="which layer to extract memory embeddings from. "
        "Set this manually to avoid mistakes.",
    )

    return parser


def get_params() -> AttributeDict:
    params = AttributeDict(
        {
            "feature_dim": 80,
            "nhead": 8,
            "attention_dim": 512,
            "subsampling_factor": 4,
            "num_decoder_layers": 6,
            "vgg_frontend": False,
            "use_feat_batchnorm": True,
            "output_beam": 10,
            "use_double_scores": True,
            "env_info": get_env_info(),
        }
    )
    return params


def compute_codeindices(
    model: torch.nn.Module,
    dl: torch.utils.data.DataLoader,
    quantizer: Quantizer,
    params: AttributeDict,
    writer: NumpyHdf5Writer,
) -> CutSet:
    """Compute the framewise codebook indices of a dataset.

    Args:
      model:
        The neural network model.
      dl:
        Dataloader containing the dataset.
      quantizer:
        The trained quantizer used to encode memory embeddings.
      params:
        Parameters for computing codebook indices.
      writer:
        The writer used to store the codebook indices.
    Returns:
      Return a CutSet whose cuts reference the stored codebook indices.
    """
    try:
        num_batches = len(dl)
    except TypeError:
        num_batches = "?"
    num_cuts = 0

    device = params.device
    cuts = []
    total_frames = 0
    for batch_idx, batch in enumerate(dl):
        feature = batch["inputs"]

        # at entry, feature is [N, T, C]
        assert feature.ndim == 3
        feature = feature.to(device)

        supervisions = batch["supervisions"]

        _, encoder_memory, memory_mask = model(feature, supervisions)
        codebook_indices = quantizer.encode(encoder_memory, as_bytes=True)

        # [T, N, C] --> [N, T, C]
        codebook_indices = codebook_indices.transpose(0, 1).to("cpu").numpy()

        cut_list = supervisions["cut"]
        assert len(cut_list) == codebook_indices.shape[0]
        num_cuts += len(cut_list)
        assert all(supervisions["start_frame"] == 0)
        for idx, cut in enumerate(cut_list):
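            # The model's frontend subsamples by 4: two convolutions with
            # kernel 3 and stride 2, each mapping T -> (T - 3) // 2 + 1,
            # which matches the frame_shift of 0.04 s (4 x 10 ms) below.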
            num_frames = (
                ((supervisions["num_frames"][idx] - 3) // 2 + 1) - 3
            ) // 2 + 1
            cut.codebook_indices = writer.store_array(
                key=cut.id,
                value=codebook_indices[idx][:num_frames],
                frame_shift=0.04,
                temporal_dim=0,
                start=0,
            )
            total_frames += num_frames

        cuts += cut_list
        logging.info(
            f"processed {total_frames} frames and {num_cuts} cuts; batch {batch_idx} of {num_batches}"  # noqa: E501
        )
    return CutSet.from_cuts(cuts)


@torch.no_grad()
def main():
    parser = get_parser()
    LibriSpeechAsrDataModule.add_arguments(parser)
    args = parser.parse_args()

    assert args.return_cuts is True
    assert args.concatenate_cuts is False
    assert args.quantizer_id is not None
    assert args.model_id is not None
    assert args.mem_layer is not None
    assert args.pretrained_model is not None
    assert args.subset in ["clean-100", "clean-360", "other-500"]

    params = get_params()
    params.update(vars(args))

    setup_logger(f"{params.exp_dir}/log/mem")

    logging.info("Computing memory embeddings - started")
    logging.info(params)

    lexicon = Lexicon(params.lang_dir)
    max_token_id = max(lexicon.tokens)
    num_classes = max_token_id + 1  # +1 for the blank

    logging.info("About to create model")
    model = Conformer(
        num_features=params.feature_dim,
        nhead=params.nhead,
        d_model=params.attention_dim,
        num_classes=num_classes,
        subsampling_factor=params.subsampling_factor,
        num_decoder_layers=params.num_decoder_layers,
        vgg_frontend=params.vgg_frontend,
        use_feat_batchnorm=params.use_feat_batchnorm,
    )

    quantizer_fn = (
        params.mem_dir
        / f"{params.mem_layer}layer-{params.quantizer_id}-bytes_per_frame_{params.bytes_per_frame}-quantizer.pt"  # noqa: E501
    )

    quantizer = Quantizer(
        dim=params.memory_embedding_dim,
        num_codebooks=args.bytes_per_frame,
        codebook_size=256,
    )
    quantizer.load_state_dict(torch.load(quantizer_fn))
    quantizer = quantizer.to("cuda")

    load_checkpoint(f"{params.pretrained_model}", model)

    device = torch.device("cpu")
    if torch.cuda.is_available():
        device = torch.device("cuda", 0)

    params["device"] = device

    model.to(device)
    model.eval()

    librispeech = LibriSpeechAsrDataModule(args)

    train_dl = librispeech.train_dataloaders()

    cdidx_dir = (
        Path(params.data_dir)
        / f"{args.model_id}-{args.mem_layer}layer-{args.quantizer_id}-bytes_per_frame-{args.bytes_per_frame}"  # noqa: E501
    )
    cdidx_dir.mkdir(exist_ok=True)

    with NumpyHdf5Writer(
        cdidx_dir
        / f"{args.model_id}-{args.mem_layer}layer-cdidx_train-{args.subset}"
    ) as writer:
        cut_set = compute_codeindices(
            model=model,
            dl=train_dl,
            quantizer=quantizer,
            params=params,
            writer=writer,
        )
        cut_set.to_json(cdidx_dir / f"cuts_train-{args.subset}.json.gz")


torch.set_num_threads(1)
torch.set_num_interop_threads(1)

if __name__ == "__main__":
    main()
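A hypothetical invocation of code_indices.py, using only flags defined by the
parser above together with the data-module flags it asserts on (the layer
index, quantizer id, and checkpoint path are illustrative, not taken from the
commit):

  ./conformer_ctc/code_indices.py \
    --subset clean-100 \
    --model-id icefall \
    --mem-layer 6 \
    --quantizer-id abc123 \
    --pretrained-model conformer_ctc/exp/pretrained.pt \
    --return-cuts True \
    --concatenate-cuts False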
egs/librispeech/ASR/conformer_ctc/conformer.py
@@ -23,6 +23,9 @@ from typing import Optional, Tuple
import torch
from torch import Tensor, nn
from transformer import Supervisions, Transformer, encoder_padding_mask
from prediction import JointCodebookPredictor
from ckpnt_prediction import JointCodebookLoss
from powerful_prediction import Powerful_JointCodebookLoss


class CodeIndicesNet(nn.Module):
@@ -51,18 +54,9 @@ class CodeIndicesNet(nn.Module):
        self.num_codebooks = num_codebooks
        self.quantizer_dim = quantizer_dim

    def forward(self, memory):
        """
        Args:
          memory:
            memory embeddings, with shape [T, N, C]
        output:
            shape [N, T, num_codebooks * quantizer_dim]
        """
        x = self.linear1(memory)
        return x

    def loss(self, memory: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
    def forward(
        self, memory: torch.Tensor, target: torch.Tensor
    ) -> torch.Tensor:
        """
        Args:
          memory:
@@ -75,12 +69,14 @@ class CodeIndicesNet(nn.Module):
            actually it's the sum of num_codebooks CE losses
        """
        memory = memory.transpose(0, 1)  # T, N, C --> N, T, C
        x = self.forward(memory)
        x = self.linear1(memory)
        x = x.reshape(-1, self.quantizer_dim)
        target = target.reshape(-1)
        assert (
            x.shape[0] == target.shape[0]
        ), f"x.shape: {x.shape} while target.shape: {target.shape}"
        ret = self.ce(x, target)
        return ret
        return -ret, None
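A minimal, self-contained sketch of the idea behind CodeIndicesNet (shapes,
the codebook size of 256, and the ignore index are assumptions inferred from
the surrounding code, not part of the commit): one linear layer emits
num_codebooks * quantizer_dim logits per frame, and reshaping turns every
codebook index into an ordinary cross-entropy target.

import torch
import torch.nn as nn

N, T, C = 2, 5, 512                    # batch, frames, memory dim (assumed)
num_codebooks, quantizer_dim = 4, 256  # 256 assumed from the Quantizer's codebook_size

linear = nn.Linear(C, num_codebooks * quantizer_dim)
ce = nn.CrossEntropyLoss(ignore_index=-100, reduction="sum")

memory = torch.randn(N, T, C)          # already [N, T, C]
target = torch.randint(0, quantizer_dim, (N, T, num_codebooks))

logits = linear(memory).reshape(-1, quantizer_dim)  # [N*T*num_codebooks, 256]
loss = ce(logits, target.reshape(-1))  # sum of per-codebook CE losses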
class Conformer(Transformer):
@@ -115,6 +111,9 @@ class Conformer(Transformer):
        normalize_before: bool = True,
        vgg_frontend: bool = False,
        use_feat_batchnorm: bool = False,
        use_codebook_loss: bool = False,
        num_codebooks: int = 4,
        predictor: str = "predictor",  # "simple_linear", "predictor", "ckpnt_predictor", "powerful"
    ) -> None:
        super(Conformer, self).__init__(
            num_features=num_features,
@@ -150,7 +149,27 @@ class Conformer(Transformer):
        # and throws an error without this change.
        self.after_norm = identity

        self.cdidxnet = CodeIndicesNet()
        if use_codebook_loss:
            assert predictor in [
                "powerful",
                "predictor",
                "ckpnt_predictor",
                "simple_linear",
            ]
            if predictor == "predictor":
                self.cdidxnet = JointCodebookPredictor(
                    predictor_dim=512, num_codebooks=num_codebooks
                )
            elif predictor == "ckpnt_predictor":
                self.cdidxnet = JointCodebookLoss(
                    predictor_channels=512, num_codebooks=num_codebooks
                )
            elif predictor == "simple_linear":
                self.cdidxnet = CodeIndicesNet(num_codebooks=num_codebooks)
            elif predictor == "powerful":
                self.cdidxnet = Powerful_JointCodebookLoss(
                    predictor_channels=512, num_codebooks=num_codebooks
                )

    def run_encoder(
        self, x: Tensor, supervisions: Optional[Supervisions] = None
@@ -499,10 +499,10 @@ def save_results(
    enable_log = True
    test_set_wers = dict()
    for key, results in results_dict.items():
        result_file_prefix = f"epoch-{params.epoch}-avg-{params.avg}-"
        recog_path = (
            params.exp_dir
            / f"epoch-{params.epoch}-avg-{params.avg}- \
            recogs-{test_set_name}-{key}.txt"
            / f"{result_file_prefix}recogs-{test_set_name}-{key}.txt"
        )
        store_transcripts(filename=recog_path, texts=results)
        if enable_log:
@@ -512,8 +512,7 @@ def save_results(
        # ref/hyp pairs.
        errs_filename = (
            params.exp_dir
            / f"epoch-{params.epoch}-avg-{params.avg}- \
            errs-{test_set_name}-{key}.txt"
            / f"{result_file_prefix}errs-{test_set_name}-{key}.txt"
        )
        with open(errs_filename, "w") as f:
            wer = write_error_stats(
@@ -528,9 +527,7 @@ def save_results(

    test_set_wers = sorted(test_set_wers.items(), key=lambda x: x[1])
    errs_info = (
        params.exp_dir
        / f"epoch-{params.epoch}-avg-{params.avg}- \
        wer-summary-{test_set_name}.txt"
        params.exp_dir / f"{result_file_prefix}wer-summary-{test_set_name}.txt"
    )
    with open(errs_info, "w") as f:
        print("settings\tWER", file=f)
250  egs/librispeech/ASR/conformer_ctc/memory_embedding.py  Executable file
@@ -0,0 +1,250 @@
#!/usr/bin/env python3
# Copyright 2021 Xiaomi Corp. (author: Liyong Guo)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import logging
from pathlib import Path

import torch
from asr_datamodule import LibriSpeechAsrDataModule
from conformer import Conformer
from lhotse.features.io import NumpyHdf5Writer
from lhotse import CutSet

from icefall.checkpoint import load_checkpoint
from icefall.env import get_env_info
from icefall.lexicon import Lexicon
from icefall.utils import (
    AttributeDict,
    setup_logger,
)


def get_parser():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    parser.add_argument(
        "--epoch",
        type=int,
        default=34,
        help="It specifies the checkpoint to use for decoding. "
        "Note: Epoch counts from 0.",
    )
    parser.add_argument(
        "--avg",
        type=int,
        default=1,
        help="Number of checkpoints to average. Automatically select "
        "consecutive checkpoints before the checkpoint specified by "
        "'--epoch'.",
    )

    parser.add_argument(
        "--lang-dir",
        type=str,
        default="data/lang_bpe_500",
        help="The lang dir",
    )

    parser.add_argument(
        "--exp-dir",
        type=str,
        default="conformer_ctc/exp",
        help="The experiment dir",
    )

    parser.add_argument(
        "--mem-dir",
        type=str,
        default="conformer_ctc/exp/mem",
        help="The dir to store memory embeddings",
    )

    parser.add_argument(
        "--num-utts",
        type=int,
        default=1000,
        help="number of utterances to extract memory embeddings for",
    )

    parser.add_argument(
        "--mem-layer",
        type=int,
        default=None,
        help="which layer to extract memory embeddings from",
    )
    parser.add_argument(
        "--pretrained-model",
        type=Path,
        default=None,
        help="use a pretrained model, e.g. a model downloaded from the model zoo",
    )
    return parser


def get_params() -> AttributeDict:
    params = AttributeDict(
        {
            "feature_dim": 80,
            "nhead": 8,
            "attention_dim": 512,
            "subsampling_factor": 4,
            "num_decoder_layers": 6,
            "vgg_frontend": False,
            "use_feat_batchnorm": True,
            "output_beam": 10,
            "use_double_scores": True,
            "env_info": get_env_info(),
        }
    )
    return params


def compute_memory(
    model: torch.nn.Module,
    dl: torch.utils.data.DataLoader,
    params: AttributeDict,
    writer: NumpyHdf5Writer,
) -> CutSet:
    """Compute the framewise memory embeddings of a dataset.

    Args:
      model:
        The neural network model.
      dl:
        Dataloader containing the dataset.
      params:
        Parameters for computing memory embeddings.
      writer:
        The writer used to store the memory embeddings.
    Returns:
      Return a CutSet whose cuts reference the stored memory embeddings.
    """
    num_cuts = 0

    device = params.device
    cuts = []
    total_frames = 0
    for batch_idx, batch in enumerate(dl):
        feature = batch["inputs"]

        # at entry, feature is [N, T, C]
        assert feature.ndim == 3
        feature = feature.to(device)

        supervisions = batch["supervisions"]

        _, encoder_memory, memory_mask = model(feature, supervisions)

        # [T, N, C] --> [N, T, C]
        encoder_memory = encoder_memory.transpose(0, 1).to("cpu").numpy()

        cut_list = supervisions["cut"]
        assert len(cut_list) == encoder_memory.shape[0]
        assert all(supervisions["start_frame"] == 0)
        for idx, cut in enumerate(cut_list):
            num_frames = supervisions["num_frames"][idx]
            cut.encoder_memory = writer.store_array(
                key=cut.id,
                value=encoder_memory[idx][:num_frames],
            )
            total_frames += num_frames

        cuts += cut_list
        num_cuts += len(cut_list)
        logging.info(f"processed {total_frames} frames and {num_cuts} cuts.")
        if len(cuts) > params.num_utts:
            break
    return CutSet.from_cuts(cuts)


@torch.no_grad()
def main():
    parser = get_parser()
    LibriSpeechAsrDataModule.add_arguments(parser)
    args = parser.parse_args()

    assert args.return_cuts is True
    assert args.concatenate_cuts is False

    params = get_params()
    params.update(vars(args))

    setup_logger(f"{params.exp_dir}/log/mem")

    logging.info("Computing memory embeddings - started")
    logging.info(params)

    lexicon = Lexicon(params.lang_dir)
    max_token_id = max(lexicon.tokens)
    num_classes = max_token_id + 1  # +1 for the blank

    logging.info("About to create model")
    model = Conformer(
        num_features=params.feature_dim,
        nhead=params.nhead,
        d_model=params.attention_dim,
        num_classes=num_classes,
        subsampling_factor=params.subsampling_factor,
        num_decoder_layers=params.num_decoder_layers,
        vgg_frontend=params.vgg_frontend,
        use_feat_batchnorm=params.use_feat_batchnorm,
    )
    assert params.pretrained_model is not None
    load_checkpoint(f"{params.pretrained_model}", model)

    device = torch.device("cpu")
    if torch.cuda.is_available():
        device = torch.device("cuda", 0)

    params["device"] = device

    model.to(device)
    model.eval()

    librispeech = LibriSpeechAsrDataModule(args)

    test_dl = librispeech.test_dataloaders()  # a list

    mem_dir = Path(params.mem_dir)
    mem_dir.mkdir(exist_ok=True)

    enabled_datasets = {
        "test_clean": test_dl[0],
    }

    mem_storage = mem_dir / f"{args.mem_layer}layer-memory_embeddings"
    mem_manifest = mem_dir / f"{args.mem_layer}layer-memory_manifest.json"
    with NumpyHdf5Writer(mem_storage) as writer:
        for name, dl in enabled_datasets.items():
            cut_set = compute_memory(
                model=model,
                dl=dl,
                params=params,
                writer=writer,
            )
            cut_set.to_json(mem_manifest)


torch.set_num_threads(1)
torch.set_num_interop_threads(1)

if __name__ == "__main__":
    main()
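A hypothetical invocation of memory_embedding.py (the layer index and
checkpoint path are illustrative):

  ./conformer_ctc/memory_embedding.py \
    --mem-layer 6 \
    --pretrained-model conformer_ctc/exp/pretrained.pt \
    --return-cuts True \
    --concatenate-cuts False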
143  egs/librispeech/ASR/conformer_ctc/quantizer_train.py  Executable file
@@ -0,0 +1,143 @@
#!/usr/bin/env python3
# Copyright 2021 Xiaomi Corp. (author: Liyong Guo)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import logging
import os
from pathlib import Path

from lhotse import load_manifest
from lhotse.dataset import (
    BucketingSampler,
    K2SpeechRecognitionDataset,
)
from torch.utils.data import DataLoader
from icefall.utils import setup_logger
import torch
import quantization


def get_parser():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    parser.add_argument(
        "--bytes-per-frame",
        type=int,
        default=4,
        help="The number of bytes used to quantize each memory embedding. "
        "Usually, it's equal to the number of codebooks.",
    )

    parser.add_argument(
        "--memory-embedding-dim",
        type=int,
        default=1024,
        help="dim of the memory embeddings used to train the quantizer",
    )

    parser.add_argument(
        "--mem-dir",
        type=Path,
        default="conformer_ctc/exp/mem",
        help="The dir containing memory embeddings",
    )

    parser.add_argument(
        "--output-layer-index",
        type=int,
        default=None,
        help="which layer to extract memory embeddings from. "
        "Specify this manually every time in case of mistakes.",
    )

    return parser


def initialize_memory_dataloader(
    mem_dir: Path = None, output_layer_index: int = None
):
    assert mem_dir is not None
    assert output_layer_index is not None
    mem_manifest_file = (
        mem_dir / f"{output_layer_index}layer-memory_manifest.json"
    )
    assert os.path.isfile(
        mem_manifest_file
    ), f"{mem_manifest_file} does not exist."
    cuts = load_manifest(mem_manifest_file)
    dataset = K2SpeechRecognitionDataset(return_cuts=True)
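    # max_duration is measured in seconds of audio per batch; a value of 1
    # keeps batches tiny (effectively one cut each), presumably to bound
    # the size of the embedding matrices loaded at once (assumed intent).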
    max_duration = 1
    sampler = BucketingSampler(
        cuts,
        max_duration=max_duration,
        shuffle=False,
    )
    dl = DataLoader(dataset, batch_size=None, sampler=sampler, num_workers=4)
    return dl


def main():
    parser = get_parser()
    args = parser.parse_args()
    assert args.output_layer_index is not None
    setup_logger(f"{args.mem_dir}/log/quantizer_train")
    trainer = quantization.QuantizerTrainer(
        dim=args.memory_embedding_dim,
        bytes_per_frame=args.bytes_per_frame,
        device=torch.device("cuda"),
    )
    dl = initialize_memory_dataloader(args.mem_dir, args.output_layer_index)
    num_cuts = 0
    done_flag = False
    epoch = 0
    while not trainer.done():
        for batch in dl:
            cuts = batch["supervisions"]["cut"]
            embeddings = torch.cat(
                [
                    torch.from_numpy(c.load_custom("encoder_memory"))
                    for c in cuts
                ]
            )
            embeddings = embeddings.to("cuda")
            num_cuts += len(cuts)
            trainer.step(embeddings)
            if trainer.done():
                done_flag = True
                break
        if done_flag:
            break
        else:
            epoch += 1
            dl = initialize_memory_dataloader(
                args.mem_dir, args.output_layer_index
            )
    quantizer = trainer.get_quantizer()
    quantizer_fn = (
        f"{args.output_layer_index}layer-"
        + quantizer.get_id()
        + f"-bytes_per_frame_{args.bytes_per_frame}-quantizer.pt"
    )
    quantizer_fn = args.mem_dir / quantizer_fn
    torch.save(quantizer.state_dict(), quantizer_fn)


if __name__ == "__main__":
    logging.getLogger().setLevel(logging.INFO)
    main()
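Taken together, the three new scripts form a pipeline: memory_embedding.py
dumps encoder memories for a small sample of utterances, quantizer_train.py
fits a quantizer on them, and code_indices.py uses that quantizer to write
codebook indices for a full training subset, which train.py below consumes
through the codebook loss. A hypothetical run (the layer index is
illustrative; --memory-embedding-dim should match the model's attention_dim):

  ./conformer_ctc/quantizer_train.py \
    --memory-embedding-dim 512 \
    --bytes-per-frame 4 \
    --output-layer-index 6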
egs/librispeech/ASR/conformer_ctc/train.py
@@ -30,6 +30,7 @@ import torch.multiprocessing as mp
import torch.nn as nn
from asr_datamodule import LibriSpeechAsrDataModule
from conformer import Conformer
from lhotse.cut import MonoCut
from lhotse.utils import fix_random_seed
from lhotse.dataset.collation import collate_custom_field
from torch import Tensor
@@ -65,6 +66,13 @@ def get_parser():
        help="Number of GPUs for DDP training.",
    )

    parser.add_argument(
        "--bytes-per-frame",
        type=int,
        default=4,
        help="number of codebooks",
    )

    parser.add_argument(
        "--master-port",
        type=int,
@@ -79,6 +87,13 @@ def get_parser():
        help="Should various information be logged in tensorboard.",
    )

    parser.add_argument(
        "--predictor",
        type=str,
        default=None,
        help="One of: simple_linear, predictor, ckpnt_predictor, powerful",
    )

    parser.add_argument(
        "--num-epochs",
        type=int,
@@ -103,6 +118,7 @@ def get_parser():
        help="""The experiment dir.
        It specifies the directory where all training related
        files, e.g., checkpoints, log, etc, are saved
        Note: no trailing "/".
        """,
    )

@@ -128,7 +144,7 @@ def get_parser():
    parser.add_argument(
        "--codebook-weight",
        type=float,
        default=0.1,
        default=0.3,
        help="""The weight of the codebook loss.
        Note: Currently rate of ctc_loss + rate of att_loss = 1.0;
        codebook_weight is independent of the previous two.
@@ -142,6 +158,14 @@ def get_parser():
        help="The lr_factor for Noam optimizer",
    )

    parser.add_argument(
        "--model-id",
        type=str,
        default=None,
        help="a short string identifying which model the embeddings come from, "
        "e.g. icefall or wav2vec2",
    )

    return parser


@@ -406,27 +430,42 @@ def compute_loss(
    )
    loss = (1.0 - params.att_rate) * ctc_loss + params.att_rate * att_loss

    if params.codebook_weight != 0.0:
    if params.codebook_weight > 0.0 and is_training:

        cuts = batch["supervisions"]["cut"]
        # -100 is identical to ignore_index in CE loss computation.
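        # When noise augmentation has wrapped an utterance in a MixedCut,
        # the custom codebook_indices field lives on the original cut,
        # i.e. the first track.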
        cuts_pre_mixed = [
            c if isinstance(c, MonoCut) else c.tracks[0].cut for c in cuts
        ]
        codebook_indices, codebook_indices_lens = collate_custom_field(
            cuts, "codebook_indices", pad_value=-100
            cuts_pre_mixed, "codebook_indices", pad_value=-100
        )

        assert (
            codebook_indices.shape[0] == encoder_memory.shape[1]
        )  # N: batch_size
        assert (
            codebook_indices.shape[1] == encoder_memory.shape[0]
        )  # T: num frames

        if "wav2vec" == params.model_id:
            # The frame rate of wav2vec codebook indices is 50 Hz,
            # while for the conformer it is 25 Hz.
            t_expected = encoder_memory.shape[0] * 2
            assert codebook_indices.shape[1] >= t_expected
            codebook_indices = codebook_indices[:, 0:t_expected:2, :]
        encoder_memory = encoder_memory.transpose(0, 1)  # T, N, C --> N, T, C
        codebook_indices = codebook_indices.to(encoder_memory.device).long()
        codebook_loss = mmodel.cdidxnet.loss(
            encoder_memory, target=codebook_indices
        )
        if (
            params.predictor == "ckpnt_predictor"
            or params.predictor == "powerful"
        ):
            codebook_loss = mmodel.cdidxnet(encoder_memory, codebook_indices)
        else:
            total_logprob, _ = mmodel.cdidxnet(encoder_memory, codebook_indices)
            codebook_loss = -total_logprob

        loss += params.codebook_weight * codebook_loss
    else:

    if params.codebook_weight == 0.0 and params.att_rate == 0.0:
        loss = ctc_loss
        att_loss = torch.tensor([0])

@@ -438,7 +477,7 @@ def compute_loss(
    if params.att_rate != 0.0:
        info["att_loss"] = att_loss.detach().cpu().item()

    if params.codebook_weight != 0.0:
    if params.codebook_weight > 0.0 and is_training:
        info["codebook_loss"] = codebook_loss.detach().cpu().item()

    info["loss"] = loss.detach().cpu().item()
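With the defaults changed in this commit, the training objective becomes
loss = (1 - att_rate) * ctc_loss + att_rate * att_loss
     + codebook_weight * codebook_loss,
so with an illustrative att_rate of 0.8 and the new codebook_weight of 0.3,
a batch contributes 0.2 * ctc + 0.8 * att + 0.3 * codebook loss.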
@@ -633,6 +672,9 @@ def run(rank, world_size, args):
        num_decoder_layers=params.num_decoder_layers,
        vgg_frontend=False,
        use_feat_batchnorm=params.use_feat_batchnorm,
        use_codebook_loss=True if params.codebook_weight > 0.0 else False,
        num_codebooks=params.bytes_per_frame,
        predictor=params.predictor,
    )

    checkpoints = load_checkpoint_if_available(params=params, model=model)
@@ -747,7 +789,12 @@ def main():
    parser = get_parser()
    LibriSpeechAsrDataModule.add_arguments(parser)
    args = parser.parse_args()
    args.exp_dir = Path(args.exp_dir)
    if 0.0 != args.codebook_weight:
        assert -1 == args.time_warp_factor
        assert not args.exp_dir.endswith("/")
        args.exp_dir = Path(
            f"{args.exp_dir}-time_warp_factor{args.time_warp_factor}-bytes_per_frame{args.bytes_per_frame}-cdweight{args.codebook_weight}-predictor{args.predictor}-maxduration{args.max_duration}"  # noqa: E501
        )
    args.lang_dir = Path(args.lang_dir)

    world_size = args.world_size
egs/librispeech/ASR/conformer_ctc/asr_datamodule.py
@@ -31,7 +31,7 @@ from lhotse.dataset import (
    SingleCutSampler,
    SpecAugment,
)
from lhotse.dataset.input_strategies import OnTheFlyFeatures
from lhotse.dataset.input_strategies import AudioSamples, OnTheFlyFeatures
from torch.utils.data import DataLoader

from icefall.dataset.datamodule import DataModule
@@ -73,6 +73,21 @@ class LibriSpeechAsrDataModule(DataModule):
            help="When enabled, use 960h LibriSpeech. "
            "Otherwise, use 100h subset.",
        )
        parser.add_argument(
            "--subset",
            type=str,
            default=None,
            help="which subset to extract codebook indexes from: "
            "clean-100, clean-360, other-500",
        )

        group.add_argument(
            "--enable-augmentation",
            type=str2bool,
            default=True,
            help="Set to False to disable all augmentation. "
            "Used when extracting codebook_indexes.",
        )
        group.add_argument(
            "--feature-dir",
            type=Path,
@@ -100,6 +115,13 @@ class LibriSpeechAsrDataModule(DataModule):
            help="The number of buckets for the BucketingSampler "
            "(you might want to increase it for larger datasets).",
        )
        group.add_argument(
            "--time-warp-factor",
            type=int,
            default=80,
            help="Set to None or less than 1 to disable; "
            "details in lhotse.dataset.signal_transforms",
        )
        group.add_argument(
            "--concatenate-cuts",
            type=str2bool,
@@ -154,7 +176,16 @@ class LibriSpeechAsrDataModule(DataModule):
            "collect the batches.",
        )

        group.add_argument(
            "--input-strategy",
            type=str,
            default="PrecomputedFeatures",
            help="AudioSamples or PrecomputedFeatures",
        )

    def train_dataloaders(self) -> DataLoader:
        logging.info(f"enable-augmentation: {self.args.enable_augmentation}")
        logging.info("About to get train cuts")
        cuts_train = self.train_cuts()

@@ -181,6 +212,7 @@ class LibriSpeechAsrDataModule(DataModule):

        input_transforms = [
            SpecAugment(
                time_warp_factor=self.args.time_warp_factor,
                num_frame_masks=2,
                features_mask_size=27,
                num_feature_masks=2,
@@ -189,12 +221,21 @@ class LibriSpeechAsrDataModule(DataModule):
        ]

        train = K2SpeechRecognitionDataset(
            cut_transforms=transforms,
            input_transforms=input_transforms,
            input_strategy=AudioSamples()
            if self.args.input_strategy == "AudioSamples"
            else PrecomputedFeatures(),
            cut_transforms=transforms
            if self.args.enable_augmentation
            else None,
            input_transforms=input_transforms
            if self.args.enable_augmentation
            else None,
            return_cuts=self.args.return_cuts,
        )

        if self.args.on_the_fly_feats:
            assert self.args.enable_augmentation
            # enable_augmentation == False is only tested with precomputed features.  # noqa
            # NOTE: the PerturbSpeed transform should be added only if we
            # remove it from data prep stage.
            # Add on-the-fly speed perturbation; since originally it would
@@ -222,7 +263,7 @@ class LibriSpeechAsrDataModule(DataModule):
                shuffle=self.args.shuffle,
                num_buckets=self.args.num_buckets,
                bucket_method="equal_duration",
                drop_last=True,
                drop_last=True if self.args.enable_augmentation else False,
            )
        else:
            logging.info("Using SingleCutSampler.")
@@ -294,14 +335,20 @@ class LibriSpeechAsrDataModule(DataModule):

        for cuts_test in cuts:
            logging.debug("About to create test dataset")
            test = K2SpeechRecognitionDataset(
                input_strategy=OnTheFlyFeatures(
                    Fbank(FbankConfig(num_mel_bins=80))
            if self.args.input_strategy == "AudioSamples":
                test = K2SpeechRecognitionDataset(
                    input_strategy=AudioSamples(),
                    return_cuts=self.args.return_cuts,
                )
            else:
                test = K2SpeechRecognitionDataset(
                    input_strategy=OnTheFlyFeatures(
                        Fbank(FbankConfig(num_mel_bins=80))
                    )
                    if self.args.on_the_fly_feats
                    else PrecomputedFeatures(),
                    return_cuts=self.args.return_cuts,
                )
                if self.args.on_the_fly_feats
                else PrecomputedFeatures(),
                return_cuts=self.args.return_cuts,
            )
            sampler = BucketingSampler(
                cuts_test, max_duration=self.args.max_duration, shuffle=False
            )
@@ -322,19 +369,26 @@ class LibriSpeechAsrDataModule(DataModule):
    @lru_cache()
    def train_cuts(self) -> CutSet:
        logging.info("About to get train cuts")
        cuts_train = load_manifest(
            self.args.feature_dir / "cuts_train-clean-100.json.gz"
        )
        if self.args.full_libri:
            assert self.args.subset is None
            cuts_train = load_manifest(
                self.args.feature_dir / "cuts_train-clean-100.json"
            )
            cuts_train = (
                cuts_train
                + load_manifest(
                    self.args.feature_dir / "cuts_train-clean-360.json.gz"
                    self.args.feature_dir / "cuts_train-clean-360.json"
                )
                + load_manifest(
                    self.args.feature_dir / "cuts_train-other-500.json.gz"
                    self.args.feature_dir / "cuts_train-other-500.json"
                )
            )
        if self.args.subset is not None:
            assert not self.args.full_libri
            assert self.args.subset in ["clean-100", "clean-360", "other-500"]
            cuts_train = load_manifest(
                self.args.feature_dir / f"cuts_train-{self.args.subset}.json.gz"
            )
        return cuts_train

    @lru_cache()
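A hypothetical full-LibriSpeech training command exercising the new flags
(values are illustrative; note that main() above asserts
time-warp-factor == -1 whenever codebook-weight is non-zero):

  ./conformer_ctc/train.py \
    --full-libri True \
    --codebook-weight 0.3 \
    --bytes-per-frame 4 \
    --predictor ckpnt_predictor \
    --model-id icefall \
    --time-warp-factor -1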