Mirror of https://github.com/k2-fsa/icefall.git (synced 2025-08-08 17:42:21 +00:00)

Add CTC training.

commit f3542c7793 (parent a01d08f73c)
.gitignore (vendored, 1 line added)
@@ -1,3 +1,4 @@
 data
 __pycache__
 path.sh
+exp
@@ -15,7 +15,7 @@ repos:
     rev: 5.9.2
     hooks:
       - id: isort
-        args: [--profile=black]
+        args: [--profile=black, --line-length=80]

 - repo: https://github.com/pre-commit/pre-commit-hooks
   rev: v4.0.1
egs/librispeech/ASR/local/compile_hlg.py (new file, 82 lines)
@@ -0,0 +1,82 @@
#!/usr/bin/env python3

"""
This script compiles HLG from

- H, the ctc topology, built from phones contained in data/lang/lexicon.txt
- L, the lexicon, built from data/lang/L_disambig.pt

  Caution: We use a lexicon that contains disambiguation symbols

- G, the LM, built from data/lm/G_3_gram.fst.txt

The generated HLG is saved in data/lm/HLG.pt
"""
import k2
import torch

from icefall.lexicon import Lexicon


def main():
    lexicon = Lexicon("data/lang")
    max_token_id = max(lexicon.tokens)
    H = k2.ctc_topo(max_token_id)
    L = k2.Fsa.from_dict(torch.load("data/lang/L_disambig.pt"))
    with open("data/lm/G_3_gram.fst.txt") as f:
        G = k2.Fsa.from_openfst(f.read(), acceptor=False)

    first_token_disambig_id = lexicon.phones["#0"]
    first_word_disambig_id = lexicon.words["#0"]

    L = k2.arc_sort(L)
    G = k2.arc_sort(G)

    print("Intersecting L and G")
    LG = k2.compose(L, G)
    print(f"LG shape: {LG.shape}")

    print("Connecting LG")
    LG = k2.connect(LG)
    print(f"LG shape after k2.connect: {LG.shape}")

    print(type(LG.aux_labels))
    print("Determinizing LG")

    LG = k2.determinize(LG)
    print(type(LG.aux_labels))

    print("Connecting LG after k2.determinize")
    LG = k2.connect(LG)

    print("Removing disambiguation symbols on LG")

    LG.labels[LG.labels >= first_token_disambig_id] = 0

    assert isinstance(LG.aux_labels, k2.RaggedInt)
    LG.aux_labels.values()[LG.aux_labels.values() >= first_word_disambig_id] = 0

    LG = k2.remove_epsilon(LG)
    print(f"LG shape after k2.remove_epsilon: {LG.shape}")

    LG = k2.connect(LG)
    LG.aux_labels = k2.ragged.remove_values_eq(LG.aux_labels, 0)

    print("Arc sorting LG")
    LG = k2.arc_sort(LG)

    print("Composing H and LG")
    HLG = k2.compose(H, LG, inner_labels="phones")

    print("Connecting HLG")
    HLG = k2.connect(HLG)

    print("Arc sorting HLG")
    HLG = k2.arc_sort(HLG)

    print("Saving HLG.pt to data/lm")
    torch.save(HLG.as_dict(), "data/lm/HLG.pt")


if __name__ == "__main__":
    main()
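As a quick sanity check, the saved graph can be loaded back the same way decode.py later in this commit does. A minimal sketch, not part of the commit; the path is the one used by the script above:

```python
import k2
import torch

# Load the graph that compile_hlg.py just wrote.
HLG = k2.Fsa.from_dict(torch.load("data/lm/HLG.pt"))
assert HLG.requires_grad is False
print("HLG shape:", HLG.shape)  # a single FSA reports (num_states, None)
```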
@@ -231,14 +231,18 @@ def add_self_loops(
       arcs:
         A list-of-list. The sublist contains
         `[src_state, dest_state, label, aux_label, score]`
+      disambig_phone:
+        It is the phone ID of the symbol `#0`.
+      disambig_word:
+        It is the word ID of the symbol `#0`.
     Return:
-      Return new `arcs` that contain self-loops.
+      Return new `arcs` containing self-loops.
     """
     states_needs_self_loops = set()
     for arc in arcs:
-        src, dst, ilable, olable, score = arc
-        if olable != 0:
+        src, dst, ilabel, olabel, score = arc
+        if olabel != 0:
             states_needs_self_loops.add(src)

     ans = []
@@ -396,11 +400,11 @@ def main():
         sil_prob=sil_prob,
         need_self_loops=True,
     )
+    # Just for debugging, will remove it
+    torch.save(L.as_dict(), out_dir / "L.pt")
+    torch.save(L_disambig.as_dict(), out_dir / "L_disambig.pt")

     if False:
-        # Just for debugging, will remove it
-        torch.save(L.as_dict(), out_dir / "L.pt")
-        torch.save(L_disambig.as_dict(), out_dir / "L_disambig.pt")
-
         L.labels_sym = k2.SymbolTable.from_file(out_dir / "phones.txt")
         L.aux_labels_sym = k2.SymbolTable.from_file(out_dir / "words.txt")
@@ -80,7 +80,18 @@ def test_read_lexicon(filename: str):
     fsa_disambig.draw("L_disambig.pdf", title="L_disambig")


-if __name__ == "__main__":
+def test_lexicon():
+    from icefall.lexicon import Lexicon
+
+    lexicon = Lexicon("data/lang")
+    print(lexicon.tokens)
+
+
+def main():
     filename = generate_lexicon_file()
     test_read_lexicon(filename)
     os.remove(filename)
+
+
+if __name__ == "__main__":
+    test_lexicon()
@@ -87,3 +87,24 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then

   ./local/prepare_lang.py
 fi
+
+if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
+  echo "Stage 6: Prepare G"
+  # We assume you have installed kaldilm. If not, please install
+  # it using: pip install kaldilm
+
+  if [ ! -e data/lm/G_3_gram.fst.txt ]; then
+    python3 -m kaldilm \
+      --read-symbol-table="data/lang/words.txt" \
+      --disambig-symbol='#0' \
+      --max-order=3 \
+      data/lm/3-gram.pruned.1e-7.arpa > data/lm/G_3_gram.fst.txt
+  fi
+fi
+
+if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
+  echo "Stage 7: Compile HLG"
+  if [ ! -f data/lm/HLG.pt ]; then
+    python3 ./local/compile_hlg.py
+  fi
+fi
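Stage 6 writes G_3_gram.fst.txt in OpenFst text form and stage 7 consumes it through compile_hlg.py. A small sketch, not part of the commit, of how that file is read back with k2, mirroring the script shown earlier:

```python
import k2

# Read the OpenFst text file produced by kaldilm in stage 6, exactly as
# compile_hlg.py (stage 7) does; acceptor=False keeps word IDs on both
# input and output labels.
with open("data/lm/G_3_gram.fst.txt") as f:
    G = k2.Fsa.from_openfst(f.read(), acceptor=False)
print("G has", G.num_arcs, "arcs")
```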
egs/librispeech/ASR/tdnn_lstm_ctc/README.md (new file, 14 lines)
@@ -0,0 +1,14 @@
## (To be filled in)

It will contain:

- How to run
- WERs

```bash
cd $PWD/..

./prepare.sh

./tdnn_lstm_ctc/train.py
```
egs/librispeech/ASR/tdnn_lstm_ctc/__init__.py (new empty file)
egs/librispeech/ASR/tdnn_lstm_ctc/decode.py (new executable file, 210 lines)
@@ -0,0 +1,210 @@
#!/usr/bin/env python3


import argparse
import logging
from pathlib import Path
from typing import List, Tuple

import k2
import torch
import torch.nn as nn
from model import TdnnLstm

from icefall.checkpoint import average_checkpoints, load_checkpoint
from icefall.dataset.librispeech import LibriSpeechAsrDataModule
from icefall.lexicon import Lexicon
from icefall.utils import (
    AttributeDict,
    get_texts,
    setup_logger,
    write_error_stats,
)


def get_parser():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    parser.add_argument(
        "--epoch",
        type=int,
        default=9,
        help="It specifies the checkpoint to use for decoding."
        "Note: Epoch counts from 0.",
    )
    parser.add_argument(
        "--avg",
        type=int,
        default=5,
        help="Number of checkpoints to average. Automatically select "
        "consecutive checkpoints before the checkpoint specified by "
        "'--epoch'. ",
    )
    return parser


def get_params() -> AttributeDict:
    params = AttributeDict(
        {
            "exp_dir": Path("tdnn_lstm_ctc/exp3/"),
            "lang_dir": Path("data/lang"),
            "feature_dim": 80,
            "subsampling_factor": 3,
            "search_beam": 20,
            "output_beam": 8,
            "min_active_states": 30,
            "max_active_states": 10000,
        }
    )
    return params


@torch.no_grad()
def decode_one_batch(
    params: AttributeDict,
    model: nn.Module,
    HLG: k2.Fsa,
    batch: dict,
    lexicon: Lexicon,
) -> List[Tuple[List[str], List[str]]]:
    """Decode one batch and return a list of tuples containing
    `(ref_words, hyp_words)`.

    Args:
      params:
        It is the return value of :func:`get_params`.
    """
    device = HLG.device
    feature = batch["inputs"]
    assert feature.ndim == 3
    feature = feature.to(device)
    # at entry, feature is [N, T, C]

    feature = feature.permute(0, 2, 1)  # now feature is [N, C, T]

    nnet_output = model(feature)
    # nnet_output is [N, T, C]

    supervisions = batch["supervisions"]

    supervision_segments = torch.stack(
        (
            supervisions["sequence_idx"],
            supervisions["start_frame"] // params.subsampling_factor,
            supervisions["num_frames"] // params.subsampling_factor,
        ),
        1,
    ).to(torch.int32)

    dense_fsa_vec = k2.DenseFsaVec(nnet_output, supervision_segments)

    lattices = k2.intersect_dense_pruned(
        HLG,
        dense_fsa_vec,
        search_beam=params.search_beam,
        output_beam=params.output_beam,
        min_active_states=params.min_active_states,
        max_active_states=params.max_active_states,
    )

    best_paths = k2.shortest_path(lattices, use_double_scores=True)

    hyps = get_texts(best_paths)
    hyps = [[lexicon.words[i] for i in ids] for ids in hyps]

    texts = supervisions["text"]

    results = []
    for hyp_words, ref_text in zip(hyps, texts):
        ref_words = ref_text.split()
        results.append((ref_words, hyp_words))
    return results


def main():
    parser = get_parser()
    LibriSpeechAsrDataModule.add_arguments(parser)
    args = parser.parse_args()

    params = get_params()
    params.update(vars(args))

    setup_logger(f"{params.exp_dir}/log/log-decode")
    logging.info("Decoding started")
    logging.info(params)

    lexicon = Lexicon(params.lang_dir)
    max_phone_id = max(lexicon.tokens)

    device = torch.device("cpu")
    if torch.cuda.is_available():
        device = torch.device("cuda", 0)

    HLG = k2.Fsa.from_dict(torch.load("data/lm/HLG.pt"))
    HLG = HLG.to(device)
    assert HLG.requires_grad is False

    model = TdnnLstm(
        num_features=params.feature_dim,
        num_classes=max_phone_id + 1,  # +1 for the blank symbol
        subsampling_factor=params.subsampling_factor,
    )
    if params.avg == 1:
        load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
    else:
        start = params.epoch - params.avg + 1
        filenames = []
        for i in range(start, params.epoch + 1):
            if start >= 0:
                filenames.append(f"{params.exp_dir}/epoch-{i}.pt")
        logging.info(f"averaging {filenames}")
        model.load_state_dict(average_checkpoints(filenames))

    model.to(device)
    model.eval()

    librispeech = LibriSpeechAsrDataModule(args)
    # CAUTION: `test_sets` is for displaying only.
    # If you want to skip test-clean, you have to skip
    # it inside the for loop. That is, use
    #
    #   if test_set == 'test-clean': continue
    #
    test_sets = ["test-clean", "test-other"]
    for test_set, test_dl in zip(test_sets, librispeech.test_dataloaders()):
        tot_num_cuts = len(test_dl.dataset.cuts)
        num_cuts = 0

        results = []
        for batch_idx, batch in enumerate(test_dl):
            this_batch = decode_one_batch(
                params=params,
                model=model,
                HLG=HLG,
                batch=batch,
                lexicon=lexicon,
            )
            results.extend(this_batch)

            num_cuts += len(batch["supervisions"]["text"])

            if batch_idx % 100 == 0:
                logging.info(
                    f"batch {batch_idx}, cuts processed until now is "
                    f"{num_cuts}/{tot_num_cuts} "
                    f"({float(num_cuts)/tot_num_cuts*100:.6f}%)"
                )

        errs_filename = params.exp_dir / f"errs-{test_set}.txt"
        with open(errs_filename, "w") as f:
            write_error_stats(f, test_set, results)

    logging.info("Done!")


if __name__ == "__main__":
    main()
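The --epoch/--avg options select consecutive checkpoints ending at --epoch. A tiny sketch, not part of the commit, of the filename selection using the defaults above:

```python
# With --epoch 9 --avg 5, decode.py averages the last five epoch checkpoints.
epoch, avg = 9, 5
start = epoch - avg + 1
filenames = [f"tdnn_lstm_ctc/exp3/epoch-{i}.pt" for i in range(start, epoch + 1)]
print(filenames)  # epoch-5.pt ... epoch-9.pt, fed to average_checkpoints()
```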
egs/librispeech/ASR/tdnn_lstm_ctc/model.py (new file, 86 lines)
@@ -0,0 +1,86 @@
import torch
import torch.nn as nn


class TdnnLstm(nn.Module):
    def __init__(
        self, num_features: int, num_classes: int, subsampling_factor: int = 3
    ) -> None:
        """
        Args:
          num_features:
            The input dimension of the model.
          num_classes:
            The output dimension of the model.
          subsampling_factor:
            It reduces the number of output frames by this factor.
        """
        super().__init__()
        self.num_features = num_features
        self.num_classes = num_classes
        self.subsampling_factor = subsampling_factor
        self.tdnn = nn.Sequential(
            nn.Conv1d(
                in_channels=num_features,
                out_channels=500,
                kernel_size=3,
                stride=1,
                padding=1,
            ),
            nn.ReLU(inplace=True),
            nn.BatchNorm1d(num_features=500, affine=False),
            nn.Conv1d(
                in_channels=500,
                out_channels=500,
                kernel_size=3,
                stride=1,
                padding=1,
            ),
            nn.ReLU(inplace=True),
            nn.BatchNorm1d(num_features=500, affine=False),
            nn.Conv1d(
                in_channels=500,
                out_channels=500,
                kernel_size=3,
                stride=self.subsampling_factor,  # stride: subsampling_factor!
                padding=1,
            ),
            nn.ReLU(inplace=True),
            nn.BatchNorm1d(num_features=500, affine=False),
        )
        self.lstms = nn.ModuleList(
            [
                nn.LSTM(input_size=500, hidden_size=500, num_layers=1)
                for _ in range(5)
            ]
        )
        self.lstm_bnorms = nn.ModuleList(
            [nn.BatchNorm1d(num_features=500, affine=False) for _ in range(5)]
        )
        self.dropout = nn.Dropout(0.2)
        self.linear = nn.Linear(in_features=500, out_features=self.num_classes)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
          x:
            Its shape is [N, C, T]

        Returns:
          The output tensor has shape [N, T, C]
        """
        x = self.tdnn(x)
        x = x.permute(2, 0, 1)  # (N, C, T) -> (T, N, C) -> how LSTM expects it
        for lstm, bnorm in zip(self.lstms, self.lstm_bnorms):
            x_new, _ = lstm(x)
            x_new = bnorm(x_new.permute(1, 2, 0)).permute(
                2, 0, 1
            )  # (T, N, C) -> (N, C, T) -> (T, N, C)
            x_new = self.dropout(x_new)
            x = x_new + x  # skip connections
        x = x.transpose(
            1, 0
        )  # (T, N, C) -> (N, T, C) -> linear expects "features" in the last dim
        x = self.linear(x)
        x = nn.functional.log_softmax(x, dim=-1)
        return x
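A quick shape check, not part of the commit, for the model above; the 72-class output size is only an illustrative assumption:

```python
import torch

# Feed a random [N, C, T] batch through the TdnnLstm defined above.
model = TdnnLstm(num_features=80, num_classes=72, subsampling_factor=3)
x = torch.randn(2, 80, 100)  # [N, C, T]
y = model(x)
print(y.shape)  # [N, T', num_classes]; T' is roughly T / subsampling_factor
assert y.shape[0] == 2 and y.shape[2] == 72
```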
egs/librispeech/ASR/tdnn_lstm_ctc/train.py (new executable file, 493 lines)
@@ -0,0 +1,493 @@
#!/usr/bin/env python3

# This is just at the very beginning ...

import argparse
import logging
from pathlib import Path
from shutil import copyfile
from typing import Optional

import k2
import torch
import torch.nn as nn
import torch.optim as optim
from model import TdnnLstm
from torch.nn.utils import clip_grad_value_
from torch.optim.lr_scheduler import StepLR
from torch.utils.tensorboard import SummaryWriter

from icefall.checkpoint import load_checkpoint
from icefall.checkpoint import save_checkpoint as save_checkpoint_impl
from icefall.dataset.librispeech import LibriSpeechAsrDataModule
from icefall.graph_compiler import CtcTrainingGraphCompiler
from icefall.lexicon import Lexicon
from icefall.utils import AttributeDict, encode_supervisions, setup_logger


def get_parser():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    parser.add_argument(
        "--world-size",
        type=int,
        default=1,
        help="Number of GPUs for DDP training.",
    )

    parser.add_argument(
        "--master-port",
        type=int,
        default=12354,
        help="Master port to use for DDP training.",
    )
    # TODO: add extra arguments and support DDP training.
    # Currently, only single GPU training is implemented. Will add
    # DDP training once single GPU training is finished.
    return parser


def get_params() -> AttributeDict:
    """Return a dict containing training parameters.

    All training related parameters that are not passed from the commandline
    are saved in the variable `params`.

    Commandline options are merged into `params` after they are parsed, so
    you can also access them via `params`.

    Explanation of options saved in `params`:

    - exp_dir: It specifies the directory where all training related
               files, e.g., checkpoints, log, etc., are saved

    - lang_dir: It contains language related input files such as
                "lexicon.txt"

    - lr: It specifies the initial learning rate

    - feature_dim: The model input dim. It has to match the one used
                   in computing features.

    - weight_decay: The weight_decay for the optimizer.

    - subsampling_factor: The subsampling factor for the model.

    - start_epoch: If it is not zero, load checkpoint `start_epoch-1`
                   and continue training from that checkpoint.

    - num_epochs: Number of epochs to train.

    - best_train_loss: Best training loss so far. It is used to select
                       the model that has the lowest training loss. It is
                       updated during the training.

    - best_valid_loss: Best validation loss so far. It is used to select
                       the model that has the lowest validation loss. It is
                       updated during the training.

    - best_train_epoch: It is the epoch that has the best training loss.

    - best_valid_epoch: It is the epoch that has the best validation loss.

    - batch_idx_train: Used for writing statistics to tensorboard. It
                       contains the number of batches trained so far across
                       epochs.

    - log_interval: Print training loss if `batch_idx % log_interval` is 0

    - valid_interval: Run validation if `batch_idx % valid_interval` is 0

    - beam_size: It is used in k2.ctc_loss

    - reduction: It is used in k2.ctc_loss

    - use_double_scores: It is used in k2.ctc_loss
    """
    params = AttributeDict(
        {
            "exp_dir": Path("tdnn_lstm_ctc/exp"),
            "lang_dir": Path("data/lang"),
            "lr": 1e-3,
            "feature_dim": 80,
            "weight_decay": 5e-4,
            "subsampling_factor": 3,
            "start_epoch": 0,
            "num_epochs": 10,
            "best_train_loss": float("inf"),
            "best_valid_loss": float("inf"),
            "best_train_epoch": -1,
            "best_valid_epoch": -1,
            "batch_idx_train": 0,
            "log_interval": 10,
            "valid_interval": 1000,
            "beam_size": 10,
            "reduction": "sum",
            "use_double_scores": True,
        }
    )

    return params


def load_checkpoint_if_available(
    params: AttributeDict,
    model: nn.Module,
    optimizer: torch.optim.Optimizer,
    scheduler: torch.optim.lr_scheduler._LRScheduler = None,
) -> None:
    """Load checkpoint from file.

    If params.start_epoch is positive, it will load the checkpoint from
    `params.start_epoch - 1`. Otherwise, this function does nothing.

    Apart from loading state dict for `model`, `optimizer` and `scheduler`,
    it also updates `best_train_epoch`, `best_train_loss`, `best_valid_epoch`,
    and `best_valid_loss` in `params`.

    Args:
      params:
        The return value of :func:`get_params`.
      model:
        The training model.
      optimizer:
        The optimizer that we are using.
      scheduler:
        The learning rate scheduler we are using.
    Returns:
      Return None.
    """
    if params.start_epoch <= 0:
        return

    filename = params.exp_dir / f"epoch-{params.start_epoch-1}.pt"
    saved_params = load_checkpoint(
        filename,
        model=model,
        optimizer=optimizer,
        scheduler=scheduler,
    )

    keys = [
        "best_train_epoch",
        "best_valid_epoch",
        "batch_idx_train",
        "best_train_loss",
        "best_valid_loss",
    ]
    for k in keys:
        params[k] = saved_params[k]


def save_checkpoint(
    params: AttributeDict,
    model: nn.Module,
    optimizer: torch.optim.Optimizer,
    scheduler: torch.optim.lr_scheduler._LRScheduler,
) -> None:
    """Save model, optimizer, scheduler and training stats to file.

    Args:
      params:
        It is returned by :func:`get_params`.
      model:
        The training model.
    """
    filename = params.exp_dir / f"epoch-{params.cur_epoch}.pt"
    save_checkpoint_impl(
        filename=filename,
        model=model,
        params=params,
        optimizer=optimizer,
        scheduler=scheduler,
    )

    if params.best_train_epoch == params.cur_epoch:
        best_train_filename = params.exp_dir / "best-train-loss.pt"
        copyfile(src=filename, dst=best_train_filename)

    if params.best_valid_epoch == params.cur_epoch:
        best_valid_filename = params.exp_dir / "best-valid-loss.pt"
        copyfile(src=filename, dst=best_valid_filename)


def compute_loss(
    params: AttributeDict,
    model: nn.Module,
    batch: dict,
    graph_compiler: CtcTrainingGraphCompiler,
    is_training: bool,
):
    """
    Compute CTC loss given the model and its inputs.

    Args:
      params:
        Parameters for training. See :func:`get_params`.
      model:
        The model for training. It is an instance of TdnnLstm in our case.
      batch:
        A batch of data. See `lhotse.dataset.K2SpeechRecognitionDataset()`
        for the content in it.
      graph_compiler:
        It is used to build a decoding graph from a ctc topo and training
        transcript. The training transcript is contained in the given `batch`,
        while the ctc topo is built when this compiler is instantiated.
      is_training:
        True for training. False for validation. When it is True, this
        function enables autograd during computation; when it is False, it
        disables autograd.
    """
    device = graph_compiler.device
    feature = batch["inputs"]
    # at entry, feature is [N, T, C]
    feature = feature.permute(0, 2, 1)  # now feature is [N, C, T]
    assert feature.ndim == 3
    feature = feature.to(device)

    with torch.set_grad_enabled(is_training):
        nnet_output = model(feature)
        # nnet_output is [N, T, C]

    # NOTE: We need `encode_supervisions` to sort sequences with
    # different duration in decreasing order, required by
    # `k2.intersect_dense` called in `k2.ctc_loss`
    supervisions = batch["supervisions"]
    supervision_segments, texts = encode_supervisions(
        supervisions, subsampling_factor=params.subsampling_factor
    )
    decoding_graph = graph_compiler.compile(texts)

    dense_fsa_vec = k2.DenseFsaVec(
        nnet_output,
        supervision_segments,
        allow_truncate=params.subsampling_factor - 1,
    )

    loss = k2.ctc_loss(
        decoding_graph=decoding_graph,
        dense_fsa_vec=dense_fsa_vec,
        output_beam=params.beam_size,
        reduction=params.reduction,
        use_double_scores=params.use_double_scores,
    )

    assert loss.requires_grad == is_training

    # train_frames and valid_frames are used for printing.
    if is_training:
        params.train_frames = supervision_segments[:, 2].sum().item()
    else:
        params.valid_frames = supervision_segments[:, 2].sum().item()

    return loss


def compute_validation_loss(
    params: AttributeDict,
    model: nn.Module,
    graph_compiler: CtcTrainingGraphCompiler,
    valid_dl: torch.utils.data.DataLoader,
) -> None:
    """Run the validation process. The validation loss
    is saved in `params.valid_loss`.
    """
    model.eval()

    tot_loss = 0.0
    tot_frames = 0.0
    for batch_idx, batch in enumerate(valid_dl):
        loss = compute_loss(
            params=params,
            model=model,
            batch=batch,
            graph_compiler=graph_compiler,
            is_training=False,
        )
        assert loss.requires_grad is False

        loss_cpu = loss.detach().cpu().item()
        tot_loss += loss_cpu
        tot_frames += params.valid_frames

    params.valid_loss = tot_loss / tot_frames

    if params.valid_loss < params.best_valid_loss:
        params.best_valid_epoch = params.cur_epoch
        params.best_valid_loss = params.valid_loss


def train_one_epoch(
    params: AttributeDict,
    model: nn.Module,
    optimizer: torch.optim.Optimizer,
    graph_compiler: CtcTrainingGraphCompiler,
    train_dl: torch.utils.data.DataLoader,
    valid_dl: torch.utils.data.DataLoader,
    tb_writer: Optional[SummaryWriter] = None,
) -> None:
    """Train the model for one epoch.

    The training loss from the mean of all frames is saved in
    `params.train_loss`. It runs the validation process every
    `params.valid_interval` batches.

    Args:
      params:
        It is returned by :func:`get_params`.
      model:
        The model for training.
      optimizer:
        The optimizer we are using.
      graph_compiler:
        It is used to convert transcripts to FSAs.
      train_dl:
        Dataloader for the training dataset.
      valid_dl:
        Dataloader for the validation dataset.
      tb_writer:
        Writer to write log messages to tensorboard.
    """
    model.train()

    tot_loss = 0.0  # sum of losses over all batches
    tot_frames = 0.0  # sum of frames over all batches
    for batch_idx, batch in enumerate(train_dl):
        params.batch_idx_train += 1
        batch_size = len(batch["supervisions"]["text"])

        loss = compute_loss(
            params=params,
            model=model,
            batch=batch,
            graph_compiler=graph_compiler,
            is_training=True,
        )

        # NOTE: We use reduction==sum and loss is computed over utterances
        # in the batch and there is no normalization to it so far.

        optimizer.zero_grad()
        loss.backward()
        clip_grad_value_(model.parameters(), 5.0)
        optimizer.step()

        loss_cpu = loss.detach().cpu().item()

        tot_frames += params.train_frames
        tot_loss += loss_cpu
        tot_avg_loss = tot_loss / tot_frames

        if batch_idx % params.log_interval == 0:
            logging.info(
                f"Epoch {params.cur_epoch}, batch {batch_idx}, "
                f"batch avg loss {loss_cpu/params.train_frames:.4f}, "
                f"total avg loss: {tot_avg_loss:.4f}, "
                f"batch size: {batch_size}"
            )

        if batch_idx > 0 and batch_idx % params.valid_interval == 0:
            compute_validation_loss(
                params=params,
                model=model,
                graph_compiler=graph_compiler,
                valid_dl=valid_dl,
            )
            model.train()
            logging.info(
                f"Epoch {params.cur_epoch}, valid loss {params.valid_loss}, "
                f"best valid loss: {params.best_valid_loss:.4f} "
                f"best valid epoch: {params.best_valid_epoch}"
            )

    params.train_loss = tot_loss / tot_frames

    if params.train_loss < params.best_train_loss:
        params.best_train_epoch = params.cur_epoch
        params.best_train_loss = params.train_loss


def main():
    parser = get_parser()
    LibriSpeechAsrDataModule.add_arguments(parser)
    args = parser.parse_args()

    params = get_params()
    params.update(vars(args))

    setup_logger(f"{params.exp_dir}/log/log-train")
    logging.info("Training started")
    logging.info(params)

    tb_writer = SummaryWriter(log_dir=f"{params.exp_dir}/tensorboard")

    lexicon = Lexicon(params.lang_dir)
    max_phone_id = max(lexicon.tokens)

    device = torch.device("cpu")
    if torch.cuda.is_available():
        device = torch.device("cuda", 0)

    graph_compiler = CtcTrainingGraphCompiler(lexicon=lexicon, device=device)

    model = TdnnLstm(
        num_features=params.feature_dim,
        num_classes=max_phone_id + 1,  # +1 for the blank symbol
        subsampling_factor=params.subsampling_factor,
    )
    model.to(device)

    optimizer = optim.AdamW(
        model.parameters(),
        lr=params.lr,
        weight_decay=params.weight_decay,
    )
    scheduler = StepLR(optimizer, step_size=8, gamma=0.1)

    load_checkpoint_if_available(
        params=params, model=model, optimizer=optimizer
    )

    librispeech = LibriSpeechAsrDataModule(args)
    train_dl = librispeech.train_dataloaders()
    valid_dl = librispeech.valid_dataloaders()

    for epoch in range(params.start_epoch, params.num_epochs):
        train_dl.sampler.set_epoch(epoch)

        if epoch > params.start_epoch:
            logging.info(f"epoch {epoch}, lr: {scheduler.get_last_lr()[0]}")

        if tb_writer is not None:
            tb_writer.add_scalar(
                "train/lr",
                scheduler.get_last_lr()[0],
                params.batch_idx_train,
            )
            tb_writer.add_scalar("train/epoch", epoch, params.batch_idx_train)

        params.cur_epoch = epoch

        train_one_epoch(
            params=params,
            model=model,
            optimizer=optimizer,
            graph_compiler=graph_compiler,
            train_dl=train_dl,
            valid_dl=valid_dl,
            tb_writer=tb_writer,
        )

        scheduler.step()

        save_checkpoint(
            params=params, model=model, optimizer=optimizer, scheduler=scheduler
        )

    logging.info("Done!")


if __name__ == "__main__":
    main()
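Because the CTC loss uses reduction="sum" with no per-batch normalization, the averages logged in train_one_epoch are summed losses divided by summed frame counts. A sketch with hypothetical numbers, not part of the commit:

```python
# Hypothetical numbers only: two batches with summed CTC losses and their
# frame counts (params.train_frames in the loop above).
batch_losses = [4200.0, 3900.0]
batch_frames = [1500, 1400]

tot_loss = sum(batch_losses)
tot_frames = sum(batch_frames)
print("batch avg loss:", batch_losses[-1] / batch_frames[-1])
print("total avg loss:", tot_loss / tot_frames)
```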
icefall/checkpoint.py (new file, 131 lines)
@@ -0,0 +1,131 @@
import logging
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

import torch
import torch.nn as nn
from torch.cuda.amp import GradScaler
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.optim import Optimizer
from torch.optim.lr_scheduler import _LRScheduler


def save_checkpoint(
    filename: Path,
    model: Union[nn.Module, DDP],
    params: Optional[Dict[str, Any]] = None,
    optimizer: Optional[Optimizer] = None,
    scheduler: Optional[_LRScheduler] = None,
    scaler: Optional[GradScaler] = None,
    rank: int = 0,
) -> None:
    """Save training information to a file.

    Args:
      filename:
        The checkpoint filename.
      model:
        The model to be saved. We only save its `state_dict()`.
      params:
        User defined parameters, e.g., epoch, loss.
      optimizer:
        The optimizer to be saved. We only save its `state_dict()`.
      scheduler:
        The scheduler to be saved. We only save its `state_dict()`.
      scaler:
        The GradScaler to be saved. We only save its `state_dict()`.
      rank:
        Used in DDP. We save checkpoint only for the node whose rank is 0.
    Returns:
      Return None.
    """
    if rank != 0:
        return

    logging.info(f"Saving checkpoint to {filename}")

    if isinstance(model, DDP):
        model = model.module

    checkpoint = {
        "model": model.state_dict(),
        "optimizer": optimizer.state_dict() if optimizer is not None else None,
        "scheduler": scheduler.state_dict() if scheduler is not None else None,
        "grad_scaler": scaler.state_dict() if scaler is not None else None,
    }

    if params:
        for k, v in params.items():
            assert k not in checkpoint
            checkpoint[k] = v

    torch.save(checkpoint, filename)


def load_checkpoint(
    filename: Path,
    model: nn.Module,
    optimizer: Optional[Optimizer] = None,
    scheduler: Optional[_LRScheduler] = None,
    scaler: Optional[GradScaler] = None,
) -> Dict[str, Any]:
    """
    TODO: document it
    """
    logging.info(f"Loading checkpoint from {filename}")
    checkpoint = torch.load(filename, map_location="cpu")

    if next(iter(checkpoint["model"])).startswith("module."):
        logging.info("Loading checkpoint saved by DDP")

        dst_state_dict = model.state_dict()
        src_state_dict = checkpoint["model"]
        for key in dst_state_dict.keys():
            src_key = "{}.{}".format("module", key)
            dst_state_dict[key] = src_state_dict.pop(src_key)
        assert len(src_state_dict) == 0
        model.load_state_dict(dst_state_dict, strict=False)
    else:
        model.load_state_dict(checkpoint["model"], strict=False)

    checkpoint.pop("model")

    def load(name, obj):
        s = checkpoint[name]
        if obj and s:
            obj.load_state_dict(s)
        checkpoint.pop(name)

    load("optimizer", optimizer)
    load("scheduler", scheduler)
    load("grad_scaler", scaler)

    return checkpoint


def average_checkpoints(filenames: List[Path]) -> dict:
    """Average a list of checkpoints.

    Args:
      filenames:
        Filenames of the checkpoints to be averaged. We assume all
        checkpoints are saved by :func:`save_checkpoint`.
    Returns:
      Return a dict (i.e., state_dict) which is the average of all
      model state dicts contained in the checkpoints.
    """
    n = len(filenames)

    avg = torch.load(filenames[0], map_location="cpu")["model"]
    for i in range(1, n):
        state_dict = torch.load(filenames[i], map_location="cpu")["model"]
        for k in avg:
            avg[k] += state_dict[k]

    for k in avg:
        if avg[k].is_floating_point():
            avg[k] /= n
        else:
            avg[k] //= n

    return avg
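A self-contained usage sketch, not part of the commit, of the three helpers above, using a toy nn.Linear and a temporary directory:

```python
import tempfile
from pathlib import Path

import torch
import torch.nn as nn

with tempfile.TemporaryDirectory() as d:
    model = nn.Linear(4, 2)
    ckpt = Path(d) / "epoch-0.pt"
    save_checkpoint(ckpt, model=model, params={"cur_epoch": 0})

    model2 = nn.Linear(4, 2)
    extra = load_checkpoint(ckpt, model=model2)
    print(extra["cur_epoch"])  # user-defined params are returned

    # Averaging two identical checkpoints gives back the same weights.
    avg = average_checkpoints([ckpt, ckpt])
    assert torch.allclose(avg["weight"], model.state_dict()["weight"])
```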
icefall/dataset/__init__.py (new empty file)
icefall/dataset/asr_datamodule.py (new file, 248 lines)
@@ -0,0 +1,248 @@
import argparse
import logging
from pathlib import Path
from typing import List, Union

from lhotse import Fbank, FbankConfig, load_manifest
from lhotse.dataset import (
    BucketingSampler,
    CutConcatenate,
    CutMix,
    K2SpeechRecognitionDataset,
    SingleCutSampler,
    SpecAugment,
)
from lhotse.dataset.input_strategies import OnTheFlyFeatures
from torch.utils.data import DataLoader

from icefall.dataset.datamodule import DataModule
from icefall.utils import str2bool


class AsrDataModule(DataModule):
    """
    DataModule for K2 ASR experiments.
    It assumes there is always one train and valid dataloader,
    but there can be multiple test dataloaders (e.g. LibriSpeech test-clean
    and test-other).

    It contains all the common data pipeline modules used in ASR
    experiments, e.g.:
    - dynamic batch size,
    - bucketing samplers,
    - cut concatenation,
    - augmentation,
    - on-the-fly feature extraction

    This class should be derived for specific corpora used in ASR tasks.
    """

    @classmethod
    def add_arguments(cls, parser: argparse.ArgumentParser):
        super().add_arguments(parser)
        group = parser.add_argument_group(
            title="ASR data related options",
            description="These options are used for the preparation of "
            "PyTorch DataLoaders from Lhotse CutSet's -- they control the "
            "effective batch sizes, sampling strategies, applied data "
            "augmentations, etc.",
        )
        group.add_argument(
            "--feature-dir",
            type=Path,
            default=Path("data/fbank"),
            help="Path to directory with train/valid/test cuts.",
        )
        group.add_argument(
            "--max-duration",
            type=int,
            default=500.0,
            help="Maximum pooled recordings duration (seconds) in a "
            "single batch. You can reduce it if it causes CUDA OOM.",
        )
        group.add_argument(
            "--bucketing-sampler",
            type=str2bool,
            default=False,
            help="When enabled, the batches will come from buckets of "
            "similar duration (saves padding frames).",
        )
        group.add_argument(
            "--num-buckets",
            type=int,
            default=30,
            help="The number of buckets for the BucketingSampler"
            "(you might want to increase it for larger datasets).",
        )
        group.add_argument(
            "--concatenate-cuts",
            type=str2bool,
            default=True,
            help="When enabled, utterances (cuts) will be concatenated "
            "to minimize the amount of padding.",
        )
        group.add_argument(
            "--duration-factor",
            type=float,
            default=1.0,
            help="Determines the maximum duration of a concatenated cut "
            "relative to the duration of the longest cut in a batch.",
        )
        group.add_argument(
            "--gap",
            type=float,
            default=1.0,
            help="The amount of padding (in seconds) inserted between "
            "concatenated cuts. This padding is filled with noise when "
            "noise augmentation is used.",
        )
        group.add_argument(
            "--on-the-fly-feats",
            type=str2bool,
            default=False,
            help="When enabled, use on-the-fly cut mixing and feature "
            "extraction. Will drop existing precomputed feature manifests "
            "if available.",
        )

    def train_dataloaders(self) -> DataLoader:
        logging.info("About to get train cuts")
        cuts_train = self.train_cuts()

        logging.info("About to get Musan cuts")
        cuts_musan = load_manifest(self.args.feature_dir / "cuts_musan.json.gz")

        logging.info("About to create train dataset")
        transforms = [CutMix(cuts=cuts_musan, prob=0.5, snr=(10, 20))]
        if self.args.concatenate_cuts:
            logging.info(
                f"Using cut concatenation with duration factor "
                f"{self.args.duration_factor} and gap {self.args.gap}."
            )
            # Cut concatenation should be the first transform in the list,
            # so that if we e.g. mix noise in, it will fill the gaps between
            # different utterances.
            transforms = [
                CutConcatenate(
                    duration_factor=self.args.duration_factor, gap=self.args.gap
                )
            ] + transforms

        input_transforms = [
            SpecAugment(
                num_frame_masks=2,
                features_mask_size=27,
                num_feature_masks=2,
                frames_mask_size=100,
            )
        ]

        train = K2SpeechRecognitionDataset(
            cuts_train,
            cut_transforms=transforms,
            input_transforms=input_transforms,
        )

        if self.args.on_the_fly_feats:
            # NOTE: the PerturbSpeed transform should be added only if we
            # remove it from data prep stage.
            # Add on-the-fly speed perturbation; since originally it would
            # have increased epoch size by 3, we will apply prob 2/3 and use
            # 3x more epochs.
            # Speed perturbation probably should come first before
            # concatenation, but in principle the transforms order doesn't have
            # to be strict (e.g. could be randomized)
            # transforms = [PerturbSpeed(factors=[0.9, 1.1], p=2/3)] + transforms # noqa
            # Drop feats to be on the safe side.
            cuts_train = cuts_train.drop_features()
            train = K2SpeechRecognitionDataset(
                cuts=cuts_train,
                cut_transforms=transforms,
                input_strategy=OnTheFlyFeatures(
                    Fbank(FbankConfig(num_mel_bins=80))
                ),
                input_transforms=input_transforms,
            )

        if self.args.bucketing_sampler:
            logging.info("Using BucketingSampler.")
            train_sampler = BucketingSampler(
                cuts_train,
                max_duration=self.args.max_duration,
                shuffle=True,
                num_buckets=self.args.num_buckets,
            )
        else:
            logging.info("Using SingleCutSampler.")
            train_sampler = SingleCutSampler(
                cuts_train,
                max_duration=self.args.max_duration,
                shuffle=True,
            )
        logging.info("About to create train dataloader")
        train_dl = DataLoader(
            train,
            sampler=train_sampler,
            batch_size=None,
            num_workers=4,
            persistent_workers=True,
        )
        return train_dl

    def valid_dataloaders(self) -> DataLoader:
        logging.info("About to get dev cuts")
        cuts_valid = self.valid_cuts()

        logging.info("About to create dev dataset")
        if self.args.on_the_fly_feats:
            cuts_valid = cuts_valid.drop_features()
            validate = K2SpeechRecognitionDataset(
                cuts_valid.drop_features(),
                input_strategy=OnTheFlyFeatures(
                    Fbank(FbankConfig(num_mel_bins=80))
                ),
            )
        else:
            validate = K2SpeechRecognitionDataset(cuts_valid)
        valid_sampler = SingleCutSampler(
            cuts_valid,
            max_duration=self.args.max_duration,
        )
        logging.info("About to create dev dataloader")
        valid_dl = DataLoader(
            validate,
            sampler=valid_sampler,
            batch_size=None,
            num_workers=2,
            persistent_workers=True,
        )
        return valid_dl

    def test_dataloaders(self) -> Union[DataLoader, List[DataLoader]]:
        cuts = self.test_cuts()
        is_list = isinstance(cuts, list)
        test_loaders = []
        if not is_list:
            cuts = [cuts]

        for cuts_test in cuts:
            logging.debug("About to create test dataset")
            test = K2SpeechRecognitionDataset(
                cuts_test,
                input_strategy=OnTheFlyFeatures(
                    Fbank(FbankConfig(num_mel_bins=80))
                ),
            )
            sampler = SingleCutSampler(
                cuts_test, max_duration=self.args.max_duration
            )
            logging.debug("About to create test dataloader")
            test_dl = DataLoader(
                test, batch_size=None, sampler=sampler, num_workers=1
            )
            test_loaders.append(test_dl)

        if is_list:
            return test_loaders
        else:
            return test_loaders[0]
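The options registered by add_arguments can be inspected without touching any data, assuming the module's imports (lhotse, icefall.utils) are available. A sketch, not part of the commit; the sample values are arbitrary:

```python
import argparse

# Register the data options on a bare parser and parse a couple of samples.
parser = argparse.ArgumentParser()
AsrDataModule.add_arguments(parser)
args = parser.parse_args(["--max-duration", "200", "--bucketing-sampler", "True"])
print(args.max_duration, args.bucketing_sampler, args.on_the_fly_feats)
```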
icefall/dataset/datamodule.py (new file, 43 lines)
@@ -0,0 +1,43 @@
import argparse
from typing import List, Union

from lhotse import CutSet
from torch.utils.data import DataLoader


class DataModule:
    """
    Contains dataset-related code. It is intended to read/construct Lhotse
    cuts, and create Dataset/Sampler/DataLoader out of them.

    There is a separate method to create each of the train/valid/test
    DataLoaders. In principle, there might be multiple DataLoaders for each
    of train/valid/test (e.g. when a corpus has multiple test sets).
    The API of this class allows returning lists of CutSets/DataLoaders.
    """

    def __init__(self, args: argparse.Namespace):
        self.args = args

    @classmethod
    def add_arguments(cls, parser: argparse.ArgumentParser):
        pass

    def train_cuts(self) -> Union[CutSet, List[CutSet]]:
        raise NotImplementedError()

    def valid_cuts(self) -> Union[CutSet, List[CutSet]]:
        raise NotImplementedError()

    def test_cuts(self) -> Union[CutSet, List[CutSet]]:
        raise NotImplementedError()

    def train_dataloaders(self) -> Union[DataLoader, List[DataLoader]]:
        raise NotImplementedError()

    def valid_dataloaders(self) -> Union[DataLoader, List[DataLoader]]:
        raise NotImplementedError()

    def test_dataloaders(self) -> Union[DataLoader, List[DataLoader]]:
        raise NotImplementedError()
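A minimal subclassing sketch, not part of the commit; the class and option names are made up for illustration:

```python
# A hypothetical corpus-specific subclass; only the methods a recipe actually
# calls need to be overridden.
class ToyDataModule(DataModule):
    @classmethod
    def add_arguments(cls, parser):
        super().add_arguments(parser)
        parser.add_argument("--toy-option", type=int, default=1)

    def train_cuts(self):
        # A real subclass would return a lhotse CutSet here.
        raise NotImplementedError("toy example only")
```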
icefall/dataset/librispeech.py (new file, 68 lines)
@@ -0,0 +1,68 @@
import argparse
import logging
from functools import lru_cache
from typing import List

from lhotse import CutSet, load_manifest

from icefall.dataset.asr_datamodule import AsrDataModule
from icefall.utils import str2bool


class LibriSpeechAsrDataModule(AsrDataModule):
    """
    LibriSpeech ASR data module. Can be used for 100h subset
    (``--full-libri false``) or full 960h set.
    The train and valid cuts for standard Libri splits are
    concatenated into a single CutSet/DataLoader.
    """

    @classmethod
    def add_arguments(cls, parser: argparse.ArgumentParser):
        super().add_arguments(parser)
        group = parser.add_argument_group(title="LibriSpeech specific options")
        group.add_argument(
            "--full-libri",
            type=str2bool,
            default=True,
            help="When enabled, use 960h LibriSpeech.",
        )

    @lru_cache()
    def train_cuts(self) -> CutSet:
        logging.info("About to get train cuts")
        cuts_train = load_manifest(
            self.args.feature_dir / "cuts_train-clean-100.json.gz"
        )
        if self.args.full_libri:
            cuts_train = (
                cuts_train
                + load_manifest(
                    self.args.feature_dir / "cuts_train-clean-360.json.gz"
                )
                + load_manifest(
                    self.args.feature_dir / "cuts_train-other-500.json.gz"
                )
            )
        return cuts_train

    @lru_cache()
    def valid_cuts(self) -> CutSet:
        logging.info("About to get dev cuts")
        cuts_valid = load_manifest(
            self.args.feature_dir / "cuts_dev-clean.json.gz"
        ) + load_manifest(self.args.feature_dir / "cuts_dev-other.json.gz")
        return cuts_valid

    @lru_cache()
    def test_cuts(self) -> List[CutSet]:
        test_sets = ["test-clean", "test-other"]
        cuts = []
        for test_set in test_sets:
            logging.debug("About to get test cuts")
            cuts.append(
                load_manifest(
                    self.args.feature_dir / f"cuts_{test_set}.json.gz"
                )
            )
        return cuts
109
icefall/graph_compiler.py
Normal file
109
icefall/graph_compiler.py
Normal file
@ -0,0 +1,109 @@
from typing import List

import k2
import torch

from icefall.lexicon import Lexicon


class CtcTrainingGraphCompiler(object):
    def __init__(
        self,
        lexicon: Lexicon,
        device: torch.device,
        oov: str = "<UNK>",
    ):
        """
        Args:
          lexicon:
            It is built from `data/lang/lexicon.txt`.
          device:
            The device to use for operations compiling transcripts to FSAs.
          oov:
            Out of vocabulary word. When a word in the transcript
            does not exist in the lexicon, it is replaced with `oov`.
        """
        L_inv = lexicon.L_inv.to(device)
        assert L_inv.requires_grad is False

        assert oov in lexicon.words

        self.L_inv = k2.arc_sort(L_inv)
        self.oov_id = lexicon.words[oov]
        self.words = lexicon.words

        max_token_id = max(lexicon.tokens)
        ctc_topo = k2.ctc_topo(max_token_id, modified=False)

        self.ctc_topo = ctc_topo.to(device)
        self.device = device

    def compile(self, texts: List[str]) -> k2.Fsa:
        """Build decoding graphs by composing ctc_topo with
        the given transcripts.

        Args:
          texts:
            A list of strings. Each string contains a sentence for an
            utterance. A sentence consists of space-separated words.
            An example `texts` looks like:

                ['hello icefall', 'CTC training with k2']

        Returns:
          An FsaVec, the composition result of `self.ctc_topo` and the
          transcript FSA.
        """
        transcript_fsa = self.convert_transcript_to_fsa(texts)

        # NOTE: k2.compose runs on CUDA only when treat_epsilons_specially
        # is False, so we add epsilon self-loops here
        fsa_with_self_loops = k2.remove_epsilon_and_add_self_loops(
            transcript_fsa
        )

        fsa_with_self_loops = k2.arc_sort(fsa_with_self_loops)

        decoding_graph = k2.compose(
            self.ctc_topo, fsa_with_self_loops, treat_epsilons_specially=False
        )

        assert decoding_graph.requires_grad is False

        return decoding_graph

    def convert_transcript_to_fsa(self, texts: List[str]) -> k2.Fsa:
        """Convert a list of transcript texts to an FsaVec.

        Args:
          texts:
            A list of strings. Each string contains a sentence for an
            utterance. A sentence consists of space-separated words.
            An example `texts` looks like:

                ['hello icefall', 'CTC training with k2']

        Returns:
          Return an FsaVec, whose `shape[0]` equals `len(texts)`.
        """
        word_ids_list = []
        for text in texts:
            word_ids = []
            for word in text.split(" "):
                if word in self.words:
                    word_ids.append(self.words[word])
                else:
                    word_ids.append(self.oov_id)
            word_ids_list.append(word_ids)

        word_fsa = k2.linear_fsa(word_ids_list, self.device)

        word_fsa_with_self_loops = k2.add_epsilon_self_loops(word_fsa)

        fsa = k2.intersect(
            self.L_inv, word_fsa_with_self_loops, treat_epsilons_specially=False
        )
        # fsa has word IDs as labels and token IDs as aux_labels, so
        # we need to invert it
        ans_fsa = fsa.invert_()
        return k2.arc_sort(ans_fsa)
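For reference, a minimal usage sketch of the compiler above, assuming `data/lang` has already been created by `prepare.sh` (the transcripts are the illustrative ones from the docstring):

import torch

from icefall.graph_compiler import CtcTrainingGraphCompiler
from icefall.lexicon import Lexicon

# Illustrative only; not part of the commit.
lexicon = Lexicon("data/lang")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
compiler = CtcTrainingGraphCompiler(lexicon, device=device)

# One graph per transcript; the resulting FsaVec is later intersected with
# the network output to compute the CTC loss.
decoding_graph = compiler.compile(["hello icefall", "CTC training with k2"])
assert decoding_graph.shape[0] == 2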
66
icefall/lexicon.py
Normal file
@ -0,0 +1,66 @@
import logging
import re
from pathlib import Path
from typing import List

import k2
import torch


class Lexicon(object):
    """Phone-based lexicon.

    TODO: Add BpeLexicon for BPE models.
    """

    def __init__(
        self, lang_dir: Path, disambig_pattern: str = re.compile(r"^#\d+$")
    ):
        """
        Args:
          lang_dir:
            Path to the lang directory. It is expected to contain the
            following files:

            - phones.txt
            - words.txt
            - L.pt

            The above files are produced by the script `prepare.sh`. You
            should have run it before running the training code.
          disambig_pattern:
            It contains the pattern for disambiguation symbols.
        """
        lang_dir = Path(lang_dir)
        self.phones = k2.SymbolTable.from_file(lang_dir / "phones.txt")
        self.words = k2.SymbolTable.from_file(lang_dir / "words.txt")

        if (lang_dir / "Linv.pt").exists():
            logging.info("Loading pre-compiled Linv.pt")
            L_inv = k2.Fsa.from_dict(torch.load(lang_dir / "Linv.pt"))
        else:
            logging.info("Converting L.pt to Linv.pt")
            L = k2.Fsa.from_dict(torch.load(lang_dir / "L.pt"))
            L_inv = k2.arc_sort(L.invert())
            torch.save(L_inv.as_dict(), lang_dir / "Linv.pt")

        # We save L_inv instead of L because it will be used to intersect with
        # transcript FSAs, both of whose labels are word IDs.
        self.L_inv = L_inv
        self.disambig_pattern = disambig_pattern

    @property
    def tokens(self) -> List[int]:
        """Return a list of phone IDs excluding those from
        disambiguation symbols.

        Caution:
          0 is not a phone ID, so it is excluded from the return value.
        """
        symbols = self.phones.symbols
        ans = []
        for s in symbols:
            if not self.disambig_pattern.match(s):
                ans.append(self.phones[s])
        if 0 in ans:
            ans.remove(0)
        ans.sort()
        return ans
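A minimal sketch of how the lexicon is typically queried, again assuming `data/lang` exists and defines an `<UNK>` entry (both assumptions, not guaranteed by this commit):

from icefall.lexicon import Lexicon

# Illustrative only; not part of the commit.
lexicon = Lexicon("data/lang")
max_token_id = max(lexicon.tokens)  # used to build the CTC topology
print("num phones (excluding disambig symbols):", len(lexicon.tokens))
print("word id of <UNK>:", lexicon.words["<UNK>"])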
298
icefall/utils.py
@ -1,5 +1,20 @@
import argparse
import logging
import os
import subprocess
from collections import defaultdict
from contextlib import contextmanager
from datetime import datetime
from pathlib import Path
from typing import Dict, List, TextIO, Tuple, Union

import k2
import k2.ragged as k2r
import kaldialign
import torch
import torch.distributed as dist

Pathlike = Union[str, Path]


@contextmanager
@ -32,3 +47,286 @@ def get_executor():
    # No need to return anything - compute_and_store_features
    # will just instantiate the pool itself.
    yield None


def str2bool(v):
    """Used in argparse.ArgumentParser.add_argument to indicate
    that a type is a bool type and the user can enter

        - yes, true, t, y, 1, to represent True
        - no, false, f, n, 0, to represent False

    See https://stackoverflow.com/questions/15008758/parsing-boolean-values-with-argparse  # noqa
    """
    if isinstance(v, bool):
        return v
    if v.lower() in ("yes", "true", "t", "y", "1"):
        return True
    elif v.lower() in ("no", "false", "f", "n", "0"):
        return False
    else:
        raise argparse.ArgumentTypeError("Boolean value expected.")


def setup_logger(
    log_filename: Pathlike, log_level: str = "info", use_console: bool = True
) -> None:
    """Set up logging to a file and, optionally, to the console.

    Args:
      log_filename:
        The filename to save the log to. A timestamp (and the rank, when
        running distributed) is appended to it.
      log_level:
        The log level to use, e.g., "debug", "info", "warning", "error",
        "critical".
      use_console:
        If True, also emit log messages to the console.
    """
    now = datetime.now()
    date_time = now.strftime("%Y-%m-%d-%H-%M-%S")

    if dist.is_available() and dist.is_initialized():
        world_size = dist.get_world_size()
        rank = dist.get_rank()
        formatter = f"%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] ({rank}/{world_size}) %(message)s"  # noqa
        log_filename = f"{log_filename}-{date_time}-{rank}"
    else:
        formatter = (
            "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
        )
        log_filename = f"{log_filename}-{date_time}"

    os.makedirs(os.path.dirname(log_filename), exist_ok=True)

    level = logging.ERROR
    if log_level == "debug":
        level = logging.DEBUG
    elif log_level == "info":
        level = logging.INFO
    elif log_level == "warning":
        level = logging.WARNING
    elif log_level == "critical":
        level = logging.CRITICAL

    logging.basicConfig(
        filename=log_filename, format=formatter, level=level, filemode="w"
    )
    if use_console:
        console = logging.StreamHandler()
        console.setLevel(level)
        console.setFormatter(logging.Formatter(formatter))
        logging.getLogger("").addHandler(console)


def get_env_info():
    """
    TODO:
    """
    return {
        "k2-git-sha1": None,
        "k2-version": None,
        "lhotse-version": None,
        "torch-version": None,
        "icefall-sha1": None,
        "icefall-version": None,
    }


# See
# https://stackoverflow.com/questions/4984647/accessing-dict-keys-like-an-attribute  # noqa
class AttributeDict(dict):
    __slots__ = ()
    __getattr__ = dict.__getitem__
    __setattr__ = dict.__setitem__


def encode_supervisions(
    supervisions: Dict[str, torch.Tensor], subsampling_factor: int
) -> Tuple[torch.Tensor, List[str]]:
    """
    Encodes Lhotse's ``batch["supervisions"]`` dict into a pair:
    a supervision tensor and a list of transcription strings.

    The supervision tensor has shape ``(batch_size, 3)``.
    Its second dimension contains information about the sequence index [0],
    start frames [1] and num frames [2].

    The batch items might become re-ordered during this operation -- the
    returned tensor and list of strings are guaranteed to be consistent with
    each other.
    """
    supervision_segments = torch.stack(
        (
            supervisions["sequence_idx"],
            supervisions["start_frame"] // subsampling_factor,
            supervisions["num_frames"] // subsampling_factor,
        ),
        1,
    ).to(torch.int32)

    indices = torch.argsort(supervision_segments[:, 2], descending=True)
    supervision_segments = supervision_segments[indices]
    texts = supervisions["text"]
    texts = [texts[idx] for idx in indices]

    return supervision_segments, texts


def get_texts(best_paths: k2.Fsa) -> List[List[int]]:
    """Extract the texts from the best-path FSAs.

    Args:
      best_paths:
        A k2.Fsa with best_paths.arcs.num_axes() == 3, i.e.
        containing multiple FSAs, which is expected to be the result
        of k2.shortest_path (otherwise the returned values won't
        be meaningful).
    Returns:
      Returns a list of lists of int, containing the label sequences we
      decoded.
    """
    if isinstance(best_paths.aux_labels, k2.RaggedInt):
        # remove 0's and -1's.
        aux_labels = k2r.remove_values_leq(best_paths.aux_labels, 0)
        aux_shape = k2r.compose_ragged_shapes(
            best_paths.arcs.shape(), aux_labels.shape()
        )

        # remove the states and arcs axes.
        aux_shape = k2r.remove_axis(aux_shape, 1)
        aux_shape = k2r.remove_axis(aux_shape, 1)
        aux_labels = k2.RaggedInt(aux_shape, aux_labels.values())
    else:
        # remove the axis corresponding to states.
        aux_shape = k2r.remove_axis(best_paths.arcs.shape(), 1)
        aux_labels = k2.RaggedInt(aux_shape, best_paths.aux_labels)
        # remove 0's and -1's.
        aux_labels = k2r.remove_values_leq(aux_labels, 0)

    assert aux_labels.num_axes() == 2
    return k2r.to_list(aux_labels)


def write_error_stats(
    f: TextIO, test_set_name: str, results: List[Tuple[str, str]]
) -> float:
    """Write detailed WER statistics for (reference, hypothesis) pairs to
    the file ``f`` and return the overall error rate (in percent) as a float.
    """
    subs: Dict[Tuple[str, str], int] = defaultdict(int)
    ins: Dict[str, int] = defaultdict(int)
    dels: Dict[str, int] = defaultdict(int)

    # `words` stores counts per word, as follows:
    #   corr, ref_sub, hyp_sub, ins, dels
    words: Dict[str, List[int]] = defaultdict(lambda: [0, 0, 0, 0, 0])
    num_corr = 0
    ERR = "*"
    for ref, hyp in results:
        ali = kaldialign.align(ref, hyp, ERR)
        for ref_word, hyp_word in ali:
            if ref_word == ERR:
                ins[hyp_word] += 1
                words[hyp_word][3] += 1
            elif hyp_word == ERR:
                dels[ref_word] += 1
                words[ref_word][4] += 1
            elif hyp_word != ref_word:
                subs[(ref_word, hyp_word)] += 1
                words[ref_word][1] += 1
                words[hyp_word][2] += 1
            else:
                words[ref_word][0] += 1
                num_corr += 1
    ref_len = sum([len(r) for r, _ in results])
    sub_errs = sum(subs.values())
    ins_errs = sum(ins.values())
    del_errs = sum(dels.values())
    tot_errs = sub_errs + ins_errs + del_errs
    tot_err_rate = "%.2f" % (100.0 * tot_errs / ref_len)

    logging.info(
        f"[{test_set_name}] %WER {tot_errs / ref_len:.2%} "
        f"[{tot_errs} / {ref_len}, {ins_errs} ins, "
        f"{del_errs} del, {sub_errs} sub ]"
    )

    print(f"%WER = {tot_err_rate}", file=f)
    print(
        f"Errors: {ins_errs} insertions, {del_errs} deletions, "
        f"{sub_errs} substitutions, over {ref_len} reference "
        f"words ({num_corr} correct)",
        file=f,
    )
    print(
        "Search below for sections starting with PER-UTT DETAILS:, "
        "SUBSTITUTIONS:, DELETIONS:, INSERTIONS:, PER-WORD STATS:",
        file=f,
    )

    print("", file=f)
    print("PER-UTT DETAILS: corr or (ref->hyp) ", file=f)
    for ref, hyp in results:
        ali = kaldialign.align(ref, hyp, ERR)
        combine_successive_errors = True
        if combine_successive_errors:
            ali = [[[x], [y]] for x, y in ali]
            for i in range(len(ali) - 1):
                if ali[i][0] != ali[i][1] and ali[i + 1][0] != ali[i + 1][1]:
                    ali[i + 1][0] = ali[i][0] + ali[i + 1][0]
                    ali[i + 1][1] = ali[i][1] + ali[i + 1][1]
                    ali[i] = [[], []]
            ali = [
                [
                    list(filter(lambda a: a != ERR, x)),
                    list(filter(lambda a: a != ERR, y)),
                ]
                for x, y in ali
            ]
            ali = list(filter(lambda x: x != [[], []], ali))
            ali = [
                [
                    ERR if x == [] else " ".join(x),
                    ERR if y == [] else " ".join(y),
                ]
                for x, y in ali
            ]

        print(
            " ".join(
                (
                    ref_word
                    if ref_word == hyp_word
                    else f"({ref_word}->{hyp_word})"
                    for ref_word, hyp_word in ali
                )
            ),
            file=f,
        )

    print("", file=f)
    print("SUBSTITUTIONS: count ref -> hyp", file=f)

    for count, (ref, hyp) in sorted(
        [(v, k) for k, v in subs.items()], reverse=True
    ):
        print(f"{count} {ref} -> {hyp}", file=f)

    print("", file=f)
    print("DELETIONS: count ref", file=f)
    for count, ref in sorted([(v, k) for k, v in dels.items()], reverse=True):
        print(f"{count} {ref}", file=f)

    print("", file=f)
    print("INSERTIONS: count hyp", file=f)
    for count, hyp in sorted([(v, k) for k, v in ins.items()], reverse=True):
        print(f"{count} {hyp}", file=f)

    print("", file=f)
    print(
        "PER-WORD STATS: word corr tot_errs count_in_ref count_in_hyp", file=f
    )
    for _, word, counts in sorted(
        [(sum(v[1:]), k, v) for k, v in words.items()], reverse=True
    ):
        (corr, ref_sub, hyp_sub, ins, dels) = counts
        tot_errs = ref_sub + hyp_sub + ins + dels
        ref_count = corr + ref_sub + dels
        hyp_count = corr + hyp_sub + ins

        print(f"{word} {corr} {tot_errs} {ref_count} {hyp_count}", file=f)
    return float(tot_err_rate)
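The helpers above are meant to be called from the training and decoding scripts. A minimal sketch of wiring `str2bool` into an argument parser and initializing logging; the flag name and log path are illustrative, not taken from this commit:

import argparse

from icefall.utils import setup_logger, str2bool

# Illustrative only; not part of the commit.
parser = argparse.ArgumentParser()
parser.add_argument("--use-ali", type=str2bool, default=False)  # hypothetical flag
args = parser.parse_args(["--use-ali", "yes"])
assert args.use_ali is True

# Logs go to exp/log/log-train-<timestamp> and, optionally, the console.
setup_logger("exp/log/log-train", log_level="info", use_console=True)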
50
test/test_checkpoint.py
Normal file
@ -0,0 +1,50 @@
#!/usr/bin/env python3

import pytest
import torch
import torch.nn as nn

from icefall.checkpoint import (
    average_checkpoints,
    load_checkpoint,
    save_checkpoint,
)


@pytest.fixture
def checkpoints1(tmp_path):
    f = tmp_path / "f.pt"
    m = nn.Module()
    m.p1 = nn.Parameter(torch.tensor([10.0, 20.0]), requires_grad=False)
    m.register_buffer("p2", torch.tensor([10, 100]))

    params = {"a": 10, "b": 20}
    save_checkpoint(f, m, params=params)
    return f


@pytest.fixture
def checkpoints2(tmp_path):
    f = tmp_path / "f2.pt"
    m = nn.Module()
    m.p1 = nn.Parameter(torch.Tensor([50, 30.0]))
    m.register_buffer("p2", torch.tensor([1, 3]))
    params = {"a": 100, "b": 200}

    save_checkpoint(f, m, params=params)
    return f


def test_load_checkpoints(checkpoints1):
    m = nn.Module()
    m.p1 = nn.Parameter(torch.Tensor([0, 0.0]))
    m.p2 = nn.Parameter(torch.Tensor([0, 0]))
    params = load_checkpoint(checkpoints1, m)
    assert torch.allclose(m.p1, torch.Tensor([10.0, 20]))
    assert params == {"a": 10, "b": 20}


def test_average_checkpoints(checkpoints1, checkpoints2):
    state_dict = average_checkpoints([checkpoints1, checkpoints2])
    assert torch.allclose(state_dict["p1"], torch.Tensor([30, 25.0]))
    assert torch.allclose(state_dict["p2"], torch.tensor([5, 51]))
160
test/test_graph_compiler.py
Normal file
@ -0,0 +1,160 @@
#!/usr/bin/env python3

# Copyright (c)  2021  Xiaomi Corporation (authors: Fangjun Kuang)

import re

import k2
import pytest
import torch

from icefall.graph_compiler import CtcTrainingGraphCompiler
from icefall.lexicon import Lexicon
from icefall.utils import get_texts


@pytest.fixture
def lexicon():
    """
    We use the following test data:

    lexicon.txt

        foo f o o
        bar b a r
        baz b a z
        <UNK> SPN

    phones.txt

        <eps> 0
        a 1
        b 2
        f 3
        o 4
        r 5
        z 6
        SPN 7

    words.txt:

        <eps> 0
        foo 1
        bar 2
        baz 3
        <UNK> 4
    """
    L = k2.Fsa.from_str(
        """
        0 0 7 4 0
        0 7 -1 -1 0
        0 1 3 1 0
        0 3 2 2 0
        0 5 2 3 0
        1 2 4 0 0
        2 0 4 0 0
        3 4 1 0 0
        4 0 5 0 0
        5 6 1 0 0
        6 0 6 0 0
        7
        """,
        num_aux_labels=1,
    )
    L.labels_sym = k2.SymbolTable.from_str(
        """
        a 1
        b 2
        f 3
        o 4
        r 5
        z 6
        SPN 7
        """
    )
    L.aux_labels_sym = k2.SymbolTable.from_str(
        """
        foo 1
        bar 2
        baz 3
        <UNK> 4
        """
    )
    ans = Lexicon.__new__(Lexicon)
    ans.phones = L.labels_sym
    ans.words = L.aux_labels_sym
    ans.L_inv = k2.arc_sort(L.invert_())
    ans.disambig_pattern = re.compile(r"^#\d+$")

    return ans


@pytest.fixture
def compiler(lexicon):
    return CtcTrainingGraphCompiler(lexicon, device=torch.device("cpu"))


class TestCtcTrainingGraphCompiler(object):
    @staticmethod
    def test_convert_transcript_to_fsa(compiler, lexicon):
        texts = ["bar foo", "baz ok"]
        fsa = compiler.convert_transcript_to_fsa(texts)
        labels0 = fsa[0].labels[:-1].tolist()
        aux_labels0 = fsa[0].aux_labels[:-1]
        aux_labels0 = aux_labels0[aux_labels0 != 0].tolist()

        labels1 = fsa[1].labels[:-1].tolist()
        aux_labels1 = fsa[1].aux_labels[:-1]
        aux_labels1 = aux_labels1[aux_labels1 != 0].tolist()

        labels0 = [lexicon.phones[i] for i in labels0]
        labels1 = [lexicon.phones[i] for i in labels1]

        aux_labels0 = [lexicon.words[i] for i in aux_labels0]
        aux_labels1 = [lexicon.words[i] for i in aux_labels1]

        assert labels0 == ["b", "a", "r", "f", "o", "o"]
        assert aux_labels0 == ["bar", "foo"]

        assert labels1 == ["b", "a", "z", "SPN"]
        assert aux_labels1 == ["baz", "<UNK>"]

    @staticmethod
    def test_compile(compiler, lexicon):
        texts = ["bar foo", "baz ok"]
        decoding_graph = compiler.compile(texts)
        input1 = ["b", "b", "<blk>", "<blk>", "a", "a", "r", "<blk>", "<blk>"]
        input1 += ["f", "f", "<blk>", "<blk>", "o", "o", "<blk>", "o", "o"]

        input2 = ["b", "b", "a", "a", "a", "<blk>", "<blk>", "z", "z"]
        input2 += ["<blk>", "<blk>", "SPN", "SPN", "<blk>", "<blk>"]

        # Register "<blk>" as the symbol with ID 0 so that the inputs above
        # can be mapped to phone IDs.
        lexicon.phones._id2sym[0] = "<blk>"
        lexicon.phones._sym2id["<blk>"] = 0

        input1 = [lexicon.phones[i] for i in input1]
        input2 = [lexicon.phones[i] for i in input2]

        fsa1 = k2.linear_fsa(input1)
        fsa2 = k2.linear_fsa(input2)
        fsas = k2.Fsa.from_fsas([fsa1, fsa2])

        decoding_graph = k2.arc_sort(decoding_graph)
        lattice = k2.intersect(
            decoding_graph, fsas, treat_epsilons_specially=False
        )
        lattice = k2.connect(lattice)

        aux_labels0 = lattice[0].aux_labels[:-1]
        aux_labels0 = aux_labels0[aux_labels0 != 0].tolist()
        aux_labels0 = [lexicon.words[i] for i in aux_labels0]
        assert aux_labels0 == ["bar", "foo"]

        aux_labels1 = lattice[1].aux_labels[:-1]
        aux_labels1 = aux_labels1[aux_labels1 != 0].tolist()
        aux_labels1 = [lexicon.words[i] for i in aux_labels1]
        assert aux_labels1 == ["baz", "<UNK>"]

        texts = get_texts(lattice)
        texts = [[lexicon.words[i] for i in words] for words in texts]
        assert texts == [["bar", "foo"], ["baz", "<UNK>"]]
93
test/test_utils.py
Normal file
@ -0,0 +1,93 @@
#!/usr/bin/env python3

import k2
import pytest
import torch

from icefall.utils import AttributeDict, encode_supervisions, get_texts


@pytest.fixture
def sup():
    sequence_idx = torch.tensor([0, 1, 2])
    start_frame = torch.tensor([1, 3, 9])
    num_frames = torch.tensor([20, 30, 10])
    text = ["one", "two", "three"]
    return {
        "sequence_idx": sequence_idx,
        "start_frame": start_frame,
        "num_frames": num_frames,
        "text": text,
    }


def test_encode_supervisions(sup):
    supervision_segments, texts = encode_supervisions(sup, subsampling_factor=4)
    assert torch.all(
        torch.eq(
            supervision_segments,
            torch.tensor(
                [[1, 0, 30 // 4], [0, 0, 20 // 4], [2, 9 // 4, 10 // 4]]
            ),
        )
    )
    assert texts == ["two", "one", "three"]


def test_get_texts_ragged():
    fsa1 = k2.Fsa.from_str(
        """
        0 1 1 10
        1 2 2 20
        2 3 3 30
        3 4 -1 0
        4
        """
    )
    fsa1.aux_labels = k2.RaggedInt("[ [1 3 0 2] [] [4 0 1] [-1]]")

    fsa2 = k2.Fsa.from_str(
        """
        0 1 1 1
        1 2 2 2
        2 3 -1 0
        3
        """
    )
    fsa2.aux_labels = k2.RaggedInt("[[3 0 5 0 8] [0 9 7 0] [-1]]")
    fsas = k2.Fsa.from_fsas([fsa1, fsa2])
    texts = get_texts(fsas)
    assert texts == [[1, 3, 2, 4, 1], [3, 5, 8, 9, 7]]


def test_get_texts_regular():
    fsa1 = k2.Fsa.from_str(
        """
        0 1 1 3 10
        1 2 2 0 20
        2 3 3 2 30
        3 4 -1 -1 0
        4
        """,
        num_aux_labels=1,
    )

    fsa2 = k2.Fsa.from_str(
        """
        0 1 1 10 1
        1 2 2 5 2
        2 3 -1 -1 0
        3
        """,
        num_aux_labels=1,
    )
    fsas = k2.Fsa.from_fsas([fsa1, fsa2])
    texts = get_texts(fsas)
    assert texts == [[3, 2], [10, 5]]


def test_attribute_dict():
    s = AttributeDict({"a": 10, "b": 20})
    assert s.a == 10
    assert s["b"] == 20
    s.c = 100
    assert s["c"] == 100
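The new unit tests can be run with pytest from the repository root; a minimal sketch of invoking them programmatically (equivalent to running `pytest test` on the command line, assuming pytest is installed):

# Illustrative only; not part of the commit.
import sys

import pytest

sys.exit(pytest.main(["-q", "test"]))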