From 141bc10a98a37c04a3a9d9d2916d2f2961a7c66e Mon Sep 17 00:00:00 2001
From: Dongji Gao <dgao5@jhu.edu>
Date: Sat, 16 Sep 2023 19:37:44 -0400
Subject: [PATCH] remove unnecessary test files

---
 .../WSASR/conformer_ctc2/decode.bkp           | 1015 --------------
 .../WSASR/conformer_ctc2/train_lexicon_new.py | 1177 -----------------
 2 files changed, 2192 deletions(-)
 delete mode 100755 egs/librispeech/WSASR/conformer_ctc2/decode.bkp
 delete mode 100755 egs/librispeech/WSASR/conformer_ctc2/train_lexicon_new.py

diff --git a/egs/librispeech/WSASR/conformer_ctc2/decode.bkp b/egs/librispeech/WSASR/conformer_ctc2/decode.bkp
deleted file mode 100755
index 7c3baf18c..000000000
--- a/egs/librispeech/WSASR/conformer_ctc2/decode.bkp
+++ /dev/null
@@ -1,1015 +0,0 @@
-#!/usr/bin/env python3
-# Copyright 2021 Xiaomi Corporation (Author: Liyong Guo,
-#                                            Fangjun Kuang,
-#                                            Quandong Wang)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import argparse
-import logging
-from collections import defaultdict
-from pathlib import Path
-from typing import Dict, List, Optional, Tuple
-
-import k2
-import sentencepiece as spm
-import torch
-import torch.nn as nn
-from asr_datamodule import LibriSpeechAsrDataModule
-from conformer import Conformer
-
-from icefall.bpe_graph_compiler import BpeCtcTrainingGraphCompiler
-from icefall.checkpoint import (
-    average_checkpoints,
-    average_checkpoints_with_averaged_model,
-    find_checkpoints,
-    load_checkpoint,
-)
-from icefall.decode import (
-    get_lattice,
-    nbest_decoding,
-    nbest_oracle,
-    one_best_decoding,
-    rescore_with_attention_decoder,
-    rescore_with_n_best_list,
-    rescore_with_rnn_lm,
-    rescore_with_whole_lattice,
-)
-from icefall.env import get_env_info
-from icefall.lexicon import Lexicon
-from icefall.rnn_lm.model import RnnLmModel
-from icefall.utils import (
-    AttributeDict,
-    get_texts,
-    load_averaged_model,
-    setup_logger,
-    store_transcripts,
-    str2bool,
-    write_error_stats,
-)
-
-
-def get_parser():
-    parser = argparse.ArgumentParser(
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter
-    )
-
-    parser.add_argument(
-        "--epoch",
-        type=int,
-        default=20,
-        help="""It specifies the checkpoint to use for decoding.
-        Note: Epoch counts from 1.
-        You can specify --avg to use more checkpoints for model averaging.""",
-    )
-
-    parser.add_argument(
-        "--iter",
-        type=int,
-        default=0,
-        help="""If positive, --epoch is ignored and it
-        will use the checkpoint exp_dir/checkpoint-iter.pt.
-        You can specify --avg to use more checkpoints for model averaging.
-        """,
-    )
-
-    parser.add_argument(
-        "--avg",
-        type=int,
-        default=1,
-        help="Number of checkpoints to average. Automatically select "
-        "consecutive checkpoints before the checkpoint specified by "
-        "'--epoch' and '--iter'",
-    )
-
-    parser.add_argument(
-        "--method",
-        type=str,
-        default="attention-decoder",
-        help="""Decoding method.
-        Supported values are:
-            - (0) ctc-decoding. Use CTC decoding. It uses a sentence piece
-              model, i.e., lang_dir/bpe.model, to convert word pieces to words.
-              It needs neither a lexicon nor an n-gram LM.
-            - (1) ctc-greedy-search. It only use CTC output and a sentence piece
-              model for decoding. It produces the same results with ctc-decoding.
-            - (2) 1best. Extract the best path from the decoding lattice as the
-              decoding result.
-            - (3) nbest. Extract n paths from the decoding lattice; the path
-              with the highest score is the decoding result.
-            - (4) nbest-rescoring. Extract n paths from the decoding lattice,
-              rescore them with an n-gram LM (e.g., a 4-gram LM), the path with
-              the highest score is the decoding result.
-            - (5) whole-lattice-rescoring. Rescore the decoding lattice with an
-              n-gram LM (e.g., a 4-gram LM), the best path of rescored lattice
-              is the decoding result.
-            - (6) attention-decoder. Extract n paths from the LM rescored
-              lattice, the path with the highest score is the decoding result.
-            - (7) rnn-lm. Rescoring with attention-decoder and RNN LM. We assume
-              you have trained an RNN LM using ./rnn_lm/train.py
-            - (8) nbest-oracle. Its WER is the lower bound of any n-best
-              rescoring method can achieve. Useful for debugging n-best
-              rescoring method.
-        """,
-    )
-
-    parser.add_argument(
-        "--use-averaged-model",
-        type=str2bool,
-        default=True,
-        help="Whether to load averaged model. Currently it only supports "
-        "using --epoch. If True, it would decode with the averaged model "
-        "over the epoch range from `epoch-avg` (excluded) to `epoch`."
-        "Actually only the models with epoch number of `epoch-avg` and "
-        "`epoch` are loaded for averaging. ",
-    )
-    parser.add_argument(
-        "--use-best-model",
-        type=str2bool,
-        default=False,
-    )
-
-    parser.add_argument(
-        "--num-decoder-layers",
-        type=int,
-        default=0,
-        help="""Number of decoder layer of transformer decoder.
-        Setting this to 0 will not create the decoder at all (pure CTC model)
-        """,
-    )
-
-    parser.add_argument(
-        "--num-paths",
-        type=int,
-        default=100,
-        help="""Number of paths for n-best based decoding method.
-        Used only when "method" is one of the following values:
-        nbest, nbest-rescoring, attention-decoder, rnn-lm, and nbest-oracle
-        """,
-    )
-
-    parser.add_argument(
-        "--nbest-scale",
-        type=float,
-        default=0.5,
-        help="""The scale to be applied to `lattice.scores`.
-        It's needed if you use any kinds of n-best based rescoring.
-        Used only when "method" is one of the following values:
-        nbest, nbest-rescoring, attention-decoder, rnn-lm, and nbest-oracle
-        A smaller value results in more unique paths.
-        """,
-    )
-
-    parser.add_argument(
-        "--exp-dir",
-        type=str,
-        default="conformer_ctc2/exp",
-        help="The experiment dir",
-    )
-
-    parser.add_argument(
-        "--lang-dir",
-        type=str,
-        default="data/lang_bpe_500",
-        help="The lang dir",
-    )
-
-    parser.add_argument(
-        "--lm-dir",
-        type=str,
-        default="data/lm",
-        help="""The n-gram LM dir.
-        It should contain either G_4_gram.pt or G_4_gram.fst.txt
-        """,
-    )
-
-    parser.add_argument(
-        "--rnn-lm-exp-dir",
-        type=str,
-        default="rnn_lm/exp",
-        help="""Used only when --method is rnn-lm.
-        It specifies the path to RNN LM exp dir.
-        """,
-    )
-
-    parser.add_argument(
-        "--rnn-lm-epoch",
-        type=int,
-        default=7,
-        help="""Used only when --method is rnn-lm.
-        It specifies the checkpoint to use.
-        """,
-    )
-
-    parser.add_argument(
-        "--rnn-lm-avg",
-        type=int,
-        default=2,
-        help="""Used only when --method is rnn-lm.
-        It specifies the number of checkpoints to average.
-        """,
-    )
-
-    parser.add_argument(
-        "--rnn-lm-embedding-dim",
-        type=int,
-        default=2048,
-        help="Embedding dim of the model",
-    )
-
-    parser.add_argument(
-        "--rnn-lm-hidden-dim",
-        type=int,
-        default=2048,
-        help="Hidden dim of the model",
-    )
-
-    parser.add_argument(
-        "--rnn-lm-num-layers",
-        type=int,
-        default=4,
-        help="Number of RNN layers the model",
-    )
-    parser.add_argument(
-        "--rnn-lm-tie-weights",
-        type=str2bool,
-        default=False,
-        help="""True to share the weights between the input embedding layer and the
-        last output linear layer
-        """,
-    )
-
-    parser.add_argument(
-        "--blank-bias",
-        type=float,
-        default=0,
-        help="""blank bias""",
-    )
-
-    return parser
-
-
-def get_params() -> AttributeDict:
-    params = AttributeDict(
-        {
-            # parameters for conformer
-            "subsampling_factor": 2,
-            "feature_dim": 768,
-            "nhead": 8,
-            "dim_feedforward": 2048,
-            "encoder_dim": 512,
-            "num_encoder_layers": 12,
-            # parameters for decoding
-            "search_beam": 20,
-            "output_beam": 50,
-            "min_active_states": 300,
-            "max_active_states": 10000,
-            "use_double_scores": True,
-            "env_info": get_env_info(),
-        }
-    )
-    return params
-
-
-def ctc_greedy_search(
-    nnet_output: torch.Tensor,
-    memory: torch.Tensor,
-    memory_key_padding_mask: torch.Tensor,
-) -> List[List[int]]:
-    """Apply CTC greedy search
-
-     Args:
-         speech (torch.Tensor): (batch, max_len, feat_dim)
-         speech_length (torch.Tensor): (batch, )
-    Returns:
-         List[List[int]]: best path result
-    """
-    batch_size = memory.shape[1]
-    # Let's assume B = batch_size
-    encoder_out = memory
-    encoder_mask = memory_key_padding_mask
-    maxlen = encoder_out.size(0)
-
-    ctc_probs = nnet_output  # (B, maxlen, vocab_size)
-    topk_prob, topk_index = ctc_probs.topk(1, dim=2)  # (B, maxlen, 1)
-    topk_index = topk_index.view(batch_size, maxlen)  # (B, maxlen)
-    topk_index = topk_index.masked_fill_(encoder_mask, 0)  # (B, maxlen)
-    hyps = [hyp.tolist() for hyp in topk_index]
-    scores = topk_prob.max(1)
-    hyps = [remove_duplicates_and_blank(hyp) for hyp in hyps]
-    return hyps, scores
-
-
-def remove_duplicates_and_blank(hyp: List[int]) -> List[int]:
-    # from https://github.com/wenet-e2e/wenet/blob/main/wenet/utils/common.py
-    new_hyp: List[int] = []
-    cur = 0
-    while cur < len(hyp):
-        if hyp[cur] != 0:
-            new_hyp.append(hyp[cur])
-        prev = cur
-        while cur < len(hyp) and hyp[cur] == hyp[prev]:
-            cur += 1
-    return new_hyp
-
-
-def decode_one_batch(
-    params: AttributeDict,
-    model: nn.Module,
-    rnn_lm_model: Optional[nn.Module],
-    HLG: Optional[k2.Fsa],
-    H: Optional[k2.Fsa],
-    bpe_model: Optional[spm.SentencePieceProcessor],
-    batch: dict,
-    word_table: k2.SymbolTable,
-    sos_id: int,
-    eos_id: int,
-    G: Optional[k2.Fsa] = None,
-) -> Dict[str, List[List[str]]]:
-    """Decode one batch and return the result in a dict. The dict has the
-    following format:
-
-        - key: It indicates the setting used for decoding. For example,
-               if no rescoring is used, the key is the string `no_rescore`.
-               If LM rescoring is used, the key is the string `lm_scale_xxx`,
-               where `xxx` is the value of `lm_scale`. An example key is
-               `lm_scale_0.7`
-        - value: It contains the decoding result. `len(value)` equals to
-                 batch size. `value[i]` is the decoding result for the i-th
-                 utterance in the given batch.
-    Args:
-      params:
-        It's the return value of :func:`get_params`.
-
-        - params.method is "1best", it uses 1best decoding without LM rescoring.
-        - params.method is "nbest", it uses nbest decoding without LM rescoring.
-        - params.method is "nbest-rescoring", it uses nbest LM rescoring.
-        - params.method is "whole-lattice-rescoring", it uses whole lattice LM
-          rescoring.
-
-      model:
-        The neural model.
-      rnn_lm_model:
-        The neural model for RNN LM.
-      HLG:
-        The decoding graph. Used only when params.method is NOT ctc-decoding.
-      H:
-        The ctc topo. Used only when params.method is ctc-decoding.
-      bpe_model:
-        The BPE model. Used only when params.method is ctc-decoding.
-      batch:
-        It is the return value from iterating
-        `lhotse.dataset.K2SpeechRecognitionDataset`. See its documentation
-        for the format of the `batch`.
-      word_table:
-        The word symbol table.
-      sos_id:
-        The token ID of the SOS.
-      eos_id:
-        The token ID of the EOS.
-      G:
-        An LM. It is not None when params.method is "nbest-rescoring"
-        or "whole-lattice-rescoring". In general, the G in HLG
-        is a 3-gram LM, while this G is a 4-gram LM.
-    Returns:
-      Return the decoding result. See above description for the format of
-      the returned dict. Note: If it decodes to nothing, then return None.
-    """
-    if HLG is not None:
-        device = HLG.device
-    else:
-        device = H.device
-    feature = batch["inputs"]
-    assert feature.ndim == 3
-    feature = feature.to(device)
-    # at entry, feature is (N, T, C)
-
-    supervisions = batch["supervisions"]
-
-    nnet_output, memory, memory_key_padding_mask = model(feature, supervisions)
-    # nnet_output is (N, T, C)
-    nnet_output[:, :, 0] += params.blank_bias
-
-    #print(f"nnet_output shape: {nnet_output.shape}")
-
-    supervision_segments = torch.stack(
-        (
-            supervisions["sequence_idx"],
-            torch.div(
-                supervisions["start_frame"],
-                params.subsampling_factor,
-                rounding_mode="trunc",
-            ),
-            torch.div(
-                supervisions["num_frames"],
-                params.subsampling_factor,
-                rounding_mode="trunc",
-            ),
-        ),
-        1,
-    ).to(torch.int32)
-
-    if H is None:
-        assert HLG is not None
-        decoding_graph = HLG
-    else:
-        assert HLG is None
-        assert bpe_model is not None
-        decoding_graph = H
-
-    lattice = get_lattice(
-        nnet_output=nnet_output,
-        decoding_graph=decoding_graph,
-        supervision_segments=supervision_segments,
-        search_beam=params.search_beam,
-        output_beam=params.output_beam,
-        min_active_states=params.min_active_states,
-        max_active_states=params.max_active_states,
-        subsampling_factor=4,
-    )
-
-    if params.method == "ctc-decoding":
-        best_path = one_best_decoding(
-            lattice=lattice, use_double_scores=params.use_double_scores
-        )
-        # Note: `best_path.aux_labels` contains token IDs, not word IDs
-        # since we are using H, not HLG here.
-        #
-        # token_ids is a lit-of-list of IDs
-        token_ids = get_texts(best_path)
-
-        # hyps is a list of str, e.g., ['xxx yyy zzz', ...]
-        hyps = bpe_model.decode(token_ids)
-
-        # hyps is a list of list of str, e.g., [['xxx', 'yyy', 'zzz'], ... ]
-        hyps = [s.split() for s in hyps]
-        key = "ctc-decoding"
-        return {key: hyps}
-
-    if params.method == "ctc-greedy-search":
-        hyps, _ = ctc_greedy_search(
-            nnet_output,
-            memory,
-            memory_key_padding_mask,
-        )
-
-        # hyps is a list of str, e.g., ['xxx yyy zzz', ...]
-        hyps = bpe_model.decode(hyps)
-
-        # hyps is a list of list of str, e.g., [['xxx', 'yyy', 'zzz'], ... ]
-        hyps = [s.split() for s in hyps]
-        key = "ctc-greedy-search"
-        return {key: hyps}
-
-    if params.method == "nbest-oracle":
-        # Note: You can also pass rescored lattices to it.
-        # We choose the HLG decoded lattice for speed reasons
-        # as HLG decoding is faster and the oracle WER
-        # is only slightly worse than that of rescored lattices.
-        best_path = nbest_oracle(
-            lattice=lattice,
-            num_paths=params.num_paths,
-            ref_texts=supervisions["text"],
-            word_table=word_table,
-            nbest_scale=params.nbest_scale,
-            oov="<UNK>",
-        )
-        hyps = get_texts(best_path)
-        hyps = [[word_table[i] for i in ids] for ids in hyps]
-        key = f"oracle_{params.num_paths}_nbest_scale_{params.nbest_scale}"  # noqa
-        return {key: hyps}
-
-    if params.method in ["1best", "nbest"]:
-        if params.method == "1best":
-            best_path = one_best_decoding(
-                lattice=lattice, use_double_scores=params.use_double_scores
-            )
-            key = "no_rescore"
-        else:
-            best_path = nbest_decoding(
-                lattice=lattice,
-                num_paths=params.num_paths,
-                use_double_scores=params.use_double_scores,
-                nbest_scale=params.nbest_scale,
-            )
-            key = f"no_rescore-nbest-scale-{params.nbest_scale}-{params.num_paths}"  # noqa
-
-        hyps = get_texts(best_path)
-        hyps = [[word_table[i] for i in ids] for ids in hyps]
-        return {key: hyps}
-
-    assert params.method in [
-        "nbest-rescoring",
-        "whole-lattice-rescoring",
-        "attention-decoder",
-        "rnn-lm",
-    ]
-
-    lm_scale_list = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7]
-    lm_scale_list += [0.8, 0.9, 1.0, 1.1, 1.2, 1.3]
-    lm_scale_list += [1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0]
-
-    if params.method == "nbest-rescoring":
-        best_path_dict = rescore_with_n_best_list(
-            lattice=lattice,
-            G=G,
-            num_paths=params.num_paths,
-            lm_scale_list=lm_scale_list,
-            nbest_scale=params.nbest_scale,
-        )
-    elif params.method == "whole-lattice-rescoring":
-        best_path_dict = rescore_with_whole_lattice(
-            lattice=lattice,
-            G_with_epsilon_loops=G,
-            lm_scale_list=lm_scale_list,
-        )
-    elif params.method == "attention-decoder":
-        # lattice uses a 3-gram Lm. We rescore it with a 4-gram LM.
-        rescored_lattice = rescore_with_whole_lattice(
-            lattice=lattice,
-            G_with_epsilon_loops=G,
-            lm_scale_list=None,
-        )
-        # TODO: pass `lattice` instead of `rescored_lattice` to
-        # `rescore_with_attention_decoder`
-
-        best_path_dict = rescore_with_attention_decoder(
-            lattice=rescored_lattice,
-            num_paths=params.num_paths,
-            model=model,
-            memory=memory,
-            memory_key_padding_mask=memory_key_padding_mask,
-            sos_id=sos_id,
-            eos_id=eos_id,
-            nbest_scale=params.nbest_scale,
-        )
-    elif params.method == "rnn-lm":
-        # lattice uses a 3-gram Lm. We rescore it with a 4-gram LM.
-        rescored_lattice = rescore_with_whole_lattice(
-            lattice=lattice,
-            G_with_epsilon_loops=G,
-            lm_scale_list=None,
-        )
-
-        best_path_dict = rescore_with_rnn_lm(
-            lattice=rescored_lattice,
-            num_paths=params.num_paths,
-            rnn_lm_model=rnn_lm_model,
-            model=model,
-            memory=memory,
-            memory_key_padding_mask=memory_key_padding_mask,
-            sos_id=sos_id,
-            eos_id=eos_id,
-            blank_id=0,
-            nbest_scale=params.nbest_scale,
-        )
-    else:
-        assert False, f"Unsupported decoding method: {params.method}"
-
-    ans = dict()
-    if best_path_dict is not None:
-        for lm_scale_str, best_path in best_path_dict.items():
-            hyps = get_texts(best_path)
-            hyps = [[word_table[i] for i in ids] for ids in hyps]
-            ans[lm_scale_str] = hyps
-    else:
-        ans = None
-    return ans
-
-
-def decode_dataset(
-    dl: torch.utils.data.DataLoader,
-    params: AttributeDict,
-    model: nn.Module,
-    rnn_lm_model: Optional[nn.Module],
-    HLG: Optional[k2.Fsa],
-    H: Optional[k2.Fsa],
-    bpe_model: Optional[spm.SentencePieceProcessor],
-    word_table: k2.SymbolTable,
-    sos_id: int,
-    eos_id: int,
-    G: Optional[k2.Fsa] = None,
-) -> Dict[str, List[Tuple[str, List[str], List[str]]]]:
-    """Decode dataset.
-
-    Args:
-      dl:
-        PyTorch's dataloader containing the dataset to decode.
-      params:
-        It is returned by :func:`get_params`.
-      model:
-        The neural model.
-      rnn_lm_model:
-        The neural model for RNN LM.
-      HLG:
-        The decoding graph. Used only when params.method is NOT ctc-decoding.
-      H:
-        The ctc topo. Used only when params.method is ctc-decoding.
-      bpe_model:
-        The BPE model. Used only when params.method is ctc-decoding.
-      word_table:
-        It is the word symbol table.
-      sos_id:
-        The token ID for SOS.
-      eos_id:
-        The token ID for EOS.
-      G:
-        An LM. It is not None when params.method is "nbest-rescoring"
-        or "whole-lattice-rescoring". In general, the G in HLG
-        is a 3-gram LM, while this G is a 4-gram LM.
-    Returns:
-      Return a dict, whose key may be "no-rescore" if no LM rescoring
-      is used, or it may be "lm_scale_0.7" if LM rescoring is used.
-      Its value is a list of tuples. Each tuple contains two elements:
-      The first is the reference transcript, and the second is the
-      predicted result.
-    """
-    num_cuts = 0
-
-    try:
-        num_batches = len(dl)
-    except TypeError:
-        num_batches = "?"
-
-    results = defaultdict(list)
-    for batch_idx, batch in enumerate(dl):
-        texts = batch["supervisions"]["text"]
-        cut_ids = [cut.id for cut in batch["supervisions"]["cut"]]
-
-        hyps_dict = decode_one_batch(
-            params=params,
-            model=model,
-            rnn_lm_model=rnn_lm_model,
-            HLG=HLG,
-            H=H,
-            bpe_model=bpe_model,
-            batch=batch,
-            word_table=word_table,
-            G=G,
-            sos_id=sos_id,
-            eos_id=eos_id,
-        )
-
-        if hyps_dict is not None:
-            for lm_scale, hyps in hyps_dict.items():
-                this_batch = []
-                assert len(hyps) == len(texts)
-                for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts):
-                    ref_words = ref_text.split()
-                    this_batch.append((cut_id, ref_words, hyp_words))
-
-                results[lm_scale].extend(this_batch)
-        else:
-            assert len(results) > 0, "It should not decode to empty in the first batch!"
-            this_batch = []
-            hyp_words = []
-            for ref_text in texts:
-                ref_words = ref_text.split()
-                this_batch.append((ref_words, hyp_words))
-
-            for lm_scale in results.keys():
-                results[lm_scale].extend(this_batch)
-
-        num_cuts += len(texts)
-
-        if batch_idx % 100 == 0:
-            batch_str = f"{batch_idx}/{num_batches}"
-
-            logging.info(f"batch {batch_str}, cuts processed until now is {num_cuts}")
-    return results
-
-
-def save_results(
-    params: AttributeDict,
-    test_set_name: str,
-    results_dict: Dict[str, List[Tuple[str, List[str], List[str]]]],
-):
-    if params.method in ("attention-decoder", "rnn-lm"):
-        # Set it to False since there are too many logs.
-        enable_log = False
-    else:
-        enable_log = True
-    test_set_wers = dict()
-    for key, results in results_dict.items():
-        recog_path = params.exp_dir / f"recogs-{test_set_name}-{key}.txt"
-        results = sorted(results)
-        store_transcripts(filename=recog_path, texts=results)
-        if enable_log:
-            logging.info(f"The transcripts are stored in {recog_path}")
-
-        # The following prints out WERs, per-word error statistics and aligned
-        # ref/hyp pairs.
-        errs_filename = params.exp_dir / f"errs-{test_set_name}-{key}.txt"
-        with open(errs_filename, "w") as f:
-            wer = write_error_stats(
-                f, f"{test_set_name}-{key}", results, enable_log=enable_log
-            )
-            test_set_wers[key] = wer
-
-        if enable_log:
-            logging.info("Wrote detailed error stats to {}".format(errs_filename))
-
-    test_set_wers = sorted(test_set_wers.items(), key=lambda x: x[1])
-    errs_info = params.exp_dir / f"wer-summary-{test_set_name}.txt"
-    with open(errs_info, "w") as f:
-        print("settings\tWER", file=f)
-        for key, val in test_set_wers:
-            print("{}\t{}".format(key, val), file=f)
-
-    s = "\nFor {}, WER of different settings are:\n".format(test_set_name)
-    note = "\tbest for {}".format(test_set_name)
-    for key, val in test_set_wers:
-        s += "{}\t{}{}\n".format(key, val, note)
-        note = ""
-    logging.info(s)
-
-
-@torch.no_grad()
-def main():
-    parser = get_parser()
-    LibriSpeechAsrDataModule.add_arguments(parser)
-    args = parser.parse_args()
-    args.exp_dir = Path(args.exp_dir)
-    args.lang_dir = Path(args.lang_dir)
-    args.lm_dir = Path(args.lm_dir)
-
-    params = get_params()
-    params.update(vars(args))
-
-    setup_logger(f"{params.exp_dir}/log-{params.method}/log-decode")
-    logging.info("Decoding started")
-    logging.info(params)
-
-    lexicon = Lexicon(params.lang_dir)
-    max_token_id = max(lexicon.tokens) - 1
-    num_classes = max_token_id + 1  # +1 for the blank
-
-    device = torch.device("cpu")
-    if torch.cuda.is_available():
-        device = torch.device("cuda", 0)
-
-    logging.info(f"device: {device}")
-
-    graph_compiler = BpeCtcTrainingGraphCompiler(
-        params.lang_dir,
-        device=device,
-        sos_token="<sos/eos>",
-        eos_token="<sos/eos>",
-    )
-    sos_id = graph_compiler.sos_id
-    eos_id = graph_compiler.eos_id
-
-    params.num_classes = num_classes
-    params.sos_id = sos_id
-    params.eos_id = eos_id
-
-    if params.method == "ctc-decoding" or params.method == "ctc-greedy-search":
-        HLG = None
-        H = k2.ctc_topo(
-            max_token=max_token_id,
-            modified=False,
-            device=device,
-        )
-        bpe_model = spm.SentencePieceProcessor()
-        bpe_model.load(str(params.lang_dir / "bpe.model"))
-    else:
-        H = None
-        bpe_model = None
-        HLG = k2.Fsa.from_dict(
-            torch.load(f"{params.lang_dir}/HLG.pt", map_location=device)
-        )
-        assert HLG.requires_grad is False
-
-        if not hasattr(HLG, "lm_scores"):
-            HLG.lm_scores = HLG.scores.clone()
-
-    if params.method in (
-        "nbest-rescoring",
-        "whole-lattice-rescoring",
-        "attention-decoder",
-        "rnn-lm",
-    ):
-        if not (params.lm_dir / "G_4_gram.pt").is_file():
-            logging.info("Loading G_4_gram.fst.txt")
-            logging.warning("It may take 8 minutes.")
-            with open(params.lm_dir / "G_4_gram.fst.txt") as f:
-                first_word_disambig_id = lexicon.word_table["#0"]
-
-                G = k2.Fsa.from_openfst(f.read(), acceptor=False)
-                # G.aux_labels is not needed in later computations, so
-                # remove it here.
-                del G.aux_labels
-                # CAUTION: The following line is crucial.
-                # Arcs entering the back-off state have label equal to #0.
-                # We have to change it to 0 here.
-                G.labels[G.labels >= first_word_disambig_id] = 0
-                # See https://github.com/k2-fsa/k2/issues/874
-                # for why we need to set G.properties to None
-                G.__dict__["_properties"] = None
-                G = k2.Fsa.from_fsas([G]).to(device)
-                G = k2.arc_sort(G)
-                # Save a dummy value so that it can be loaded in C++.
-                # See https://github.com/pytorch/pytorch/issues/67902
-                # for why we need to do this.
-                G.dummy = 1
-
-                torch.save(G.as_dict(), params.lm_dir / "G_4_gram.pt")
-        else:
-            logging.info("Loading pre-compiled G_4_gram.pt")
-            d = torch.load(params.lm_dir / "G_4_gram.pt", map_location=device)
-            G = k2.Fsa.from_dict(d)
-
-        if params.method in [
-            "whole-lattice-rescoring",
-            "attention-decoder",
-            "rnn-lm",
-        ]:
-            # Add epsilon self-loops to G as we will compose
-            # it with the whole lattice later
-            G = k2.add_epsilon_self_loops(G)
-            G = k2.arc_sort(G)
-            G = G.to(device)
-
-        # G.lm_scores is used to replace HLG.lm_scores during
-        # LM rescoring.
-        G.lm_scores = G.scores.clone()
-    else:
-        G = None
-
-    model = Conformer(
-        num_features=params.feature_dim,
-        nhead=params.nhead,
-        d_model=params.encoder_dim,
-        num_classes=num_classes,
-        subsampling_factor=params.subsampling_factor,
-        num_encoder_layers=params.num_encoder_layers,
-        num_decoder_layers=params.num_decoder_layers,
-    )
-
-    if params.use_best_model:
-        logging.info("Loading best-valid-loss.py")
-        load_checkpoint(f"{params.exp_dir}/best-valid-loss.pt", model)
-    elif not params.use_averaged_model:
-        if params.iter > 0:
-            filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
-                : params.avg
-            ]
-            if len(filenames) == 0:
-                raise ValueError(
-                    f"No checkpoints found for"
-                    f" --iter {params.iter}, --avg {params.avg}"
-                )
-            elif len(filenames) < params.avg:
-                raise ValueError(
-                    f"Not enough checkpoints ({len(filenames)}) found for"
-                    f" --iter {params.iter}, --avg {params.avg}"
-                )
-            logging.info(f"averaging {filenames}")
-            model.to(device)
-            model.load_state_dict(average_checkpoints(filenames, device=device))
-        elif params.avg == 1:
-            load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
-        else:
-            start = params.epoch - params.avg + 1
-            filenames = []
-            for i in range(start, params.epoch + 1):
-                if i >= 1:
-                    filenames.append(f"{params.exp_dir}/epoch-{i}.pt")
-            logging.info(f"averaging {filenames}")
-            model.to(device)
-            model.load_state_dict(average_checkpoints(filenames, device=device))
-    else:
-        if params.iter > 0:
-            filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
-                : params.avg + 1
-            ]
-            if len(filenames) == 0:
-                raise ValueError(
-                    f"No checkpoints found for"
-                    f" --iter {params.iter}, --avg {params.avg}"
-                )
-            elif len(filenames) < params.avg + 1:
-                raise ValueError(
-                    f"Not enough checkpoints ({len(filenames)}) found for"
-                    f" --iter {params.iter}, --avg {params.avg}"
-                )
-            filename_start = filenames[-1]
-            filename_end = filenames[0]
-            logging.info(
-                "Calculating the averaged model over iteration checkpoints"
-                f" from {filename_start} (excluded) to {filename_end}"
-            )
-            model.to(device)
-            model.load_state_dict(
-                average_checkpoints_with_averaged_model(
-                    filename_start=filename_start,
-                    filename_end=filename_end,
-                    device=device,
-                )
-            )
-        else:
-            assert params.avg > 0, params.avg
-            start = params.epoch - params.avg
-            assert start >= 1, start
-            filename_start = f"{params.exp_dir}/epoch-{start}.pt"
-            filename_end = f"{params.exp_dir}/epoch-{params.epoch}.pt"
-            logging.info(
-                f"Calculating the averaged model over epoch range from "
-                f"{start} (excluded) to {params.epoch}"
-            )
-            model.to(device)
-            model.load_state_dict(
-                average_checkpoints_with_averaged_model(
-                    filename_start=filename_start,
-                    filename_end=filename_end,
-                    device=device,
-                )
-            )
-
-    model.to(device)
-    model.eval()
-    num_param = sum([p.numel() for p in model.parameters()])
-    logging.info(f"Number of model parameters: {num_param}")
-
-    rnn_lm_model = None
-    if params.method == "rnn-lm":
-        rnn_lm_model = RnnLmModel(
-            vocab_size=params.num_classes,
-            embedding_dim=params.rnn_lm_embedding_dim,
-            hidden_dim=params.rnn_lm_hidden_dim,
-            num_layers=params.rnn_lm_num_layers,
-            tie_weights=params.rnn_lm_tie_weights,
-        )
-        if params.rnn_lm_avg == 1:
-            load_checkpoint(
-                f"{params.rnn_lm_exp_dir}/epoch-{params.rnn_lm_epoch}.pt",
-                rnn_lm_model,
-            )
-            rnn_lm_model.to(device)
-        else:
-            rnn_lm_model = load_averaged_model(
-                params.rnn_lm_exp_dir,
-                rnn_lm_model,
-                params.rnn_lm_epoch,
-                params.rnn_lm_avg,
-                device,
-            )
-        rnn_lm_model.eval()
-
-    # we need cut ids to display recognition results.
-    args.return_cuts = True
-    librispeech = LibriSpeechAsrDataModule(args)
-
-    dev_clean_cuts = librispeech.dev_clean_cuts()
-    dev_other_cuts = librispeech.dev_other_cuts()
-    test_clean_cuts = librispeech.test_clean_cuts()
-    test_other_cuts = librispeech.test_other_cuts()
-
-    dev_clean_dl = librispeech.test_dataloaders(dev_clean_cuts)
-    dev_other_dl = librispeech.test_dataloaders(dev_other_cuts)
-    test_clean_dl = librispeech.test_dataloaders(test_clean_cuts)
-    test_other_dl = librispeech.test_dataloaders(test_other_cuts)
-
-    test_sets = ["test-clean", "test-other"]
-    #test_sets = ["test-clean"]
-    test_dl = [test_clean_dl , test_other_dl]
-    #test_dl = [test_clean_dl]
-
-    for test_set, test_dl in zip(test_sets, test_dl):
-        results_dict = decode_dataset(
-            dl=test_dl,
-            params=params,
-            model=model,
-            rnn_lm_model=rnn_lm_model,
-            HLG=HLG,
-            H=H,
-            bpe_model=bpe_model,
-            word_table=lexicon.word_table,
-            G=G,
-            sos_id=sos_id,
-            eos_id=eos_id,
-        )
-
-        save_results(params=params, test_set_name=test_set, results_dict=results_dict)
-
-    logging.info("Done!")
-
-
-torch.set_num_threads(1)
-torch.set_num_interop_threads(1)
-
-if __name__ == "__main__":
-    main()
diff --git a/egs/librispeech/WSASR/conformer_ctc2/train_lexicon_new.py b/egs/librispeech/WSASR/conformer_ctc2/train_lexicon_new.py
deleted file mode 100755
index bc472d142..000000000
--- a/egs/librispeech/WSASR/conformer_ctc2/train_lexicon_new.py
+++ /dev/null
@@ -1,1177 +0,0 @@
-#!/usr/bin/env python3
-# Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang,
-#                                                  Wei Kang,
-#                                                  Mingshuang Luo,
-#                                                  Zengwei Yao,
-#                                                  Quandong Wang)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Usage:
-
-export CUDA_VISIBLE_DEVICES="0,1,2,3"
-
-./conformer_ctc2/train.py \
-  --world-size 4 \
-  --num-epochs 30 \
-  --start-epoch 1 \
-  --exp-dir conformer_ctc2/exp \
-  --full-libri 1 \
-  --max-duration 300
-
-# For mix precision training:
-
-./conformer_ctc2/train.py \
-  --world-size 4 \
-  --num-epochs 30 \
-  --start-epoch 1 \
-  --use-fp16 1 \
-  --exp-dir conformer_ctc2/exp \
-  --full-libri 1 \
-  --max-duration 550
-
-"""
-import sys
-
-import argparse
-import copy
-import logging
-import warnings
-from pathlib import Path
-from shutil import copyfile
-from typing import Any, Dict, Optional, Tuple, Union
-
-import k2
-import optim
-import torch
-import torch.multiprocessing as mp
-import torch.nn as nn
-from asr_datamodule import LibriSpeechAsrDataModule
-from conformer import Conformer
-from lhotse.cut import Cut
-from lhotse.dataset.sampling.base import CutSampler
-from lhotse.utils import fix_random_seed
-from optim import Eden, Eve
-from torch import Tensor
-from torch.cuda.amp import GradScaler
-from torch.nn.parallel import DistributedDataParallel as DDP
-from torch.utils.tensorboard import SummaryWriter
-
-from icefall import diagnostics
-from icefall.checkpoint import load_checkpoint, remove_checkpoints
-from icefall.checkpoint import save_checkpoint as save_checkpoint_impl
-from icefall.checkpoint import (
-    save_checkpoint_with_global_batch_idx,
-    update_averaged_model,
-)
-from icefall.dist import cleanup_dist, setup_dist
-from icefall.env import get_env_info
-#from icefall.otc_lexicon_graph_compiler import OtcTrainingGraphCompiler
-from icefall.otc_graph_compiler import OtcTrainingGraphCompiler
-from icefall.lexicon import Lexicon
-from icefall.utils import (
-    AttributeDict,
-    MetricsTracker,
-    get_texts,
-    encode_supervisions_otc,
-    setup_logger,
-    str2bool,
-)
-from icefall.decode import one_best_decoding
-
-LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
-
-
-def get_parser():
-    parser = argparse.ArgumentParser(
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter
-    )
-
-    parser.add_argument(
-        "--world-size", type=int, default=1, help="Number of GPUs for DDP training.",
-    )
-
-    parser.add_argument(
-        "--master-port",
-        type=int,
-        default=12354,
-        help="Master port to use for DDP training.",
-    )
-
-    parser.add_argument(
-        "--tensorboard",
-        type=str2bool,
-        default=True,
-        help="Should various information be logged in tensorboard.",
-    )
-
-    parser.add_argument(
-        "--num-epochs", type=int, default=30, help="Number of epochs to train.",
-    )
-
-    parser.add_argument(
-        "--start-epoch",
-        type=int,
-        default=1,
-        help="""Resume training from this epoch. It should be positive.
-        If larger than 1, it will load checkpoint from
-        exp-dir/epoch-{start_epoch-1}.pt
-        """,
-    )
-
-    parser.add_argument(
-        "--start-batch",
-        type=int,
-        default=0,
-        help="""If positive, --start-epoch is ignored and
-        it loads the checkpoint from exp-dir/checkpoint-{start_batch}.pt
-        """,
-    )
-
-    parser.add_argument(
-        "--exp-dir",
-        type=str,
-        default="conformer_ctc2/exp",
-        help="""The experiment dir.
-        It specifies the directory where all training related
-        files, e.g., checkpoints, log, etc, are saved
-        """,
-    )
-
-    parser.add_argument(
-        "--lang-dir",
-        type=str,
-        default="data/lang_bpe_500",
-        help="""The lang dir
-        It contains language related input files such as
-        "lexicon.txt"
-        """,
-    )
-
-    parser.add_argument(
-        "--initial-lr",
-        type=float,
-        default=0.003,
-        help="""The initial learning rate. This value should not need to be
-        changed.""",
-    )
-
-    parser.add_argument(
-        "--lr-batches",
-        type=float,
-        default=5000,
-        help="""Number of steps that affects how rapidly the learning rate decreases.
-        We suggest not to change this.""",
-    )
-
-    parser.add_argument(
-        "--lr-epochs",
-        type=float,
-        default=6,
-        help="""Number of epochs that affects how rapidly the learning rate decreases.
-        """,
-    )
-
-    parser.add_argument(
-        "--att-rate",
-        type=float,
-        default=0.0,
-        help="""The attention rate.
-        The total loss is (1 -  att_rate) * ctc_loss + att_rate * att_loss
-        """,
-    )
-
-    parser.add_argument(
-        "--num-decoder-layers",
-        type=int,
-        default=0,
-        help="""Number of decoder layer of transformer decoder.
-        Setting this to 0 will not create the decoder at all (pure CTC model)
-        """,
-    )
-
-    parser.add_argument(
-        "--seed",
-        type=int,
-        default=42,
-        help="The seed for random generators intended for reproducibility",
-    )
-
-    parser.add_argument(
-        "--print-diagnostics",
-        type=str2bool,
-        default=False,
-        help="Accumulate stats on activations, print them and exit.",
-    )
-
-    parser.add_argument(
-        "--save-every-n",
-        type=int,
-        default=8000,
-        help="""Save checkpoint after processing this number of batches"
-        periodically. We save checkpoint to exp-dir/ whenever
-        params.batch_idx_train % save_every_n == 0. The checkpoint filename
-        has the form: f'exp-dir/checkpoint-{params.batch_idx_train}.pt'
-        Note: It also saves checkpoint to `exp-dir/epoch-xxx.pt` at the
-        end of each epoch where `xxx` is the epoch number counting from 0.
-        """,
-    )
-
-    parser.add_argument(
-        "--keep-last-k",
-        type=int,
-        default=5,
-        help="""Only keep this number of checkpoints on disk.
-        For instance, if it is 3, there are only 3 checkpoints
-        in the exp-dir with filenames `checkpoint-xxx.pt`.
-        It does not affect checkpoints with name `epoch-xxx.pt`.
-        """,
-    )
-
-    parser.add_argument(
-        "--average-period",
-        type=int,
-        default=100,
-        help="""Update the averaged model, namely `model_avg`, after processing
-        this number of batches. `model_avg` is a separate version of model,
-        in which each floating-point parameter is the average of all the
-        parameters from the start of training. Each time we take the average,
-        we do: `model_avg = model * (average_period / batch_idx_train) +
-            model_avg * ((batch_idx_train - average_period) / batch_idx_train)`.
-        """,
-    )
-
-    parser.add_argument(
-        "--use-fp16",
-        type=str2bool,
-        default=False,
-        help="Whether to use half precision training.",
-    )
-
-    parser.add_argument(
-        "--wasr-type", type=str, default="star",
-    )
-
-    parser.add_argument(
-        "--wasr-token", type=str, default="<star>",
-    )
-
-    parser.add_argument(
-        "--allow-bypass", type=str2bool, default=True,
-    )
-
-    parser.add_argument(
-        "--allow-self-loop", type=str2bool, default=True,
-    )
-
-    parser.add_argument(
-        "--initial-bypass-penalty", type=float, default=0,
-    )
-
-    parser.add_argument(
-        "--initial-self-loop-penalty", type=float, default=0,
-    )
-
-    parser.add_argument(
-        "--min-penalty", type=float, default=0,
-    )
-
-    parser.add_argument(
-        "--bypass-penalty-decay", type=float, default=1.0,
-    )
-
-    parser.add_argument(
-        "--self-loop-penalty-decay", type=float, default=1.0,
-    )
-
-    parser.add_argument(
-        "--show-alignment", type=str2bool, default=False,
-    )
-
-    return parser
-
-
-def get_params() -> AttributeDict:
-    """Return a dict containing training parameters.
-
-    All training related parameters that are not passed from the commandline
-    are saved in the variable `params`.
-
-    Commandline options are merged into `params` after they are parsed, so
-    you can also access them via `params`.
-
-    Explanation of options saved in `params`:
-
-        - best_train_loss: Best training loss so far. It is used to select
-                           the model that has the lowest training loss. It is
-                           updated during the training.
-
-        - best_valid_loss: Best validation loss so far. It is used to select
-                           the model that has the lowest validation loss. It is
-                           updated during the training.
-
-        - best_train_epoch: It is the epoch that has the best training loss.
-
-        - best_valid_epoch: It is the epoch that has the best validation loss.
-
-        - batch_idx_train: Used to writing statistics to tensorboard. It
-                           contains number of batches trained so far across
-                           epochs.
-
-        - log_interval:  Print training loss if batch_idx % log_interval` is 0
-
-        - reset_interval: Reset statistics if batch_idx % reset_interval is 0
-
-        - valid_interval:  Run validation if batch_idx % valid_interval is 0
-
-        - feature_dim: The model input dim. It has to match the one used
-                       in computing features.
-
-        - subsampling_factor:  The subsampling factor for the model.
-
-        - encoder_dim: Hidden dim for multi-head attention model.
-
-        - num_decoder_layers: Number of decoder layer of transformer decoder.
-
-        - beam_size: It is used in k2.ctc_loss
-
-        - reduction: It is used in k2.ctc_loss
-
-        - use_double_scores: It is used in k2.ctc_loss
-
-        - warm_step: The warm_step for Noam optimizer.
-    """
-    params = AttributeDict(
-        {
-            "best_train_loss": float("inf"),
-            "best_valid_loss": float("inf"),
-            "best_train_epoch": -1,
-            "best_valid_epoch": -1,
-            "batch_idx_train": 0,
-            "log_interval": 1,
-            "reset_interval": 200,
-            "valid_interval": 800,  # For the 100h subset, use 800
-            # parameters for conformer
-            "feature_dim": 768,
-            "subsampling_factor": 2,
-            "encoder_dim": 512,
-            "nhead": 8,
-            "dim_feedforward": 2048,
-            "num_encoder_layers": 12,
-            # parameters for ctc loss
-            "beam_size": 10,
-            "reduction": "sum",
-            "use_double_scores": True,
-            # parameters for Noam
-            "model_warm_step": 3000,  # arg given to model, not for lrate
-            "env_info": get_env_info(),
-        }
-    )
-
-    return params
-
-
-def load_checkpoint_if_available(
-    params: AttributeDict,
-    model: nn.Module,
-    model_avg: nn.Module = None,
-    optimizer: Optional[torch.optim.Optimizer] = None,
-    scheduler: Optional[LRSchedulerType] = None,
-) -> Optional[Dict[str, Any]]:
-    """Load checkpoint from file.
-
-    If params.start_batch is positive, it will load the checkpoint from
-    `params.exp_dir/checkpoint-{params.start_batch}.pt`. Otherwise, if
-    params.start_epoch is larger than 1, it will load the checkpoint from
-    `params.start_epoch - 1`.
-
-    Apart from loading state dict for `model` and `optimizer` it also updates
-    `best_train_epoch`, `best_train_loss`, `best_valid_epoch`,
-    and `best_valid_loss` in `params`.
-
-    Args:
-      params:
-        The return value of :func:`get_params`.
-      model:
-        The training model.
-      model_avg:
-        The stored model averaged from the start of training.
-      optimizer:
-        The optimizer that we are using.
-      scheduler:
-        The scheduler that we are using.
-    Returns:
-      Return a dict containing previously saved training info.
-    """
-    if params.start_batch > 0:
-        filename = params.exp_dir / f"checkpoint-{params.start_batch}.pt"
-    elif params.start_epoch > 1:
-        filename = params.exp_dir / f"epoch-{params.start_epoch-1}.pt"
-    else:
-        return None
-
-    assert filename.is_file(), f"{filename} does not exist!"
-
-    saved_params = load_checkpoint(
-        filename,
-        model=model,
-        model_avg=model_avg,
-        optimizer=optimizer,
-        scheduler=scheduler,
-    )
-
-    keys = [
-        "best_train_epoch",
-        "best_valid_epoch",
-        "batch_idx_train",
-        "best_train_loss",
-        "best_valid_loss",
-    ]
-    for k in keys:
-        params[k] = saved_params[k]
-
-    if params.start_batch > 0:
-        if "cur_epoch" in saved_params:
-            params["start_epoch"] = saved_params["cur_epoch"]
-
-        if "cur_batch_idx" in saved_params:
-            params["cur_batch_idx"] = saved_params["cur_batch_idx"]
-
-    return saved_params
-
-
-def save_checkpoint(
-    params: AttributeDict,
-    model: Union[nn.Module, DDP],
-    model_avg: Optional[nn.Module] = None,
-    optimizer: Optional[torch.optim.Optimizer] = None,
-    scheduler: Optional[LRSchedulerType] = None,
-    sampler: Optional[CutSampler] = None,
-    scaler: Optional[GradScaler] = None,
-    rank: int = 0,
-) -> None:
-    """Save model, optimizer, scheduler and training stats to file.
-
-    Args:
-      params:
-        It is returned by :func:`get_params`.
-      model:
-        The training model.
-      model_avg:
-        The stored model averaged from the start of training.
-      optimizer:
-        The optimizer used in the training.
-      sampler:
-       The sampler for the training dataset.
-      scaler:
-        The scaler used for mix precision training.
-    """
-    if rank != 0:
-        return
-    filename = params.exp_dir / f"epoch-{params.cur_epoch}.pt"
-    save_checkpoint_impl(
-        filename=filename,
-        model=model,
-        model_avg=model_avg,
-        params=params,
-        optimizer=optimizer,
-        scheduler=scheduler,
-        sampler=sampler,
-        scaler=scaler,
-        rank=rank,
-    )
-
-    if params.best_train_epoch == params.cur_epoch:
-        best_train_filename = params.exp_dir / "best-train-loss.pt"
-        copyfile(src=filename, dst=best_train_filename)
-
-    if params.best_valid_epoch == params.cur_epoch:
-        best_valid_filename = params.exp_dir / "best-valid-loss.pt"
-        copyfile(src=filename, dst=best_valid_filename)
-
-
-def compute_loss(
-    params: AttributeDict,
-    model: Union[nn.Module, DDP],
-    batch: dict,
-    graph_compiler: OtcTrainingGraphCompiler,
-    is_training: bool,
-    warmup: float = 2.0,
-    bypass_penalty: float = 0.0,
-    self_loop_penalty: float = 0.0,
-) -> Tuple[Tensor, MetricsTracker]:
-    """
-    Compute CTC loss given the model and its inputs.
-
-    Args:
-      params:
-        Parameters for training. See :func:`get_params`.
-      model:
-        The model for training. It is an instance of Conformer in our case.
-      batch:
-        A batch of data. See `lhotse.dataset.K2SpeechRecognitionDataset()`
-        for the content in it.
-      graph_compiler:
-        It is used to build a decoding graph from a ctc topo and training
-        transcript. The training transcript is contained in the given `batch`,
-        while the ctc topo is built when this compiler is instantiated.
-      is_training:
-        True for training. False for validation. When it is True, this
-        function enables autograd during computation; when it is False, it
-        disables autograd.
-     warmup: a floating point value which increases throughout training;
-        values >= 1.0 are fully warmed up and have all modules present.
-    """
-    device = model.device if isinstance(model, DDP) else next(model.parameters()).device
-    feature = batch["inputs"]
-    # at entry, feature is (N, T, C)
-    assert feature.ndim == 3
-    feature = feature.to(device)
-
-    supervisions = batch["supervisions"]
-    feature_lens = supervisions["num_frames"].to(device)
-
-    with torch.set_grad_enabled(is_training):
-        nnet_output, encoder_memory, memory_mask = model(
-            feature, supervisions, warmup=warmup
-        )
-        # Note: we assume <star> is the last symbol in token list (tokens.txt)
-        _, _, C = nnet_output.shape
-        star_log_prob = torch.logsumexp(
-            nnet_output[:, :, 1:], dim=-1, keepdim=True
-        ) - torch.log(torch.tensor([C - 1])).to(device)
-        nnet_output = torch.cat([nnet_output, star_log_prob], dim=-1)
-
-    # NOTE: We need `encode_supervisions` to sort sequences with
-    # different duration in decreasing order, required by
-    # `k2.intersect_dense` called in `k2.ctc_loss`
-    supervision_segments, texts, ids, orig_texts = encode_supervisions_otc(
-        supervisions, subsampling_factor=params.subsampling_factor
-    )
-    decoding_graph = graph_compiler.compile(
-        texts=texts,
-        allow_bypass_arc=params.allow_bypass,
-        allow_self_loop_arc=params.allow_self_loop,
-        bypass_weight=bypass_penalty,
-        self_loop_weight=self_loop_penalty,
-    )
-
-    dense_fsa_vec = k2.DenseFsaVec(nnet_output, supervision_segments, allow_truncate=3,)
-
-    ctc_loss = k2.ctc_loss(
-        decoding_graph=decoding_graph,
-        dense_fsa_vec=dense_fsa_vec,
-        output_beam=params.beam_size,
-        reduction=params.reduction,
-        use_double_scores=params.use_double_scores,
-    )
-
-    if params.att_rate != 0.0:
-        raise ValueError("not supported")
-    #        with torch.set_grad_enabled(is_training):
-    #            mmodel = model.module if hasattr(model, "module") else model
-    #            # Note: We need to generate an unsorted version of token_ids
-    #            # `encode_supervisions()` called above sorts text, but
-    #            # encoder_memory and memory_mask are not sorted, so we
-    #            # use an unsorted version `supervisions["text"]` to regenerate
-    #            # the token_ids
-    #            #
-    #            # See https://github.com/k2-fsa/icefall/issues/97
-    #            # for more details
-    #            unsorted_token_ids = graph_compiler.texts_to_ids(supervisions["text"])
-    #            att_loss = mmodel.decoder_forward(
-    #                encoder_memory,
-    #                memory_mask,
-    #                token_ids=unsorted_token_ids,
-    #                sos_id=graph_compiler.sos_id,
-    #                eos_id=graph_compiler.eos_id,
-    #            )
-    #        loss = (1.0 - params.att_rate) * ctc_loss + params.att_rate * att_loss
-    else:
-        loss = ctc_loss
-        att_loss = torch.tensor([0])
-
-    assert loss.requires_grad == is_training
-
-    info = MetricsTracker()
-    with warnings.catch_warnings():
-        warnings.simplefilter("ignore")
-        info["frames"] = (feature_lens // params.subsampling_factor).sum().item()
-    info["ctc_loss"] = ctc_loss.detach().cpu().item()
-    if params.att_rate != 0.0:
-        info["att_loss"] = att_loss.detach().cpu().item()
-
-    # Note: We use reduction=sum while computing the loss.
-    info["loss"] = loss.detach().cpu().item()
-
-    # `utt_duration` and `utt_pad_proportion` would be normalized by `utterances`  # noqa
-    info["utterances"] = feature.size(0)
-    # averaged input duration in frames over utterances
-    info["utt_duration"] = feature_lens.sum().item()
-    # averaged padding proportion over utterances
-    info["utt_pad_proportion"] = (
-        ((feature.size(1) - feature_lens) / feature.size(1)).sum().item()
-    )
-
-    if params.show_alignment:
-        for index, id in enumerate(ids):
-            if id.startswith("103-1240"):
-                ref_text = orig_texts[index]
-                lattice = k2.intersect_dense(
-                    decoding_graph, dense_fsa_vec, params.beam_size,
-                )
-                best_path = one_best_decoding(
-                    lattice=lattice, use_double_scores=params.use_double_scores,
-                )
-                hyp = get_texts(best_path)[index]
-                hyp_text_list = [graph_compiler.token_table[i] for i in hyp]
-                hyp_text = " ".join(hyp_text_list)
-                #hyp_text = graph_compiler.sp.decode(hyp_text_list)
-
-                logging.info(f"[utt]: {id}")
-                logging.info(f"[ref]: {ref_text}")
-                logging.info(f"[ali]: {hyp_text}")
-
-        return loss, info
-
-
-def compute_validation_loss(
-    params: AttributeDict,
-    model: Union[nn.Module, DDP],
-    graph_compiler: OtcTrainingGraphCompiler,
-    valid_dl: torch.utils.data.DataLoader,
-    world_size: int = 1,
-) -> MetricsTracker:
-    """Run the validation process."""
-    model.eval()
-
-    tot_loss = MetricsTracker()
-
-    for batch_idx, batch in enumerate(valid_dl):
-        loss, loss_info = compute_loss(
-            params=params,
-            model=model,
-            batch=batch,
-            graph_compiler=graph_compiler,
-            is_training=False,
-        )
-        assert loss.requires_grad is False
-        tot_loss = tot_loss + loss_info
-
-    if world_size > 1:
-        tot_loss.reduce(loss.device)
-
-    loss_value = tot_loss["loss"] / tot_loss["frames"]
-    if loss_value < params.best_valid_loss:
-        params.best_valid_epoch = params.cur_epoch
-        params.best_valid_loss = loss_value
-
-    return tot_loss
-
-
-def train_one_epoch(
-    params: AttributeDict,
-    model: Union[nn.Module, DDP],
-    optimizer: torch.optim.Optimizer,
-    graph_compiler: OtcTrainingGraphCompiler,
-    scheduler: LRSchedulerType,
-    train_dl: torch.utils.data.DataLoader,
-    valid_dl: torch.utils.data.DataLoader,
-    scaler: GradScaler,
-    model_avg: Optional[nn.Module] = None,
-    tb_writer: Optional[SummaryWriter] = None,
-    world_size: int = 1,
-    rank: int = 0,
-    bypass_penalty: float = 0,
-    self_loop_penalty: float = 0,
-) -> None:
-    """Train the model for one epoch.
-
-    The training loss from the mean of all frames is saved in
-    `params.train_loss`. It runs the validation process every
-    `params.valid_interval` batches.
-
-    Args:
-      params:
-        It is returned by :func:`get_params`.
-      model:
-        The model for training.
-      optimizer:
-        The optimizer we are using.
-      graph_compiler:
-        It is used to convert transcripts to FSAs.
-      scheduler:
-        The learning rate scheduler, we call step() every step.
-      train_dl:
-        Dataloader for the training dataset.
-      valid_dl:
-        Dataloader for the validation dataset.
-      scaler:
-        The scaler used for mix precision training.
-      model_avg:
-        The stored model averaged from the start of training.
-      tb_writer:
-        Writer to write log messages to tensorboard.
-      world_size:
-        Number of nodes in DDP training. If it is 1, DDP is disabled.
-      rank:
-        The rank of the node in DDP training. If no DDP is used, it should
-        be set to 0.
-    """
-    model.train()
-
-    tot_loss = MetricsTracker()
-
-    cur_batch_idx = params.get("cur_batch_idx", 0)
-
-    for batch_idx, batch in enumerate(train_dl):
-        if batch_idx < cur_batch_idx:
-            continue
-        cur_batch_idx = batch_idx
-
-        params.batch_idx_train += 1
-        batch_size = len(batch["supervisions"]["text"])
-        # batch_name = batch["supervisions"]["uttid"]
-        batch_name = "fake"
-
-        with torch.cuda.amp.autocast(enabled=params.use_fp16):
-            loss, loss_info = compute_loss(
-                params=params,
-                model=model,
-                batch=batch,
-                graph_compiler=graph_compiler,
-                is_training=True,
-                warmup=(params.batch_idx_train / params.model_warm_step),
-                bypass_penalty=bypass_penalty,
-                self_loop_penalty=self_loop_penalty,
-            )
-        # summary stats
-        tot_loss = (tot_loss * (1 - 1 / params.reset_interval)) + loss_info
-
-        # NOTE: We use reduction==sum and loss is computed over utterances
-        # in the batch and there is no normalization to it so far.
-        # scaler.scale(loss).backward()
-
-        try:
-            # loss.backward()
-            scaler.scale(loss).backward()
-        except RuntimeError as e:
-            if "CUDA out of memory" in str(e):
-                logging.error(
-                    f"failing batch size:{batch_size} "
-                    f"failing batch names {batch_name}"
-                )
-            raise
-
-        scheduler.step_batch(params.batch_idx_train)
-        scaler.step(optimizer)
-        scaler.update()
-        optimizer.zero_grad()
-
-        if params.print_diagnostics and batch_idx == 30:
-            return
-
-        if (
-            rank == 0
-            and params.batch_idx_train > 0
-            and params.batch_idx_train % params.average_period == 0
-        ):
-            update_averaged_model(
-                params=params, model_cur=model, model_avg=model_avg,
-            )
-
-        if (
-            params.batch_idx_train > 0
-            and params.batch_idx_train % params.save_every_n == 0
-        ):
-            params.cur_batch_idx = batch_idx
-            save_checkpoint_with_global_batch_idx(
-                out_dir=params.exp_dir,
-                global_batch_idx=params.batch_idx_train,
-                model=model,
-                model_avg=model_avg,
-                params=params,
-                optimizer=optimizer,
-                scheduler=scheduler,
-                sampler=train_dl.sampler,
-                scaler=scaler,
-                rank=rank,
-            )
-            del params.cur_batch_idx
-            remove_checkpoints(
-                out_dir=params.exp_dir, topk=params.keep_last_k, rank=rank,
-            )
-
-        if batch_idx % params.log_interval == 0:
-            cur_lr = scheduler.get_last_lr()[0]
-            logging.info(
-                f"Epoch {params.cur_epoch}, "
-                f"batch {batch_idx}, loss[{loss_info}], "
-                f"tot_loss[{tot_loss}], batch size: {batch_size}, "
-                f"lr: {cur_lr:.2e}"
-            )
-            if loss_info["ctc_loss"] == float("inf") or loss_info["att_loss"] == float(
-                "inf"
-            ):
-                logging.error(
-                    "Your loss contains inf, something goes wrong"
-                    f"failing batch names {batch_name}"
-                )
-            if tb_writer is not None:
-                tb_writer.add_scalar(
-                    "train/learning_rate", cur_lr, params.batch_idx_train
-                )
-
-                loss_info.write_summary(
-                    tb_writer, "train/current_", params.batch_idx_train
-                )
-                tot_loss.write_summary(tb_writer, "train/tot_", params.batch_idx_train)
-
-        if batch_idx > 0 and batch_idx % params.valid_interval == 0:
-            logging.info("Computing validation loss")
-            valid_info = compute_validation_loss(
-                params=params,
-                model=model,
-                graph_compiler=graph_compiler,
-                valid_dl=valid_dl,
-                world_size=world_size,
-            )
-            model.train()
-            logging.info(f"Epoch {params.cur_epoch}, validation: {valid_info}")
-            if tb_writer is not None:
-                valid_info.write_summary(
-                    tb_writer, "train/valid_", params.batch_idx_train
-                )
-
-    loss_value = tot_loss["loss"] / tot_loss["frames"]
-    params.train_loss = loss_value
-    if params.train_loss < params.best_train_loss:
-        params.best_train_epoch = params.cur_epoch
-        params.best_train_loss = params.train_loss
-
-
-def run(rank, world_size, args):
-    """
-    Args:
-      rank:
-        It is a value between 0 and `world_size-1`, which is
-        passed automatically by `mp.spawn()` in :func:`main`.
-        The node with rank 0 is responsible for saving checkpoint.
-      world_size:
-        Number of GPUs for DDP training.
-      args:
-        The return value of get_parser().parse_args()
-    """
-    params = get_params()
-    params.update(vars(args))
-    if params.full_libri is False:
-        params.valid_interval = 1600
-
-    fix_random_seed(params.seed)
-    if world_size > 1:
-        setup_dist(rank, world_size, params.master_port)
-
-    setup_logger(f"{params.exp_dir}/log/log-train")
-    logging.info("Training started")
-    logging.info(params)
-
-    if args.tensorboard and rank == 0:
-        tb_writer = SummaryWriter(log_dir=f"{params.exp_dir}/tensorboard")
-    else:
-        tb_writer = None
-
-    lexicon = Lexicon(params.lang_dir)
-    # remove <star> which will be assembled later in nnet_output
-    max_token_id = max(lexicon.tokens) - 1
-    # add blank
-    num_classes = max_token_id + 1
-
-    device = torch.device("cpu")
-    if torch.cuda.is_available():
-        device = torch.device("cuda", rank)
-
-    #    if "lang_bpe" in str(params.lang_dir):
-    #        graph_compiler = BpeCtcTrainingGraphCompiler(
-    #            params.lang_dir,
-    #            device=device,
-    #            sos_token="<sos/eos>",
-    #            eos_token="<sos/eos>",
-    #        )
-    #    elif "lang_phone" in str(params.lang_dir):
-    #        assert params.att_rate == 0, (
-    #            "Attention decoder training does not support phone lang dirs "
-    #            "at this time due to a missing <sos/eos> symbol. Set --att-rate=0 "
-    #            "for pure CTC training when using a phone-based lang dir."
-    #        )
-    #        assert params.num_decoder_layers == 0, (
-    #            "Attention decoder training does not support phone lang dirs "
-    #            "at this time due to a missing <sos/eos> symbol. "
-    #            "Set --num-decoder-layers=0 for pure CTC training when using "
-    #            "a phone-based lang dir."
-    #        )
-    #        graph_compiler = CtcTrainingGraphCompiler(
-    #            lexicon,
-    #            device=device,
-    #        )
-    #        # Manually add the sos/eos ID with their default values
-    #        # from the BPE recipe which we're adapting here.
-    #        graph_compiler.sos_id = 1
-    #        graph_compiler.eos_id = 1
-    #    else:
-    #        raise ValueError(
-    #            f"Unsupported type of lang dir (we expected it to have "
-    #            f"'lang_bpe' or 'lang_phone' in its name): {params.lang_dir}"
-    #        )
-    graph_compiler = OtcTrainingGraphCompiler(
-        lang_dir=params.lang_dir,
-        otc_token = "▁<star>",
-        device=device,
-    )
-
-    logging.info("About to create model")
-    model = Conformer(
-        num_features=params.feature_dim,
-        nhead=params.nhead,
-        d_model=params.encoder_dim,
-        num_classes=num_classes,
-        subsampling_factor=params.subsampling_factor,
-        num_encoder_layers=params.num_encoder_layers,
-        num_decoder_layers=params.num_decoder_layers,
-    )
-
-    print(model)
-
-    num_param = sum([p.numel() for p in model.parameters()])
-    logging.info(f"Number of model parameters: {num_param}")
-
-    assert params.save_every_n >= params.average_period
-    model_avg: Optional[nn.Module] = None
-    if rank == 0:
-        # model_avg is only used with rank 0
-        model_avg = copy.deepcopy(model)
-
-    assert params.start_epoch > 0, params.start_epoch
-    checkpoints = load_checkpoint_if_available(
-        params=params, model=model, model_avg=model_avg
-    )
-
-    model.to(device)
-    if world_size > 1:
-        logging.info("Using DDP")
-        model = DDP(model, device_ids=[rank])
-
-    optimizer = Eve(model.parameters(), lr=params.initial_lr)
-
-    scheduler = Eden(optimizer, params.lr_batches, params.lr_epochs)
-
-    if checkpoints and "optimizer" in checkpoints:
-        logging.info("Loading optimizer state dict")
-        optimizer.load_state_dict(checkpoints["optimizer"])
-
-    if (
-        checkpoints
-        and "scheduler" in checkpoints
-        and checkpoints["scheduler"] is not None
-    ):
-        logging.info("Loading scheduler state dict")
-        scheduler.load_state_dict(checkpoints["scheduler"])
-
-    if params.print_diagnostics:
-        diagnostic = diagnostics.attach_diagnostics(model)
-
-    librispeech = LibriSpeechAsrDataModule(args)
-
-    if params.full_libri:
-        train_cuts = librispeech.train_all_shuf_cuts()
-    else:
-        train_cuts = librispeech.train_clean_100_cuts()
-
-    def remove_short_and_long_utt(c: Cut):
-        # Keep only utterances with duration between 1 second and 20 seconds
-        #
-        # Caution: There is a reason to select 20.0 here. Please see
-        # ../local/display_manifest_statistics.py
-        #
-        # You should use ../local/display_manifest_statistics.py to get
-        # an utterance duration distribution for your dataset to select
-        # the threshold
-        return 1.0 <= c.duration <= 20.0
-
-    def remove_invalid_utt_ctc(c: Cut):
-        # Caution: We assume the subsampling factor is 4!
-        # num_tokens = len(sp.encode(c.supervisions[0].text, out_type=int))
-        num_tokens = len(graph_compiler.texts_to_ids(c.supervisions[0].text))
-        min_output_input_ratio = 0.0005
-        max_output_input_ratio = 0.1
-        return (
-            min_output_input_ratio
-            < num_tokens / float(c.features.num_frames)
-            < max_output_input_ratio
-        )
-
-    train_cuts = train_cuts.filter(remove_short_and_long_utt)
-    #    train_cuts = train_cuts.filter(remove_invalid_utt_ctc)
-
-    if params.start_batch > 0 and checkpoints and "sampler" in checkpoints:
-        # We only load the sampler's state dict when it loads a checkpoint
-        # saved in the middle of an epoch
-        sampler_state_dict = checkpoints["sampler"]
-    else:
-        sampler_state_dict = None
-
-    train_dl = librispeech.train_dataloaders(
-        train_cuts, sampler_state_dict=sampler_state_dict
-    )
-
-    valid_cuts = librispeech.dev_clean_cuts()
-    valid_cuts += librispeech.dev_other_cuts()
-    valid_dl = librispeech.valid_dataloaders(valid_cuts)
-
-    if params.print_diagnostics:
-        scan_pessimistic_batches_for_oom(
-            model=model,
-            train_dl=train_dl,
-            optimizer=optimizer,
-            graph_compiler=graph_compiler,
-            params=params,
-        )
-
-    scaler = GradScaler(enabled=params.use_fp16)
-    if checkpoints and "grad_scaler" in checkpoints:
-        logging.info("Loading grad scaler state dict")
-        scaler.load_state_dict(checkpoints["grad_scaler"])
-
-    bypass_penalty = params.initial_bypass_penalty
-    self_loop_penalty = params.initial_self_loop_penalty
-    for epoch in range(params.start_epoch, params.num_epochs + 1):
-        scheduler.step_epoch(epoch - 1)
-        fix_random_seed(params.seed + epoch - 1)
-        train_dl.sampler.set_epoch(epoch - 1)
-
-        if tb_writer is not None:
-            tb_writer.add_scalar("train/epoch", epoch, params.batch_idx_train)
-
-        params.cur_epoch = epoch
-
-        logging.info(
-            f"bypass penalty: {bypass_penalty}, decay: {params.bypass_penalty_decay}"
-        )
-        logging.info(
-            f"self loop penalty: {self_loop_penalty}, decay: {params.self_loop_penalty_decay}"
-        )
-        train_one_epoch(
-            params=params,
-            model=model,
-            model_avg=model_avg,
-            optimizer=optimizer,
-            graph_compiler=graph_compiler,
-            scheduler=scheduler,
-            train_dl=train_dl,
-            valid_dl=valid_dl,
-            scaler=scaler,
-            tb_writer=tb_writer,
-            world_size=world_size,
-            rank=rank,
-            bypass_penalty=bypass_penalty,
-            self_loop_penalty=self_loop_penalty,
-        )
-        bypass_penalty *= params.bypass_penalty_decay
-        self_loop_penalty *= params.self_loop_penalty_decay
-        # params.penalty_decay =  params.penalty_decay ** 0.5
-
-        if params.print_diagnostics:
-            diagnostic.print_diagnostics()
-            break
-
-        save_checkpoint(
-            params=params,
-            model=model,
-            model_avg=model_avg,
-            optimizer=optimizer,
-            scheduler=scheduler,
-            sampler=train_dl.sampler,
-            scaler=scaler,
-            rank=rank,
-        )
-
-    logging.info("Done!")
-
-    if world_size > 1:
-        torch.distributed.barrier()
-        cleanup_dist()
-
-
-def scan_pessimistic_batches_for_oom(
-    model: Union[nn.Module, DDP],
-    train_dl: torch.utils.data.DataLoader,
-    optimizer: torch.optim.Optimizer,
-    graph_compiler: OtcTrainingGraphCompiler,
-    params: AttributeDict,
-):
-    from lhotse.dataset import find_pessimistic_batches
-
-    logging.info(
-        "Sanity check -- see if any of the batches in epoch 1 would cause OOM."
-    )
-    batches, crit_values = find_pessimistic_batches(train_dl.sampler)
-    for criterion, cuts in batches.items():
-        batch = train_dl.dataset[cuts]
-        try:
-            # warmup = 0.0 is so that the derivs for the pruned loss stay zero
-            # (i.e. are not remembered by the decaying-average in adam), because
-            # we want to avoid these params being subject to shrinkage in adam.
-            with torch.cuda.amp.autocast(enabled=params.use_fp16):
-                loss, _ = compute_loss(
-                    params=params,
-                    model=model,
-                    batch=batch,
-                    graph_compiler=graph_compiler,
-                    is_training=True,
-                    warmup=0.0,
-                )
-            loss.backward()
-            optimizer.step()
-            optimizer.zero_grad()
-        except RuntimeError as e:
-            if "CUDA out of memory" in str(e):
-                logging.error(
-                    "Your GPU ran out of memory with the current "
-                    "max_duration setting. We recommend decreasing "
-                    "max_duration and trying again.\n"
-                    f"Failing criterion: {criterion} "
-                    f"(={crit_values[criterion]}) ..."
-                )
-            raise
-
-
-def main():
-    parser = get_parser()
-    LibriSpeechAsrDataModule.add_arguments(parser)
-    args = parser.parse_args()
-    args.exp_dir = Path(args.exp_dir)
-
-    world_size = args.world_size
-    assert world_size >= 1
-    if world_size > 1:
-        mp.spawn(run, args=(world_size, args), nprocs=world_size, join=True)
-    else:
-        run(rank=0, world_size=1, args=args)
-
-
-torch.set_num_threads(1)
-torch.set_num_interop_threads(1)
-
-if __name__ == "__main__":
-    main()