remove conformer ctc; minor fixes in RNN-T

2025-12-11 06:55:27 +00:00 · 2022-05-12 22:02:08 -04:00 · 2022-05-12 22:02:08 -04:00 · f62f8fba20
commit f62f8fba20
parent 4e1205a644
16 changed files with 432 additions and 5132 deletions
--- a/egs/spgispeech/ASR/conformer_ctc/README.md
+++ b/egs/spgispeech/ASR/conformer_ctc/README.md
@ -1,75 +0,0 @@
-## Introduction
-
-Please visit
-<https://icefall.readthedocs.io/en/latest/recipes/librispeech/conformer_ctc.html>
-for how to run this recipe.
-
-## How to compute framewise alignment information
-
-### Step 1: Train a model
-
-Please use `conformer_ctc/train.py` to train a model.
-See <https://icefall.readthedocs.io/en/latest/recipes/librispeech/conformer_ctc.html>
-for how to do it.
-
-### Step 2: Compute framewise alignment
-
-Run
-
-```
-# Choose a checkpoint and determine the number of checkpoints to average
-epoch=30
-avg=15
-./conformer_ctc/ali.py \
-  --epoch $epoch \
-  --avg $avg \
-  --max-duration 500 \
-  --bucketing-sampler 0 \
-  --full-libri 1 \
-  --exp-dir conformer_ctc/exp \
-  --lang-dir data/lang_bpe_500 \
-  --ali-dir data/ali_500
-```
-and you will get four files inside the folder `data/ali_500`:
-
-```
-$ ls -lh data/ali_500
-total 546M
-rw-r--r-- 1 kuangfangjun root 1.1M Sep 28 08:06 test_clean.pt
-rw-r--r-- 1 kuangfangjun root 1.1M Sep 28 08:07 test_other.pt
-rw-r--r-- 1 kuangfangjun root 542M Sep 28 11:36 train-960.pt
-rw-r--r-- 1 kuangfangjun root 2.1M Sep 28 11:38 valid.pt
-```
-
-**Note**: It can take more than 3 hours to compute the alignment
-for the training dataset, which contains 960 * 3 = 2880 hours of data.
-
-**Caution**: The model parameters in `conformer_ctc/ali.py` have to match those
-in `conformer_ctc/train.py`.
-
-**Caution**: You have to set the parameter `preserve_id` to `True` for `CutMix`.
-Search `./conformer_ctc/asr_datamodule.py` for `preserve_id`.
-
-### Step 3: Check your extracted alignments
-
-There is a file `test_ali.py` in `icefall/test` that can be used to test your
-alignments. It uses pre-computed alignments to modify a randomly generated
-`nnet_output` and it checks that we can decode the correct transcripts
-from the resulting `nnet_output`.
-
-You should get something like the following if you run that script:
-
-```
-$ ./test/test_ali.py
-['THE GOOD NATURED AUDIENCE IN PITY TO FALLEN MAJESTY SHOWED FOR ONCE GREATER DEFERENCE TO THE KING THAN TO THE MINISTER AND SUNG THE PSALM WHICH THE FORMER HAD CALLED FOR', 'THE OLD SERVANT TOLD HIM QUIETLY AS THEY CREPT BACK TO DWELL THAT THIS PASSAGE THAT LED FROM THE HUT IN THE PLEASANCE TO SHERWOOD AND THAT GEOFFREY FOR THE TIME WAS HIDING WITH THE OUTLAWS IN THE FOREST', 'FOR A WHILE SHE LAY IN HER CHAIR IN HAPPY DREAMY PLEASURE AT SUN AND BIRD AND TREE', "BUT THE ESSENCE OF LUTHER'S LECTURES IS THERE"]
-['THE GOOD NATURED AUDIENCE IN PITY TO FALLEN MAJESTY SHOWED FOR ONCE GREATER DEFERENCE TO THE KING THAN TO THE MINISTER AND SUNG THE PSALM WHICH THE FORMER HAD CALLED FOR', 'THE OLD SERVANT TOLD HIM QUIETLY AS THEY CREPT BACK TO GAMEWELL THAT THIS PASSAGE WAY LED FROM THE HUT IN THE PLEASANCE TO SHERWOOD AND THAT GEOFFREY FOR THE TIME WAS HIDING WITH THE OUTLAWS IN THE FOREST', 'FOR A WHILE SHE LAY IN HER CHAIR IN HAPPY DREAMY PLEASURE AT SUN AND BIRD AND TREE', "BUT THE ESSENCE OF LUTHER'S LECTURES IS THERE"]
-```
-
-### Step 4: Use your alignments in training
-
-Please refer to `conformer_mmi/train.py` for usage. Some useful
-functions are:
-
- `load_alignments()`, it loads alignment saved by `conformer_ctc/ali.py`
- `convert_alignments_to_tensor()`, it converts alignments to PyTorch tensors
- `lookup_alignments()`, it returns the alignments of utterances by giving the cut ID of the utterances.
--- a/egs/spgispeech/ASR/conformer_ctc/init.py
+++ b/egs/spgispeech/ASR/conformer_ctc/init.py
--- a/egs/spgispeech/ASR/conformer_ctc/ali.py
+++ b/egs/spgispeech/ASR/conformer_ctc/ali.py
@ -1,399 +0,0 @@
-#!/usr/bin/env python3
-# Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""
-Usage:
-    ./conformer_ctc/ali.py \
-            --exp-dir ./conformer_ctc/exp \
-            --lang-dir ./data/lang_bpe_500 \
-            --epoch 20 \
-            --avg 10 \
-            --max-duration 300 \
-            --dataset train-clean-100 \
-            --out-dir data/ali
-"""
-
-import argparse
-import logging
-from pathlib import Path
-
-import k2
-import numpy as np
-import torch
-from asr_datamodule import LibriSpeechAsrDataModule
-from conformer import Conformer
-from lhotse import CutSet
-from lhotse.features.io import FeaturesWriter, NumpyHdf5Writer
-
-from icefall.bpe_graph_compiler import BpeCtcTrainingGraphCompiler
-from icefall.checkpoint import average_checkpoints, load_checkpoint
-from icefall.decode import one_best_decoding
-from icefall.env import get_env_info
-from icefall.lexicon import Lexicon
-from icefall.utils import (
-    AttributeDict,
-    encode_supervisions,
-    get_alignments,
-    setup_logger,
-)
-
-
-def get_parser():
-    """Build the command-line parser for the alignment script.
-
-    Returns:
-      An `argparse.ArgumentParser` providing `--epoch`, `--avg`,
-      `--lang-dir`, `--exp-dir`, and the required options `--out-dir`
-      and `--dataset`.
-    """
-    parser = argparse.ArgumentParser(
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter
-    )
-
-    parser.add_argument(
-        "--epoch",
-        type=int,
-        default=34,
-        # Adjacent literals are concatenated verbatim, so the first one
-        # must end with a space to keep the two sentences apart.
-        help="It specifies the checkpoint to use for decoding. "
-        "Note: Epoch counts from 0.",
-    )
-    parser.add_argument(
-        "--avg",
-        type=int,
-        default=20,
-        help="Number of checkpoints to average. Automatically select "
-        "consecutive checkpoints before the checkpoint specified by "
-        "'--epoch'. ",
-    )
-
-    parser.add_argument(
-        "--lang-dir",
-        type=str,
-        default="data/lang_bpe_500",
-        help="The lang dir",
-    )
-
-    parser.add_argument(
-        "--exp-dir",
-        type=str,
-        default="conformer_ctc/exp",
-        help="The experiment dir",
-    )
-
-    parser.add_argument(
-        "--out-dir",
-        type=str,
-        required=True,
-        help="""Output directory.
-        It contains 3 generated files:
-
-        - labels_xxx.h5
-        - aux_labels_xxx.h5
-        - cuts_xxx.json.gz
-
-        where xxx is the value of `--dataset`. For instance, if
-        `--dataset` is `train-clean-100`, it will contain 3 files:
-
-        - `labels_train-clean-100.h5`
-        - `aux_labels_train-clean-100.h5`
-        - `cuts_train-clean-100.json.gz`
-
-        Note: Both labels_xxx.h5 and aux_labels_xxx.h5 contain framewise
-        alignment. The difference is that labels_xxx.h5 contains repeats.
-        """,
-    )
-
-    parser.add_argument(
-        "--dataset",
-        type=str,
-        required=True,
-        help="""The name of the dataset to compute alignments for.
-        Possible values are:
-            - test-clean
-            - test-other
-            - train-clean-100
-            - train-clean-360
-            - train-other-500
-            - dev-clean
-            - dev-other
-        """,
-    )
-    return parser
-
-
-def get_params() -> AttributeDict:
-    """Return the fixed settings used when computing alignments.
-
-    The values are grouped below by purpose and merged, in the same key
-    order, into a single `AttributeDict`.
-    """
-    paths = {
-        "lm_dir": Path("data/lm"),
-    }
-    model_settings = {
-        "feature_dim": 80,
-        "nhead": 8,
-        "attention_dim": 512,
-        "subsampling_factor": 4,
-        # Set it to 0 since attention decoder
-        # is not used for computing alignments
-        "num_decoder_layers": 0,
-        "vgg_frontend": False,
-        "use_feat_batchnorm": True,
-    }
-    search_settings = {
-        "output_beam": 10,
-        "use_double_scores": True,
-    }
-    merged = {
-        **paths,
-        **model_settings,
-        **search_settings,
-        "env_info": get_env_info(),
-    }
-    return AttributeDict(merged)
-
-
-def compute_alignments(
-    model: torch.nn.Module,
-    dl: torch.utils.data.DataLoader,
-    labels_writer: FeaturesWriter,
-    aux_labels_writer: FeaturesWriter,
-    params: AttributeDict,
-    graph_compiler: BpeCtcTrainingGraphCompiler,
-) -> CutSet:
-    """Compute the framewise alignments of a dataset.
-
-    For every batch, the model output is intersected with the graph
-    compiled from the reference transcripts, the best path is extracted,
-    and its `labels` / `aux_labels` alignments are stored through the two
-    writers and attached to the corresponding cuts.
-
-    Args:
-      model:
-        The neural network model.
-      dl:
-        Dataloader containing the dataset.
-      labels_writer:
-        Writer whose `store_array` is used to save the `labels` alignment
-        of each cut, keyed by the cut ID.
-      aux_labels_writer:
-        Writer whose `store_array` is used to save the `aux_labels`
-        alignment of each cut, keyed by the cut ID.
-      params:
-        Parameters for computing alignments. This function reads
-        `subsampling_factor`, `output_beam` and `use_double_scores`.
-      graph_compiler:
-        It converts token IDs to decoding graphs. Its `device` attribute
-        decides where the features are moved to.
-    Returns:
-      Return a CutSet. Each cut has two custom fields: labels_alignment
-      and aux_labels_alignment, containing framewise alignments information.
-      Both are of type `lhotse.array.TemporalArray`. The difference between
-      the two alignments is that `labels_alignment` contain repeats.
-
-    Note:
-      This function does not disable gradient tracking itself.
-    """
-    # Some dataloaders do not define a length; only used for logging.
-    try:
-        num_batches = len(dl)
-    except TypeError:
-        num_batches = "?"
-    num_cuts = 0
-
-    device = graph_compiler.device
-    cuts = []
-    for batch_idx, batch in enumerate(dl):
-        feature = batch["inputs"]
-
-        # at entry, feature is [N, T, C]
-        assert feature.ndim == 3
-        feature = feature.to(device)
-
-        supervisions = batch["supervisions"]
-        cut_list = supervisions["cut"]
-
-        # Each cut must carry exactly one supervision.
-        for cut in cut_list:
-            assert len(cut.supervisions) == 1, f"{len(cut.supervisions)}"
-
-        nnet_output, encoder_memory, memory_mask = model(feature, supervisions)
-        # nnet_output is [N, T, C]
-        supervision_segments, texts = encode_supervisions(
-            supervisions, subsampling_factor=params.subsampling_factor
-        )
-        # we need also to sort cut_ids as encode_supervisions()
-        # reorders "texts".
-        # In general, new2old is an identity map since lhotse sorts the returned
-        # cuts by duration in descending order
-        new2old = supervision_segments[:, 0].tolist()
-
-        cut_list = [cut_list[i] for i in new2old]
-
-        token_ids = graph_compiler.texts_to_ids(texts)
-        decoding_graph = graph_compiler.compile(token_ids)
-
-        dense_fsa_vec = k2.DenseFsaVec(
-            nnet_output,
-            supervision_segments,
-            allow_truncate=params.subsampling_factor - 1,
-        )
-
-        lattice = k2.intersect_dense(
-            decoding_graph,
-            dense_fsa_vec,
-            params.output_beam,
-        )
-
-        best_path = one_best_decoding(
-            lattice=lattice,
-            use_double_scores=params.use_double_scores,
-        )
-
-        labels_ali = get_alignments(best_path, kind="labels")
-        aux_labels_ali = get_alignments(best_path, kind="aux_labels")
-        assert len(labels_ali) == len(aux_labels_ali) == len(cut_list)
-        # Store both alignments and attach the returned handles to the cut.
-        for cut, labels, aux_labels in zip(
-            cut_list, labels_ali, aux_labels_ali
-        ):
-            cut.labels_alignment = labels_writer.store_array(
-                key=cut.id,
-                value=np.asarray(labels, dtype=np.int32),
-                # frame shift is 0.01s, subsampling_factor is 4
-                # NOTE(review): 0.04 is hard-coded and is not derived from
-                # params.subsampling_factor — confirm they stay in sync.
-                frame_shift=0.04,
-                temporal_dim=0,
-                start=0,
-            )
-            cut.aux_labels_alignment = aux_labels_writer.store_array(
-                key=cut.id,
-                value=np.asarray(aux_labels, dtype=np.int32),
-                # frame shift is 0.01s, subsampling_factor is 4
-                frame_shift=0.04,
-                temporal_dim=0,
-                start=0,
-            )
-
-        cuts += cut_list
-
-        num_cuts += len(cut_list)
-
-        # Report progress every 100 batches.
-        if batch_idx % 100 == 0:
-            batch_str = f"{batch_idx}/{num_batches}"
-
-            logging.info(
-                f"batch {batch_str}, cuts processed until now is {num_cuts}"
-            )
-
-    return CutSet.from_cuts(cuts)
-
-
-@torch.no_grad()
-def main():
-    """Compute framewise alignments for one dataset and save them to disk.
-
-    Steps: parse the command line, skip the run if any output file already
-    exists, build the model and load (optionally averaged) checkpoints,
-    create the dataloader for `--dataset`, then write the two alignment
-    files and the cut manifest into `--out-dir`.
-
-    The whole function runs with gradient tracking disabled.
-    """
-    parser = get_parser()
-    # NOTE(review): the asr_datamodule.py of this recipe directory appears to
-    # define only SPGISpeechAsrDataModule (train_cuts/dev_cuts/val_cuts), not
-    # the LibriSpeech class and subsets used below — confirm before running.
-    LibriSpeechAsrDataModule.add_arguments(parser)
-    args = parser.parse_args()
-
-    # Alignments are computed on clean, unmodified cuts, and the cuts
-    # themselves must be returned in each batch.
-    args.enable_spec_aug = False
-    args.enable_musan = False
-    args.return_cuts = True
-    args.concatenate_cuts = False
-
-    params = get_params()
-    params.update(vars(args))
-
-    setup_logger(f"{params.exp_dir}/log-ali")
-
-    logging.info(f"Computing alignments for {params.dataset} - started")
-    logging.info(params)
-
-    out_dir = Path(params.out_dir)
-    # parents=True so that a nested --out-dir does not fail when its
-    # parent directories are missing.
-    out_dir.mkdir(parents=True, exist_ok=True)
-
-    out_labels_ali_filename = out_dir / f"labels_{params.dataset}.h5"
-    out_aux_labels_ali_filename = out_dir / f"aux_labels_{params.dataset}.h5"
-    out_manifest_filename = out_dir / f"cuts_{params.dataset}.json.gz"
-
-    # Do nothing if any of the outputs is already present.
-    for f in (
-        out_labels_ali_filename,
-        out_aux_labels_ali_filename,
-        out_manifest_filename,
-    ):
-        if f.exists():
-            logging.info(f"{f} exists - skipping")
-            return
-
-    lexicon = Lexicon(params.lang_dir)
-    max_token_id = max(lexicon.tokens)
-    num_classes = max_token_id + 1  # +1 for the blank
-
-    device = torch.device("cpu")
-    if torch.cuda.is_available():
-        device = torch.device("cuda", 0)
-    logging.info(f"device: {device}")
-
-    graph_compiler = BpeCtcTrainingGraphCompiler(
-        params.lang_dir,
-        device=device,
-        sos_token="<sos/eos>",
-        eos_token="<sos/eos>",
-    )
-
-    logging.info("About to create model")
-    model = Conformer(
-        num_features=params.feature_dim,
-        nhead=params.nhead,
-        d_model=params.attention_dim,
-        num_classes=num_classes,
-        subsampling_factor=params.subsampling_factor,
-        num_decoder_layers=params.num_decoder_layers,
-        vgg_frontend=params.vgg_frontend,
-        use_feat_batchnorm=params.use_feat_batchnorm,
-    )
-    model.to(device)
-
-    if params.avg == 1:
-        load_checkpoint(
-            f"{params.exp_dir}/epoch-{params.epoch}.pt", model, strict=False
-        )
-    else:
-        start = params.epoch - params.avg + 1
-        # Filter on the epoch index itself: when `--avg` exceeds the number
-        # of available epochs, negative indices are dropped and the
-        # remaining epochs 0..epoch are still averaged.
-        filenames = [
-            f"{params.exp_dir}/epoch-{i}.pt"
-            for i in range(start, params.epoch + 1)
-            if i >= 0
-        ]
-        logging.info(f"averaging {filenames}")
-        model.load_state_dict(
-            average_checkpoints(filenames, device=device), strict=False
-        )
-
-    model.eval()
-
-    librispeech = LibriSpeechAsrDataModule(args)
-    if params.dataset == "test-clean":
-        test_clean_cuts = librispeech.test_clean_cuts()
-        dl = librispeech.test_dataloaders(test_clean_cuts)
-    elif params.dataset == "test-other":
-        test_other_cuts = librispeech.test_other_cuts()
-        dl = librispeech.test_dataloaders(test_other_cuts)
-    elif params.dataset == "train-clean-100":
-        train_clean_100_cuts = librispeech.train_clean_100_cuts()
-        dl = librispeech.train_dataloaders(train_clean_100_cuts)
-    elif params.dataset == "train-clean-360":
-        train_clean_360_cuts = librispeech.train_clean_360_cuts()
-        dl = librispeech.train_dataloaders(train_clean_360_cuts)
-    elif params.dataset == "train-other-500":
-        train_other_500_cuts = librispeech.train_other_500_cuts()
-        dl = librispeech.train_dataloaders(train_other_500_cuts)
-    elif params.dataset == "dev-clean":
-        dev_clean_cuts = librispeech.dev_clean_cuts()
-        dl = librispeech.valid_dataloaders(dev_clean_cuts)
-    else:
-        assert params.dataset == "dev-other", f"{params.dataset}"
-        dev_other_cuts = librispeech.dev_other_cuts()
-        dl = librispeech.valid_dataloaders(dev_other_cuts)
-
-    logging.info(f"Processing {params.dataset}")
-    with NumpyHdf5Writer(out_labels_ali_filename) as labels_writer:
-        with NumpyHdf5Writer(out_aux_labels_ali_filename) as aux_labels_writer:
-            cut_set = compute_alignments(
-                model=model,
-                dl=dl,
-                labels_writer=labels_writer,
-                aux_labels_writer=aux_labels_writer,
-                params=params,
-                graph_compiler=graph_compiler,
-            )
-
-    cut_set.to_file(out_manifest_filename)
-
-    logging.info(
-        f"For dataset {params.dataset}, its alignments with repeats are "
-        f"saved to {out_labels_ali_filename}, the alignments without repeats "
-        f"are saved to {out_aux_labels_ali_filename}, and the cut manifest "
-        f"file is {out_manifest_filename}. Number of cuts: {len(cut_set)}"
-    )
-
-
-# Limit PyTorch to one intra-op thread and one inter-op thread. These calls
-# run at import time, before main() is invoked.
-torch.set_num_threads(1)
-torch.set_num_interop_threads(1)
-
-# Compute alignments only when this file is executed as a script.
-if __name__ == "__main__":
-    main()
--- a/egs/spgispeech/ASR/conformer_ctc/asr_datamodule.py
+++ b/egs/spgispeech/ASR/conformer_ctc/asr_datamodule.py
@ -1,355 +0,0 @@
-# Copyright      2021  Piotr Żelasko
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import argparse
-import logging
-from functools import lru_cache
-from pathlib import Path
-from typing import Optional, Dict, Any
-
-from tqdm import tqdm
-
-from lhotse import CutSet, Fbank, FbankConfig, load_manifest, load_manifest_lazy
-from lhotse.dataset import (
-    CutMix,
-    CutConcatenate,
-    DynamicBucketingSampler,
-    K2SpeechRecognitionDataset,
-    PrecomputedFeatures,
-    SpecAugment,
-)
-from lhotse.dataset.input_strategies import OnTheFlyFeatures
-from lhotse.utils import fix_random_seed
-from torch.utils.data import DataLoader
-import torch
-
-from icefall.utils import str2bool
-
-
-class _SeedWorkers:
-    """Callable that seeds a dataloader worker from a shared base seed.
-
-    Calling an instance with a worker id fixes the random seed to
-    `seed + worker_id`, so each worker gets a distinct seed.
-    """
-
-    def __init__(self, seed: int):
-        # Base seed; the worker id is added to it on every call.
-        self.seed = seed
-
-    def __call__(self, worker_id: int):
-        worker_seed = self.seed + worker_id
-        fix_random_seed(worker_seed)
-
-
-class SPGISpeechAsrDataModule:
-    """
-    DataModule for k2 ASR experiments.
-    It assumes there is always one train and valid dataloader,
-    but there can be multiple test dataloaders (e.g. LibriSpeech test-clean
-    and test-other).
-    It contains all the common data pipeline modules used in ASR
-    experiments, e.g.:
-    - dynamic batch size,
-    - bucketing samplers,
-    - cut concatenation,
-    - augmentation,
-    - on-the-fly feature extraction
-    This class should be derived for specific corpora used in ASR tasks.
-    """
-
-    def __init__(self, args: argparse.Namespace):
-        # Parsed options; expected to contain everything registered by
-        # `add_arguments`.
-        self.args = args
-
-    @classmethod
-    def add_arguments(cls, parser: argparse.ArgumentParser):
-        """Register the data-related command-line options on `parser`.
-
-        Args:
-          parser:
-            The parser to extend. A new argument group titled
-            "ASR data related options" is added to it.
-        """
-        group = parser.add_argument_group(
-            title="ASR data related options",
-            description="These options are used for the preparation of "
-            "PyTorch DataLoaders from Lhotse CutSet's -- they control the "
-            "effective batch sizes, sampling strategies, applied data "
-            "augmentations, etc.",
-        )
-        group.add_argument(
-            "--manifest-dir",
-            type=Path,
-            default=Path("data/manifests"),
-            help="Path to directory with train/valid/test cuts.",
-        )
-        group.add_argument(
-            "--enable-musan",
-            type=str2bool,
-            default=True,
-            help="When enabled, select noise from MUSAN and mix it "
-            "with training dataset. ",
-        )
-        group.add_argument(
-            "--concatenate-cuts",
-            type=str2bool,
-            default=False,
-            help="When enabled, utterances (cuts) will be concatenated "
-            "to minimize the amount of padding.",
-        )
-        group.add_argument(
-            "--duration-factor",
-            type=float,
-            default=1.0,
-            help="Determines the maximum duration of a concatenated cut "
-            "relative to the duration of the longest cut in a batch.",
-        )
-        group.add_argument(
-            "--gap",
-            type=float,
-            default=1.0,
-            help="The amount of padding (in seconds) inserted between "
-            "concatenated cuts. This padding is filled with noise when "
-            "noise augmentation is used.",
-        )
-        group.add_argument(
-            "--max-duration",
-            # The default is a float number of seconds, so parse user
-            # values as float too instead of mixing int and float.
-            type=float,
-            default=100.0,
-            help="Maximum pooled recordings duration (seconds) in a "
-            "single batch. You can reduce it if it causes CUDA OOM.",
-        )
-        group.add_argument(
-            "--num-buckets",
-            type=int,
-            default=30,
-            help="The number of buckets for the BucketingSampler "
-            "(you might want to increase it for larger datasets).",
-        )
-        group.add_argument(
-            "--on-the-fly-feats",
-            type=str2bool,
-            default=False,
-            help="When enabled, use on-the-fly cut mixing and feature "
-            "extraction. Will drop existing precomputed feature manifests "
-            "if available.",
-        )
-        group.add_argument(
-            "--shuffle",
-            type=str2bool,
-            default=True,
-            help="When enabled (=default), the examples will be "
-            "shuffled for each epoch.",
-        )
-
-        group.add_argument(
-            "--num-workers",
-            type=int,
-            default=8,
-            help="The number of training dataloader workers that "
-            "collect the batches.",
-        )
-        group.add_argument(
-            "--enable-spec-aug",
-            type=str2bool,
-            default=True,
-            help="When enabled, use SpecAugment for training dataset.",
-        )
-        group.add_argument(
-            "--spec-aug-time-warp-factor",
-            type=int,
-            default=80,
-            help="Used only when --enable-spec-aug is True. "
-            "It specifies the factor for time warping in SpecAugment. "
-            "Larger values mean more warping. "
-            "A value less than 1 means to disable time warp.",
-        )
-
-    def train_dataloaders(
-        self,
-        cuts_train: CutSet,
-        sampler_state_dict: Optional[Dict[str, Any]] = None,
-    ) -> DataLoader:
-        """Create the training dataloader.
-
-        Args:
-          cuts_train:
-            CutSet for training.
-          sampler_state_dict:
-            The state dict for the training sampler.
-        Returns:
-          A DataLoader driven by a DynamicBucketingSampler over
-          `cuts_train`.
-        """
-        transforms = []
-        if self.args.enable_musan:
-            logging.info("Enable MUSAN")
-            # Load the noise manifest only when it is actually used, so that
-            # disabling MUSAN does not require the manifest to exist.
-            logging.info("About to get Musan cuts")
-            cuts_musan = load_manifest(
-                self.args.manifest_dir / "cuts_musan.jsonl.gz"
-            )
-            transforms.append(
-                CutMix(cuts=cuts_musan, prob=0.5, snr=(10, 20), preserve_id=True)
-            )
-        else:
-            logging.info("Disable MUSAN")
-
-        if self.args.concatenate_cuts:
-            logging.info(
-                f"Using cut concatenation with duration factor "
-                f"{self.args.duration_factor} and gap {self.args.gap}."
-            )
-            # Cut concatenation should be the first transform in the list,
-            # so that if we e.g. mix noise in, it will fill the gaps between
-            # different utterances.
-            transforms = [
-                CutConcatenate(
-                    duration_factor=self.args.duration_factor, gap=self.args.gap
-                )
-            ] + transforms
-
-        input_transforms = []
-        if self.args.enable_spec_aug:
-            logging.info("Enable SpecAugment")
-            logging.info(f"Time warp factor: {self.args.spec_aug_time_warp_factor}")
-            input_transforms.append(
-                SpecAugment(
-                    time_warp_factor=self.args.spec_aug_time_warp_factor,
-                    num_frame_masks=2,
-                    features_mask_size=27,
-                    num_feature_masks=2,
-                    frames_mask_size=100,
-                )
-            )
-        else:
-            logging.info("Disable SpecAugment")
-
-        logging.info("About to create train dataset")
-        if self.args.on_the_fly_feats:
-            train = K2SpeechRecognitionDataset(
-                cut_transforms=transforms,
-                input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80))),
-                input_transforms=input_transforms,
-            )
-        else:
-            train = K2SpeechRecognitionDataset(
-                cut_transforms=transforms,
-                input_transforms=input_transforms,
-            )
-
-        logging.info("Using DynamicBucketingSampler.")
-        train_sampler = DynamicBucketingSampler(
-            cuts_train,
-            max_duration=self.args.max_duration,
-            # Honor --shuffle (default True) instead of always shuffling.
-            shuffle=self.args.shuffle,
-            num_buckets=self.args.num_buckets,
-            drop_last=True,
-        )
-        logging.info("About to create train dataloader")
-
-        if sampler_state_dict is not None:
-            logging.info("Loading sampler state dict")
-            train_sampler.load_state_dict(sampler_state_dict)
-
-        # 'seed' is derived from the current random state, which will have
-        # previously been set in the main process.
-        seed = torch.randint(0, 100000, ()).item()
-        worker_init_fn = _SeedWorkers(seed)
-
-        train_dl = DataLoader(
-            train,
-            sampler=train_sampler,
-            batch_size=None,
-            num_workers=self.args.num_workers,
-            persistent_workers=False,
-            worker_init_fn=worker_init_fn,
-        )
-
-        return train_dl
-
-    def valid_dataloaders(self, cuts_valid: CutSet) -> DataLoader:
-        """Create the validation dataloader.
-
-        Args:
-          cuts_valid:
-            CutSet for validation.
-        Returns:
-          A DataLoader over `cuts_valid` without shuffling, using 2 workers.
-        """
-
-        transforms = []
-        if self.args.concatenate_cuts:
-            transforms = [
-                CutConcatenate(
-                    duration_factor=self.args.duration_factor, gap=self.args.gap
-                )
-            ] + transforms
-
-        logging.info("About to create dev dataset")
-        if self.args.on_the_fly_feats:
-            validate = K2SpeechRecognitionDataset(
-                cut_transforms=transforms,
-                input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80))),
-            )
-        else:
-            validate = K2SpeechRecognitionDataset(
-                cut_transforms=transforms,
-            )
-        valid_sampler = DynamicBucketingSampler(
-            cuts_valid,
-            max_duration=self.args.max_duration,
-            shuffle=False,
-        )
-        logging.info("About to create dev dataloader")
-        valid_dl = DataLoader(
-            validate,
-            sampler=valid_sampler,
-            batch_size=None,
-            num_workers=2,
-            persistent_workers=False,
-        )
-
-        return valid_dl
-
-    def test_dataloaders(self, cuts: CutSet) -> DataLoader:
-        """Create a test dataloader.
-
-        Args:
-          cuts:
-            CutSet to evaluate on.
-        Returns:
-          A DataLoader over `cuts` without shuffling. Features are computed
-          on the fly when `--on-the-fly-feats` is set; otherwise precomputed
-          features are used.
-        """
-        logging.debug("About to create test dataset")
-        test = K2SpeechRecognitionDataset(
-            input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80)))
-            if self.args.on_the_fly_feats
-            else PrecomputedFeatures(),
-        )
-        sampler = DynamicBucketingSampler(
-            cuts, max_duration=self.args.max_duration, shuffle=False
-        )
-        logging.debug("About to create test dataloader")
-        test_dl = DataLoader(
-            test,
-            batch_size=None,
-            sampler=sampler,
-            num_workers=self.args.num_workers,
-        )
-        return test_dl
-
-    @lru_cache()
-    def train_cuts(self) -> CutSet:
-        """Lazily load `cuts_train.jsonl.gz` from the manifest directory."""
-        logging.info("About to get SPGISpeech train cuts")
-        return load_manifest_lazy(self.args.manifest_dir / "cuts_train.jsonl.gz")
-
-    @lru_cache()
-    def dev_cuts(self) -> CutSet:
-        """Lazily load `cuts_dev.jsonl.gz` from the manifest directory."""
-        logging.info("About to get SPGISpeech dev cuts")
-        return load_manifest_lazy(self.args.manifest_dir / "cuts_dev.jsonl.gz")
-
-    @lru_cache()
-    def val_cuts(self) -> CutSet:
-        """Lazily load `cuts_val.jsonl.gz` from the manifest directory."""
-        logging.info("About to get SPGISpeech val cuts")
-        return load_manifest_lazy(self.args.manifest_dir / "cuts_val.jsonl.gz")
-
-
-def test():
-    """Smoke-test the data module by pulling batches from both dataloaders.
-
-    Builds the train dataloader and then the dev dataloader, and iterates
-    each one until batch index 100 is reached (or the loader is exhausted).
-    """
-    arg_parser = argparse.ArgumentParser()
-    SPGISpeechAsrDataModule.add_arguments(arg_parser)
-    data_module = SPGISpeechAsrDataModule(arg_parser.parse_args())
-
-    # (cut loader, dataloader factory) pairs, processed in this order.
-    stages = (
-        (data_module.train_cuts, data_module.train_dataloaders),
-        (data_module.dev_cuts, data_module.valid_dataloaders),
-    )
-    for load_cuts, make_dataloader in stages:
-        loader = make_dataloader(load_cuts())
-        for step, _ in tqdm(enumerate(loader)):
-            if step == 100:
-                break
-
-# Run the dataloader smoke test only when this file is executed as a script.
-if __name__ == "__main__":
-    test()
--- a/egs/spgispeech/ASR/conformer_ctc/conformer.py
+++ b/egs/spgispeech/ASR/conformer_ctc/conformer.py
@ -1,930 +0,0 @@
-#!/usr/bin/env python3
-# Copyright (c)  2021  University of Chinese Academy of Sciences (author: Han Zhu)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import math
-import warnings
-from typing import Optional, Tuple, Union
-
-import torch
-from torch import Tensor, nn
-from transformer import Supervisions, Transformer, encoder_padding_mask
-
-
-class Conformer(Transformer):
-    """
-    A ``Transformer`` subclass whose ``encoder_pos`` and ``encoder`` attributes
-    are set to a relative positional encoding and a stack of
-    ``ConformerEncoderLayer`` modules.
-
-    Args:
-        num_features (int): Number of input features
-        num_classes (int): Number of output classes
-        subsampling_factor (int): subsampling factor of encoder (the convolution layers before transformers)
-        d_model (int): attention dimension
-        nhead (int): number of heads
-        dim_feedforward (int): feedforward dimension
-        num_encoder_layers (int): number of encoder layers
-        num_decoder_layers (int): number of decoder layers
-        dropout (float): dropout rate
-        cnn_module_kernel (int): Kernel size of convolution module
-        normalize_before (bool): whether to use layer_norm before the first block.
-        vgg_frontend (bool): whether to use vgg frontend.
-        use_feat_batchnorm (float or bool): forwarded unchanged to ``Transformer``.
-            When it is a float, the encoder layers build their convolution
-            module with ``use_batchnorm=False``; otherwise with
-            ``use_batchnorm=True``.
-    """
-
-    def __init__(
-        self,
-        num_features: int,
-        num_classes: int,
-        subsampling_factor: int = 4,
-        d_model: int = 256,
-        nhead: int = 4,
-        dim_feedforward: int = 2048,
-        num_encoder_layers: int = 12,
-        num_decoder_layers: int = 6,
-        dropout: float = 0.1,
-        cnn_module_kernel: int = 31,
-        normalize_before: bool = True,
-        vgg_frontend: bool = False,
-        use_feat_batchnorm: Union[float, bool] = 0.1,
-    ) -> None:
-        super(Conformer, self).__init__(
-            num_features=num_features,
-            num_classes=num_classes,
-            subsampling_factor=subsampling_factor,
-            d_model=d_model,
-            nhead=nhead,
-            dim_feedforward=dim_feedforward,
-            num_encoder_layers=num_encoder_layers,
-            num_decoder_layers=num_decoder_layers,
-            dropout=dropout,
-            normalize_before=normalize_before,
-            vgg_frontend=vgg_frontend,
-            use_feat_batchnorm=use_feat_batchnorm,
-        )
-
-        # Set after the parent constructor has run, so these assignments win
-        # over any attributes of the same name created by ``Transformer``.
-        self.encoder_pos = RelPositionalEncoding(d_model, dropout)
-
-        # A float ``use_feat_batchnorm`` (the default, 0.1) turns batchnorm
-        # off in the convolution modules; a bool leaves it on.
-        use_conv_batchnorm = True
-        if isinstance(use_feat_batchnorm, float):
-            use_conv_batchnorm = False
-        encoder_layer = ConformerEncoderLayer(
-            d_model,
-            nhead,
-            dim_feedforward,
-            dropout,
-            cnn_module_kernel,
-            normalize_before,
-            use_conv_batchnorm,
-        )
-        self.encoder = ConformerEncoder(encoder_layer, num_encoder_layers)
-        self.normalize_before = normalize_before
-        if self.normalize_before:
-            self.after_norm = nn.LayerNorm(d_model)
-        else:
-            # Note: TorchScript detects that self.after_norm could be used inside forward()
-            #       and throws an error without this change.
-            # NOTE(review): ``identity`` is not defined in the visible part of
-            # this file; presumably a module-level helper -- confirm.
-            self.after_norm = identity
-
-    def run_encoder(
-        self, x: Tensor, supervisions: Optional[Supervisions] = None
-    ) -> Tuple[Tensor, Optional[Tensor]]:
-        """
-        Run the embedding, positional encoding and Conformer encoder stack.
-
-        Args:
-          x:
-            The model input. Its shape is (N, T, C).
-          supervisions:
-            Supervision in lhotse format.
-            See https://github.com/lhotse-speech/lhotse/blob/master/lhotse/dataset/speech_recognition.py#L32  # noqa
-            CAUTION: It contains length information, i.e., start and number of
-            frames, before subsampling
-            It is read directly from the batch, without any sorting. It is used
-            to compute encoder padding mask, which is used as memory key padding
-            mask for the decoder.
-
-        Returns:
-            Tensor: Predictor tensor of dimension (input_length, batch_size, d_model).
-            Tensor: Mask tensor of dimension (batch_size, input_length), or
-              None when ``encoder_padding_mask`` returns None.
-        """
-        x = self.encoder_embed(x)
-        x, pos_emb = self.encoder_pos(x)
-        x = x.permute(1, 0, 2)  # (B, T, F) -> (T, B, F)
-        # x.size(0) is the time dimension after the permute above.
-        mask = encoder_padding_mask(x.size(0), supervisions)
-        if mask is not None:
-            mask = mask.to(x.device)
-        x = self.encoder(x, pos_emb, src_key_padding_mask=mask)  # (T, B, F)
-
-        # The final LayerNorm is only applied in the pre-norm configuration.
-        if self.normalize_before:
-            x = self.after_norm(x)
-
-        return x, mask
-
-
-class ConformerEncoderLayer(nn.Module):
-    """
-    ConformerEncoderLayer is made up of self-attn, feedforward and convolution networks.
-    See: "Conformer: Convolution-augmented Transformer for Speech Recognition"
-
-    The sub-modules are applied in this order, each with a residual
-    connection: macaron feed-forward, self-attention, convolution,
-    feed-forward.
-
-    Args:
-        d_model: the number of expected features in the input (required).
-        nhead: the number of heads in the multiheadattention models (required).
-        dim_feedforward: the dimension of the feedforward network model (default=2048).
-        dropout: the dropout value (default=0.1).
-        cnn_module_kernel (int): Kernel size of convolution module.
-        normalize_before: if True, each LayerNorm is applied before its
-            sub-module and a final LayerNorm is applied to the layer output;
-            if False, each LayerNorm is applied after the residual sum.
-        use_conv_batchnorm: forwarded to ``ConvolutionModule`` as
-            ``use_batchnorm``.
-
-    Examples::
-        >>> encoder_layer = ConformerEncoderLayer(d_model=512, nhead=8)
-        >>> src = torch.rand(10, 32, 512)
-        >>> pos_emb = torch.rand(32, 19, 512)
-        >>> out = encoder_layer(src, pos_emb)
-    """
-
-    def __init__(
-        self,
-        d_model: int,
-        nhead: int,
-        dim_feedforward: int = 2048,
-        dropout: float = 0.1,
-        cnn_module_kernel: int = 31,
-        normalize_before: bool = True,
-        use_conv_batchnorm: bool = False,
-    ) -> None:
-        super(ConformerEncoderLayer, self).__init__()
-        # Dropout on the attention weights is fixed at 0.0; ``dropout`` is
-        # only used on the residual branches and inside the feed-forward nets.
-        self.self_attn = RelPositionMultiheadAttention(
-            d_model, nhead, dropout=0.0
-        )
-
-        self.feed_forward = nn.Sequential(
-            nn.Linear(d_model, dim_feedforward),
-            Swish(),
-            nn.Dropout(dropout),
-            nn.Linear(dim_feedforward, d_model),
-        )
-
-        self.feed_forward_macaron = nn.Sequential(
-            nn.Linear(d_model, dim_feedforward),
-            Swish(),
-            nn.Dropout(dropout),
-            nn.Linear(dim_feedforward, d_model),
-        )
-
-        self.conv_module = ConvolutionModule(
-            d_model, cnn_module_kernel, use_batchnorm=use_conv_batchnorm
-        )
-
-        self.norm_ff_macaron = nn.LayerNorm(
-            d_model
-        )  # for the macaron style FNN module
-        self.norm_ff = nn.LayerNorm(d_model)  # for the FNN module
-        self.norm_mha = nn.LayerNorm(d_model)  # for the MHA module
-
-        # Both feed-forward branches are scaled by this factor before being
-        # added to the residual.
-        self.ff_scale = 0.5
-
-        self.norm_conv = nn.LayerNorm(d_model)  # for the CNN module
-        self.norm_final = nn.LayerNorm(
-            d_model
-        )  # for the final output of the block
-
-        self.dropout = nn.Dropout(dropout)
-
-        self.normalize_before = normalize_before
-
-    def forward(
-        self,
-        src: Tensor,
-        pos_emb: Tensor,
-        src_mask: Optional[Tensor] = None,
-        src_key_padding_mask: Optional[Tensor] = None,
-    ) -> Tensor:
-        """
-        Pass the input through the encoder layer.
-
-        Args:
-            src: the sequence to the encoder layer (required).
-            pos_emb: Positional embedding tensor (required).
-            src_mask: the mask for the src sequence (optional).
-            src_key_padding_mask: the mask for the src keys per batch (optional).
-
-        Shape:
-            src: (S, N, E).
-            pos_emb: (N, 2*S-1, E) or (1, 2*S-1, E)
-            src_mask: (S, S).
-            src_key_padding_mask: (N, S).
-            S is the source sequence length, N is the batch size, E is the feature number
-
-        Returns:
-            The layer output, produced by a chain of residual sums with ``src``.
-        """
-
-        # macaron style feed forward module
-        residual = src
-        if self.normalize_before:
-            src = self.norm_ff_macaron(src)
-        src = residual + self.ff_scale * self.dropout(
-            self.feed_forward_macaron(src)
-        )
-        if not self.normalize_before:
-            src = self.norm_ff_macaron(src)
-
-        # multi-headed self-attention module
-        residual = src
-        if self.normalize_before:
-            src = self.norm_mha(src)
-        # [0] keeps the attention output and drops the attention weights.
-        src_att = self.self_attn(
-            src,
-            src,
-            src,
-            pos_emb=pos_emb,
-            attn_mask=src_mask,
-            key_padding_mask=src_key_padding_mask,
-        )[0]
-        src = residual + self.dropout(src_att)
-        if not self.normalize_before:
-            src = self.norm_mha(src)
-
-        # convolution module
-        residual = src
-        if self.normalize_before:
-            src = self.norm_conv(src)
-        src = residual + self.dropout(self.conv_module(src))
-        if not self.normalize_before:
-            src = self.norm_conv(src)
-
-        # feed forward module
-        residual = src
-        if self.normalize_before:
-            src = self.norm_ff(src)
-        src = residual + self.ff_scale * self.dropout(self.feed_forward(src))
-        if not self.normalize_before:
-            src = self.norm_ff(src)
-
-        # The final norm is only used in the pre-norm configuration; in the
-        # post-norm configuration ``norm_final`` is never applied.
-        if self.normalize_before:
-            src = self.norm_final(src)
-
-        return src
-
-
-class ConformerEncoder(nn.TransformerEncoder):
-    r"""ConformerEncoder is a stack of N encoder layers
-
-    It reuses ``nn.TransformerEncoder``'s constructor and overrides
-    ``forward`` so that ``pos_emb`` is passed to every layer.
-
-    Args:
-        encoder_layer: an instance of the ConformerEncoderLayer() class (required).
-        num_layers: the number of sub-encoder-layers in the encoder (required).
-        norm: the layer normalization component (optional).
-
-    Examples::
-        >>> encoder_layer = ConformerEncoderLayer(d_model=512, nhead=8)
-        >>> conformer_encoder = ConformerEncoder(encoder_layer, num_layers=6)
-        >>> src = torch.rand(10, 32, 512)
-        >>> pos_emb = torch.rand(32, 19, 512)
-        >>> out = conformer_encoder(src, pos_emb)
-    """
-
-    def __init__(
-        self, encoder_layer: nn.Module, num_layers: int, norm: nn.Module = None
-    ) -> None:
-        super(ConformerEncoder, self).__init__(
-            encoder_layer=encoder_layer, num_layers=num_layers, norm=norm
-        )
-
-    def forward(
-        self,
-        src: Tensor,
-        pos_emb: Tensor,
-        mask: Optional[Tensor] = None,
-        src_key_padding_mask: Optional[Tensor] = None,
-    ) -> Tensor:
-        r"""Pass the input through the encoder layers in turn.
-
-        Args:
-            src: the sequence to the encoder (required).
-            pos_emb: Positional embedding tensor (required).
-            mask: the mask for the src sequence (optional).
-            src_key_padding_mask: the mask for the src keys per batch (optional).
-
-        Shape:
-            src: (S, N, E).
-            pos_emb: (N, 2*S-1, E) or (1, 2*S-1, E)
-            mask: (S, S).
-            src_key_padding_mask: (N, S).
-            S is the source sequence length, N is the batch size, E is the feature number
-
-        Returns:
-            The output of the last layer, passed through ``self.norm`` when
-            one was given.
-        """
-        output = src
-
-        # The same pos_emb and masks are handed to every layer.
-        for mod in self.layers:
-            output = mod(
-                output,
-                pos_emb,
-                src_mask=mask,
-                src_key_padding_mask=src_key_padding_mask,
-            )
-
-        if self.norm is not None:
-            output = self.norm(output)
-
-        return output
-
-
-class RelPositionalEncoding(torch.nn.Module):
-    """Relative positional encoding module.
-
-    See : Appendix B in "Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context"
-    Modified from https://github.com/espnet/espnet/blob/master/espnet/nets/pytorch_backend/transformer/embedding.py
-
-    Args:
-        d_model: Embedding dimension.
-        dropout_rate: Dropout rate.
-        max_len: Number of positions for which the table is precomputed at
-            construction time; longer inputs trigger a rebuild.
-
-    """
-
-    def __init__(
-        self, d_model: int, dropout_rate: float, max_len: int = 5000
-    ) -> None:
-        """Construct a RelPositionalEncoding object."""
-        super(RelPositionalEncoding, self).__init__()
-        self.d_model = d_model
-        self.xscale = math.sqrt(self.d_model)
-        self.dropout = torch.nn.Dropout(p=dropout_rate)
-        # ``pe`` is a plain attribute, not a registered buffer; extend_pe()
-        # moves it to the input's dtype/device on demand.
-        self.pe = None
-        # Dummy (1, max_len) tensor: extend_pe() only reads its size(1),
-        # dtype and device.
-        self.extend_pe(torch.tensor(0.0).expand(1, max_len))
-
-    def extend_pe(self, x: Tensor) -> None:
-        """Build or reuse the table of positional encodings.
-
-        The cached table is reused (after a dtype/device move if needed) when
-        it already covers ``2 * x.size(1) - 1`` positions; otherwise it is
-        rebuilt for ``x.size(1)``.
-        """
-        if self.pe is not None:
-            # self.pe contains both positive and negative parts
-            # the length of self.pe is 2 * input_len - 1
-            if self.pe.size(1) >= x.size(1) * 2 - 1:
-                # Note: TorchScript doesn't implement operator== for torch.Device
-                if self.pe.dtype != x.dtype or str(self.pe.device) != str(
-                    x.device
-                ):
-                    self.pe = self.pe.to(dtype=x.dtype, device=x.device)
-                return
-        # Suppose `i` means the position of the query vector and `j` means the
-        # position of the key vector. We use positive relative positions when keys
-        # are to the left (i>j) and negative relative positions otherwise (i<j).
-        pe_positive = torch.zeros(x.size(1), self.d_model)
-        pe_negative = torch.zeros(x.size(1), self.d_model)
-        position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1)
-        div_term = torch.exp(
-            torch.arange(0, self.d_model, 2, dtype=torch.float32)
-            * -(math.log(10000.0) / self.d_model)
-        )
-        # Even columns hold sines, odd columns hold cosines.
-        pe_positive[:, 0::2] = torch.sin(position * div_term)
-        pe_positive[:, 1::2] = torch.cos(position * div_term)
-        pe_negative[:, 0::2] = torch.sin(-1 * position * div_term)
-        pe_negative[:, 1::2] = torch.cos(-1 * position * div_term)
-
-        # Reverse the order of positive indices and concat both positive and
-        # negative indices. This is used to support the shifting trick
-        # as in "Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context"
-        pe_positive = torch.flip(pe_positive, [0]).unsqueeze(0)
-        # Row 0 (relative position 0) is already in pe_positive, so drop it here.
-        pe_negative = pe_negative[1:].unsqueeze(0)
-        # Resulting shape: (1, 2 * x.size(1) - 1, d_model).
-        pe = torch.cat([pe_positive, pe_negative], dim=1)
-        self.pe = pe.to(device=x.device, dtype=x.dtype)
-
-    def forward(self, x: torch.Tensor) -> Tuple[Tensor, Tensor]:
-        """Scale the input and return the matching relative positional encodings.
-
-        Args:
-            x (torch.Tensor): Input tensor (batch, time, `*`).
-
-        Returns:
-            torch.Tensor: ``x * sqrt(d_model)`` after dropout, (batch, time, `*`).
-            torch.Tensor: Positional encodings after dropout,
-              (1, 2*time-1, d_model).
-
-        """
-        self.extend_pe(x)
-        x = x * self.xscale
-        # Take the 2*time-1 entries centred on the middle of the table, i.e.
-        # indices [center - time + 1, center + time).
-        pos_emb = self.pe[
-            :,
-            self.pe.size(1) // 2
-            - x.size(1)
-            + 1 : self.pe.size(1) // 2  # noqa E203
-            + x.size(1),
-        ]
-        return self.dropout(x), self.dropout(pos_emb)
-
-
-class RelPositionMultiheadAttention(nn.Module):
-    r"""Multi-Head Attention layer with relative position encoding
-
-    See reference: "Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context"
-
-    Args:
-        embed_dim: total dimension of the model.
-        num_heads: parallel attention heads.
-        dropout: a Dropout layer on attn_output_weights. Default: 0.0.
-
-    Examples::
-
-        >>> rel_pos_multihead_attn = RelPositionMultiheadAttention(embed_dim, num_heads)
-        >>> attn_output, attn_output_weights = multihead_attn(query, key, value, pos_emb)
-    """
-
-    def __init__(
-        self,
-        embed_dim: int,
-        num_heads: int,
-        dropout: float = 0.0,
-    ) -> None:
-        """Create the projections and the two learnable position biases.
-
-        Args:
-            embed_dim: total dimension of the model; must be divisible by
-                ``num_heads``.
-            num_heads: number of parallel attention heads.
-            dropout: dropout probability, stored as a float and later passed
-                to ``multi_head_attention_forward`` as ``dropout_p``.
-        """
-        super(RelPositionMultiheadAttention, self).__init__()
-        self.embed_dim = embed_dim
-        self.num_heads = num_heads
-        self.dropout = dropout
-        self.head_dim = embed_dim // num_heads
-        assert (
-            self.head_dim * num_heads == self.embed_dim
-        ), "embed_dim must be divisible by num_heads"
-
-        # A single linear layer produces q, k and v (3 * embed_dim outputs).
-        self.in_proj = nn.Linear(embed_dim, 3 * embed_dim, bias=True)
-        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=True)
-
-        # linear transformation for positional encoding.
-        self.linear_pos = nn.Linear(embed_dim, embed_dim, bias=False)
-        # these two learnable bias are used in matrix c and matrix d
-        # as described in "Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context" Section 3.3
-        # torch.Tensor(...) allocates uninitialized storage; the values are
-        # set in _reset_parameters() below.
-        self.pos_bias_u = nn.Parameter(torch.Tensor(num_heads, self.head_dim))
-        self.pos_bias_v = nn.Parameter(torch.Tensor(num_heads, self.head_dim))
-
-        self._reset_parameters()
-
-    def _reset_parameters(self) -> None:
-        """Initialize the input projection, the biases and the position biases.
-
-        ``out_proj.weight`` and ``linear_pos.weight`` are not touched here, so
-        they keep whatever ``nn.Linear`` gave them at construction.
-        """
-        nn.init.xavier_uniform_(self.in_proj.weight)
-        nn.init.constant_(self.in_proj.bias, 0.0)
-        nn.init.constant_(self.out_proj.bias, 0.0)
-
-        nn.init.xavier_uniform_(self.pos_bias_u)
-        nn.init.xavier_uniform_(self.pos_bias_v)
-
-    def forward(
-        self,
-        query: Tensor,
-        key: Tensor,
-        value: Tensor,
-        pos_emb: Tensor,
-        key_padding_mask: Optional[Tensor] = None,
-        need_weights: bool = True,
-        attn_mask: Optional[Tensor] = None,
-    ) -> Tuple[Tensor, Optional[Tensor]]:
-        r"""Delegate to ``multi_head_attention_forward`` with this module's
-        weights, dropout probability and training flag.
-
-        Args:
-            query, key, value: map a query and a set of key-value pairs to an output.
-            pos_emb: Positional embedding tensor
-            key_padding_mask: if provided, specified padding elements in the key will
-                be ignored by the attention. When given a binary mask and a value is True,
-                the corresponding value on the attention layer will be ignored. When given
-                a byte mask and a value is non-zero, the corresponding value on the attention
-                layer will be ignored
-            need_weights: output attn_output_weights.
-            attn_mask: 2D or 3D mask that prevents attention to certain positions. A 2D mask will be broadcasted for all
-                the batches while a 3D mask allows to specify a different mask for the entries of each batch.
-
-        Shape:
-            - Inputs:
-            - query: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, E is
-            the embedding dimension.
-            - key: :math:`(S, N, E)`, where S is the source sequence length, N is the batch size, E is
-            the embedding dimension.
-            - value: :math:`(S, N, E)` where S is the source sequence length, N is the batch size, E is
-            the embedding dimension.
-            - pos_emb: :math:`(N, 2*L-1, E)` or :math:`(1, 2*L-1, E)` where L is the target sequence
-            length, N is the batch size, E is the embedding dimension.
-            - key_padding_mask: :math:`(N, S)` where N is the batch size, S is the source sequence length.
-            If a ByteTensor is provided, the non-zero positions will be ignored while the
-            zero positions will be unchanged. If a BoolTensor is provided, the positions with the
-            value of ``True`` will be ignored while the position with the value of ``False`` will be unchanged.
-            - attn_mask: 2D mask :math:`(L, S)` where L is the target sequence length, S is the source sequence length.
-            3D mask :math:`(N*num_heads, L, S)` where N is the batch size, L is the target sequence length,
-            S is the source sequence length. attn_mask ensures that position i is allowed to attend the unmasked
-            positions. If a ByteTensor is provided, the non-zero positions are not allowed to attend
-            while the zero positions will be unchanged. If a BoolTensor is provided, positions with ``True``
-            are not allowed to attend while ``False`` values will be unchanged. If a FloatTensor
-            is provided, it will be added to the attention weight.
-
-            - Outputs:
-            - attn_output: :math:`(L, N, E)` where L is the target sequence length, N is the batch size,
-            E is the embedding dimension.
-            - attn_output_weights: :math:`(N, L, S)` where N is the batch size,
-            L is the target sequence length, S is the source sequence length.
-        """
-        # self.dropout is the float probability stored in __init__, and
-        # self.training decides whether it is applied.
-        return self.multi_head_attention_forward(
-            query,
-            key,
-            value,
-            pos_emb,
-            self.embed_dim,
-            self.num_heads,
-            self.in_proj.weight,
-            self.in_proj.bias,
-            self.dropout,
-            self.out_proj.weight,
-            self.out_proj.bias,
-            training=self.training,
-            key_padding_mask=key_padding_mask,
-            need_weights=need_weights,
-            attn_mask=attn_mask,
-        )
-
-    def rel_shift(self, x: Tensor) -> Tensor:
-        """Compute relative positional encoding.
-
-        The result is a strided view of ``x`` (no copy) in which
-        ``out[b, h, i, j] == x[b, h, i, (time1 - 1) - i + j]``.
-
-        Args:
-            x: Input tensor (batch, head, time1, 2*time1-1).
-                time1 means the length of query vector.
-
-        Returns:
-            Tensor: tensor of shape (batch, head, time1, time2)
-          (note: time2 has the same value as time1, but it is for
-          the key, while time1 is for the query).
-
-        Raises:
-            AssertionError: if the last dimension of ``x`` is not
-              ``2 * time1 - 1``.
-        """
-        (batch_size, num_heads, time1, n) = x.shape
-        assert n == 2 * time1 - 1
-        # Note: TorchScript requires explicit arg for stride()
-        batch_stride = x.stride(0)
-        head_stride = x.stride(1)
-        time1_stride = x.stride(2)
-        n_stride = x.stride(3)
-        # Stepping one row down also steps one column back
-        # (time1_stride - n_stride), and every row starts at column time1 - 1.
-        # NOTE(review): storage_offset is absolute in the underlying storage,
-        # so this presumably relies on x itself starting at offset 0 -- confirm.
-        return x.as_strided(
-            (batch_size, num_heads, time1, time1),
-            (batch_stride, head_stride, time1_stride - n_stride, n_stride),
-            storage_offset=n_stride * (time1 - 1),
-        )
-
-    def multi_head_attention_forward(
-        self,
-        query: Tensor,
-        key: Tensor,
-        value: Tensor,
-        pos_emb: Tensor,
-        embed_dim_to_check: int,
-        num_heads: int,
-        in_proj_weight: Tensor,
-        in_proj_bias: Tensor,
-        dropout_p: float,
-        out_proj_weight: Tensor,
-        out_proj_bias: Tensor,
-        training: bool = True,
-        key_padding_mask: Optional[Tensor] = None,
-        need_weights: bool = True,
-        attn_mask: Optional[Tensor] = None,
-    ) -> Tuple[Tensor, Optional[Tensor]]:
-        r"""
-        Args:
-            query, key, value: map a query and a set of key-value pairs to an output.
-            pos_emb: Positional embedding tensor
-            embed_dim_to_check: total dimension of the model.
-            num_heads: parallel attention heads.
-            in_proj_weight, in_proj_bias: input projection weight and bias.
-            dropout_p: probability of an element to be zeroed.
-            out_proj_weight, out_proj_bias: the output projection weight and bias.
-            training: apply dropout if is ``True``.
-            key_padding_mask: if provided, specified padding elements in the key will
-                be ignored by the attention. This is an binary mask. When the value is True,
-                the corresponding value on the attention layer will be filled with -inf.
-            need_weights: output attn_output_weights.
-            attn_mask: 2D or 3D mask that prevents attention to certain positions. A 2D mask will be broadcasted for all
-                the batches while a 3D mask allows to specify a different mask for the entries of each batch.
-
-        Shape:
-            Inputs:
-            - query: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, E is
-            the embedding dimension.
-            - key: :math:`(S, N, E)`, where S is the source sequence length, N is the batch size, E is
-            the embedding dimension.
-            - value: :math:`(S, N, E)` where S is the source sequence length, N is the batch size, E is
-            the embedding dimension.
-            - pos_emb: :math:`(N, 2*L-1, E)` or :math:`(1, 2*L-1, E)` where L is the target sequence
-            length, N is the batch size, E is the embedding dimension.
-            - key_padding_mask: :math:`(N, S)` where N is the batch size, S is the source sequence length.
-            If a ByteTensor is provided, the non-zero positions will be ignored while the zero positions
-            will be unchanged. If a BoolTensor is provided, the positions with the
-            value of ``True`` will be ignored while the position with the value of ``False`` will be unchanged.
-            - attn_mask: 2D mask :math:`(L, S)` where L is the target sequence length, S is the source sequence length.
-            3D mask :math:`(N*num_heads, L, S)` where N is the batch size, L is the target sequence length,
-            S is the source sequence length. attn_mask ensures that position i is allowed to attend the unmasked
-            positions. If a ByteTensor is provided, the non-zero positions are not allowed to attend
-            while the zero positions will be unchanged. If a BoolTensor is provided, positions with ``True``
-            are not allowed to attend while ``False`` values will be unchanged. If a FloatTensor
-            is provided, it will be added to the attention weight.
-
-            Outputs:
-            - attn_output: :math:`(L, N, E)` where L is the target sequence length, N is the batch size,
-            E is the embedding dimension.
-            - attn_output_weights: :math:`(N, L, S)` where N is the batch size,
-            L is the target sequence length, S is the source sequence length.
-        """
-
-        tgt_len, bsz, embed_dim = query.size()
-        assert embed_dim == embed_dim_to_check
-        assert key.size(0) == value.size(0) and key.size(1) == value.size(1)
-
-        head_dim = embed_dim // num_heads
-        assert (
-            head_dim * num_heads == embed_dim
-        ), "embed_dim must be divisible by num_heads"
-        scaling = float(head_dim) ** -0.5
-
-        if torch.equal(query, key) and torch.equal(key, value):
-            # self-attention
-            q, k, v = nn.functional.linear(
-                query, in_proj_weight, in_proj_bias
-            ).chunk(3, dim=-1)
-
-        elif torch.equal(key, value):
-            # encoder-decoder attention
-            # This is inline in_proj function with in_proj_weight and in_proj_bias
-            _b = in_proj_bias
-            _start = 0
-            _end = embed_dim
-            _w = in_proj_weight[_start:_end, :]
-            if _b is not None:
-                _b = _b[_start:_end]
-            q = nn.functional.linear(query, _w, _b)
-            # This is inline in_proj function with in_proj_weight and in_proj_bias
-            _b = in_proj_bias
-            _start = embed_dim
-            _end = None
-            _w = in_proj_weight[_start:, :]
-            if _b is not None:
-                _b = _b[_start:]
-            k, v = nn.functional.linear(key, _w, _b).chunk(2, dim=-1)
-
-        else:
-            # This is inline in_proj function with in_proj_weight and in_proj_bias
-            _b = in_proj_bias
-            _start = 0
-            _end = embed_dim
-            _w = in_proj_weight[_start:_end, :]
-            if _b is not None:
-                _b = _b[_start:_end]
-            q = nn.functional.linear(query, _w, _b)
-
-            # This is inline in_proj function with in_proj_weight and in_proj_bias
-            _b = in_proj_bias
-            _start = embed_dim
-            _end = embed_dim * 2
-            _w = in_proj_weight[_start:_end, :]
-            if _b is not None:
-                _b = _b[_start:_end]
-            k = nn.functional.linear(key, _w, _b)
-
-            # This is inline in_proj function with in_proj_weight and in_proj_bias
-            _b = in_proj_bias
-            _start = embed_dim * 2
-            _end = None
-            _w = in_proj_weight[_start:, :]
-            if _b is not None:
-                _b = _b[_start:]
-            v = nn.functional.linear(value, _w, _b)
-
-        if attn_mask is not None:
-            assert (
-                attn_mask.dtype == torch.float32
-                or attn_mask.dtype == torch.float64
-                or attn_mask.dtype == torch.float16
-                or attn_mask.dtype == torch.uint8
-                or attn_mask.dtype == torch.bool
-            ), "Only float, byte, and bool types are supported for attn_mask, not {}".format(
-                attn_mask.dtype
-            )
-            if attn_mask.dtype == torch.uint8:
-                warnings.warn(
-                    "Byte tensor for attn_mask is deprecated. Use bool tensor instead."
-                )
-                attn_mask = attn_mask.to(torch.bool)
-
-            if attn_mask.dim() == 2:
-                attn_mask = attn_mask.unsqueeze(0)
-                if list(attn_mask.size()) != [1, query.size(0), key.size(0)]:
-                    raise RuntimeError(
-                        "The size of the 2D attn_mask is not correct."
-                    )
-            elif attn_mask.dim() == 3:
-                if list(attn_mask.size()) != [
-                    bsz * num_heads,
-                    query.size(0),
-                    key.size(0),
-                ]:
-                    raise RuntimeError(
-                        "The size of the 3D attn_mask is not correct."
-                    )
-            else:
-                raise RuntimeError(
-                    "attn_mask's dimension {} is not supported".format(
-                        attn_mask.dim()
-                    )
-                )
-            # attn_mask's dim is 3 now.
-
-        # convert ByteTensor key_padding_mask to bool
-        if (
-            key_padding_mask is not None
-            and key_padding_mask.dtype == torch.uint8
-        ):
-            warnings.warn(
-                "Byte tensor for key_padding_mask is deprecated. Use bool tensor instead."
-            )
-            key_padding_mask = key_padding_mask.to(torch.bool)
-
-        q = q.contiguous().view(tgt_len, bsz, num_heads, head_dim)
-        k = k.contiguous().view(-1, bsz, num_heads, head_dim)
-        v = v.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1)
-
-        src_len = k.size(0)
-
-        if key_padding_mask is not None:
-            assert key_padding_mask.size(0) == bsz, "{} == {}".format(
-                key_padding_mask.size(0), bsz
-            )
-            assert key_padding_mask.size(1) == src_len, "{} == {}".format(
-                key_padding_mask.size(1), src_len
-            )
-
-        q = q.transpose(0, 1)  # (batch, time1, head, d_k)
-
-        pos_emb_bsz = pos_emb.size(0)
-        assert pos_emb_bsz in (1, bsz)  # actually it is 1
-        p = self.linear_pos(pos_emb).view(pos_emb_bsz, -1, num_heads, head_dim)
-        p = p.transpose(1, 2)  # (batch, head, 2*time1-1, d_k)
-
-        q_with_bias_u = (q + self.pos_bias_u).transpose(
-            1, 2
-        )  # (batch, head, time1, d_k)
-
-        q_with_bias_v = (q + self.pos_bias_v).transpose(
-            1, 2
-        )  # (batch, head, time1, d_k)
-
-        # compute attention score
-        # first compute matrix a and matrix c
-        # as described in "Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context" Section 3.3
-        k = k.permute(1, 2, 3, 0)  # (batch, head, d_k, time2)
-        matrix_ac = torch.matmul(
-            q_with_bias_u, k
-        )  # (batch, head, time1, time2)
-
-        # compute matrix b and matrix d
-        matrix_bd = torch.matmul(
-            q_with_bias_v, p.transpose(-2, -1)
-        )  # (batch, head, time1, 2*time1-1)
-        matrix_bd = self.rel_shift(matrix_bd)
-
-        attn_output_weights = (
-            matrix_ac + matrix_bd
-        ) * scaling  # (batch, head, time1, time2)
-
-        attn_output_weights = attn_output_weights.view(
-            bsz * num_heads, tgt_len, -1
-        )
-
-        assert list(attn_output_weights.size()) == [
-            bsz * num_heads,
-            tgt_len,
-            src_len,
-        ]
-
-        if attn_mask is not None:
-            if attn_mask.dtype == torch.bool:
-                attn_output_weights.masked_fill_(attn_mask, float("-inf"))
-            else:
-                attn_output_weights += attn_mask
-
-        if key_padding_mask is not None:
-            attn_output_weights = attn_output_weights.view(
-                bsz, num_heads, tgt_len, src_len
-            )
-            attn_output_weights = attn_output_weights.masked_fill(
-                key_padding_mask.unsqueeze(1).unsqueeze(2),
-                float("-inf"),
-            )
-            attn_output_weights = attn_output_weights.view(
-                bsz * num_heads, tgt_len, src_len
-            )
-
-        attn_output_weights = nn.functional.softmax(attn_output_weights, dim=-1)
-        attn_output_weights = nn.functional.dropout(
-            attn_output_weights, p=dropout_p, training=training
-        )
-
-        attn_output = torch.bmm(attn_output_weights, v)
-        assert list(attn_output.size()) == [bsz * num_heads, tgt_len, head_dim]
-        attn_output = (
-            attn_output.transpose(0, 1)
-            .contiguous()
-            .view(tgt_len, bsz, embed_dim)
-        )
-        attn_output = nn.functional.linear(
-            attn_output, out_proj_weight, out_proj_bias
-        )
-
-        if need_weights:
-            # average attention weights over heads
-            attn_output_weights = attn_output_weights.view(
-                bsz, num_heads, tgt_len, src_len
-            )
-            return attn_output, attn_output_weights.sum(dim=1) / num_heads
-        else:
-            return attn_output, None
-
-
-class ConvolutionModule(nn.Module):
-    """Convolution block used inside a Conformer encoder layer.
-    Modified from https://github.com/espnet/espnet/blob/master/espnet/nets/pytorch_backend/conformer/convolution.py
-
-    The block applies, in order: a pointwise conv that doubles the channels,
-    a GLU over the channel axis, a depthwise conv with symmetric padding,
-    an optional BatchNorm1d, the Swish activation and a final pointwise conv.
-
-    Args:
-        channels (int): The number of channels of conv layers.
-        kernel_size (int): Kernel size of the depthwise conv; must be odd.
-        bias (bool): Whether to use bias in conv layers (default=True).
-        use_batchnorm (bool): Whether to apply BatchNorm1d right after the
-            depthwise conv (default=False).
-
-    """
-
-    def __init__(
-        self,
-        channels: int,
-        kernel_size: int,
-        bias: bool = True,
-        use_batchnorm: bool = False,
-    ) -> None:
-        """Build the sub-layers of the convolution block."""
-        super().__init__()
-        # Only an odd kernel size gives 'SAME' output length with the
-        # symmetric padding used by the depthwise conv below.
-        half_width, remainder = divmod(kernel_size - 1, 2)
-        assert remainder == 0
-        self.use_batchnorm = use_batchnorm
-
-        # Doubles the channels so that GLU can halve them again.
-        self.pointwise_conv1 = nn.Conv1d(
-            in_channels=channels,
-            out_channels=2 * channels,
-            kernel_size=1,
-            stride=1,
-            padding=0,
-            bias=bias,
-        )
-        # groups == channels: every channel is filtered independently.
-        self.depthwise_conv = nn.Conv1d(
-            in_channels=channels,
-            out_channels=channels,
-            kernel_size=kernel_size,
-            stride=1,
-            padding=half_width,
-            groups=channels,
-            bias=bias,
-        )
-        if use_batchnorm:
-            self.norm = nn.BatchNorm1d(channels)
-        self.pointwise_conv2 = nn.Conv1d(
-            in_channels=channels,
-            out_channels=channels,
-            kernel_size=1,
-            stride=1,
-            padding=0,
-            bias=bias,
-        )
-        self.activation = Swish()
-
-    def forward(self, x: Tensor) -> Tensor:
-        """Run the convolution block.
-
-        Args:
-            x: Input tensor (#time, batch, channels).
-
-        Returns:
-            Tensor: Output tensor (#time, batch, channels).
-
-        """
-        # Conv1d wants the feature axis second: (#batch, channels, time).
-        channels_first = x.permute(1, 2, 0)
-
-        # GLU mechanism: (batch, 2*channels, time) -> (batch, channels, time)
-        gated = nn.functional.glu(self.pointwise_conv1(channels_first), dim=1)
-
-        # 1D depthwise conv, optional batch norm, then Swish.
-        filtered = self.depthwise_conv(gated)
-        if self.use_batchnorm:
-            filtered = self.norm(filtered)
-        activated = self.activation(filtered)
-
-        projected = self.pointwise_conv2(activated)  # (batch, channel, time)
-
-        # Back to the (#time, batch, channels) layout of the input.
-        return projected.permute(2, 0, 1)
-
-
-class Swish(torch.nn.Module):
-    """Swish activation, ``x * sigmoid(x)``, applied element-wise."""
-
-    def forward(self, x: Tensor) -> Tensor:
-        """Return the Swish activation of ``x``."""
-        gate = torch.sigmoid(x)
-        return x * gate
-
-
-def identity(x):
-    """Return ``x`` unchanged."""
-    return x
--- a/egs/spgispeech/ASR/conformer_ctc/decode.py
+++ b/egs/spgispeech/ASR/conformer_ctc/decode.py
@ -1,694 +0,0 @@
-#!/usr/bin/env python3
-# Copyright 2021 Xiaomi Corporation (Author: Liyong Guo, Fangjun Kuang)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import argparse
-import logging
-from collections import defaultdict
-from pathlib import Path
-from typing import Dict, List, Optional, Tuple
-
-import k2
-from numpy import True_
-import sentencepiece as spm
-import torch
-import torch.nn as nn
-from asr_datamodule import SPGISpeechAsrDataModule
-from conformer import Conformer
-
-from icefall.bpe_graph_compiler import BpeCtcTrainingGraphCompiler
-from icefall.checkpoint import average_checkpoints, load_checkpoint
-from icefall.decode import (
-    get_lattice,
-    nbest_decoding,
-    nbest_oracle,
-    one_best_decoding,
-    rescore_with_attention_decoder,
-    rescore_with_n_best_list,
-    rescore_with_whole_lattice,
-)
-from icefall.env import get_env_info
-from icefall.lexicon import Lexicon
-from icefall.utils import (
-    AttributeDict,
-    get_texts,
-    setup_logger,
-    store_transcripts,
-    write_error_stats,
-)
-
-
-def get_parser():
-    """Build the command-line parser holding the decoding options.
-
-    Returns:
-      An ``argparse.ArgumentParser`` that shows default values in its help
-      text and defines ``--epoch``, ``--avg``, ``--method``, ``--num-paths``,
-      ``--nbest-scale``, ``--exp-dir``, ``--lang-dir`` and ``--lm-dir``.
-    """
-    parser = argparse.ArgumentParser(
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter
-    )
-
-    parser.add_argument(
-        "--epoch",
-        type=int,
-        default=77,
-        # The trailing space matters: adjacent literals are concatenated
-        # verbatim, and without it the help reads "decoding.Note:".
-        help="It specifies the checkpoint to use for decoding. "
-        "Note: Epoch counts from 0.",
-    )
-    parser.add_argument(
-        "--avg",
-        type=int,
-        default=55,
-        help="Number of checkpoints to average. Automatically select "
-        "consecutive checkpoints before the checkpoint specified by "
-        "'--epoch'. ",
-    )
-
-    parser.add_argument(
-        "--method",
-        type=str,
-        default="attention-decoder",
-        help="""Decoding method.
-        Supported values are:
-            - (0) ctc-decoding. Use CTC decoding. It uses a sentence piece
-              model, i.e., lang_dir/bpe.model, to convert word pieces to words.
-              It needs neither a lexicon nor an n-gram LM.
-            - (1) 1best. Extract the best path from the decoding lattice as the
-              decoding result.
-            - (2) nbest. Extract n paths from the decoding lattice; the path
-              with the highest score is the decoding result.
-            - (3) nbest-rescoring. Extract n paths from the decoding lattice,
-              rescore them with an n-gram LM (e.g., a 4-gram LM), the path with
-              the highest score is the decoding result.
-            - (4) whole-lattice-rescoring. Rescore the decoding lattice with an
-              n-gram LM (e.g., a 4-gram LM), the best path of rescored lattice
-              is the decoding result.
-            - (5) attention-decoder. Extract n paths from the LM rescored
-              lattice, the path with the highest score is the decoding result.
-            - (6) nbest-oracle. Its WER is the lower bound of any n-best
-              rescoring method can achieve. Useful for debugging n-best
-              rescoring method.
-        """,
-    )
-
-    parser.add_argument(
-        "--num-paths",
-        type=int,
-        default=100,
-        help="""Number of paths for n-best based decoding method.
-        Used only when "method" is one of the following values:
-        nbest, nbest-rescoring, attention-decoder, and nbest-oracle
-        """,
-    )
-
-    parser.add_argument(
-        "--nbest-scale",
-        type=float,
-        default=0.5,
-        help="""The scale to be applied to `lattice.scores`.
-        It's needed if you use any kinds of n-best based rescoring.
-        Used only when "method" is one of the following values:
-        nbest, nbest-rescoring, attention-decoder, and nbest-oracle
-        A smaller value results in more unique paths.
-        """,
-    )
-
-    parser.add_argument(
-        "--exp-dir",
-        type=str,
-        default="conformer_ctc/exp",
-        help="The experiment dir",
-    )
-
-    parser.add_argument(
-        "--lang-dir",
-        type=str,
-        default="data/lang_bpe_5000",
-        help="The lang dir",
-    )
-
-    parser.add_argument(
-        "--lm-dir",
-        type=str,
-        default="data/lm",
-        help="""The LM dir.
-        It should contain either G_3_gram.pt or G_3_gram.fst.txt
-        """,
-    )
-
-    return parser
-
-
-def get_params() -> AttributeDict:
-    """Return the fixed (non command-line) settings used by this script.
-
-    The result bundles the conformer model hyper-parameters, the lattice
-    search settings and the environment info from :func:`get_env_info`.
-    """
-    # parameters for conformer
-    model_settings = {
-        "subsampling_factor": 4,
-        "vgg_frontend": False,
-        "use_feat_batchnorm": True,
-        "feature_dim": 80,
-        "nhead": 8,
-        "attention_dim": 512,
-        "num_decoder_layers": 6,
-    }
-    # parameters for decoding
-    search_settings = {
-        "search_beam": 20,
-        "output_beam": 8,
-        "min_active_states": 30,
-        "max_active_states": 10000,
-        "use_double_scores": True,
-    }
-    return AttributeDict(
-        {**model_settings, **search_settings, "env_info": get_env_info()}
-    )
-
-
-def decode_one_batch(
-    params: AttributeDict,
-    model: nn.Module,
-    HLG: Optional[k2.Fsa],
-    H: Optional[k2.Fsa],
-    bpe_model: Optional[spm.SentencePieceProcessor],
-    batch: dict,
-    word_table: k2.SymbolTable,
-    sos_id: int,
-    eos_id: int,
-    G: Optional[k2.Fsa] = None,
-) -> Dict[str, List[List[str]]]:
-    """Decode one batch and return the result in a dict. The dict has the
-    following format:
-
-        - key: It indicates the setting used for decoding. For example,
-               if no rescoring is used, the key is the string `no_rescore`.
-               If LM rescoring is used, the key is the string `lm_scale_xxx`,
-               where `xxx` is the value of `lm_scale`. An example key is
-               `lm_scale_0.7`
-        - value: It contains the decoding result. `len(value)` equals to
-                 batch size. `value[i]` is the decoding result for the i-th
-                 utterance in the given batch.
-    Args:
-      params:
-        It's the return value of :func:`get_params`.
-
-        - params.method is "ctc-decoding", it takes the best path of the
-          lattice built from H and converts token IDs to words with bpe_model.
-        - params.method is "nbest-oracle", it picks paths with
-          :func:`nbest_oracle` using the reference texts of the batch.
-        - params.method is "1best", it uses 1best decoding without LM rescoring.
-        - params.method is "nbest", it uses nbest decoding without LM rescoring.
-        - params.method is "nbest-rescoring", it uses nbest LM rescoring.
-        - params.method is "whole-lattice-rescoring", it uses whole lattice LM
-          rescoring.
-        - params.method is "attention-decoder", it rescores the whole lattice
-          with G and then rescores n-best paths with the attention decoder
-          of the model.
-
-      model:
-        The neural model.
-      HLG:
-        The decoding graph. Used only when params.method is NOT ctc-decoding.
-      H:
-        The ctc topo. Used only when params.method is ctc-decoding.
-      bpe_model:
-        The BPE model. Used only when params.method is ctc-decoding.
-      batch:
-        It is the return value from iterating
-        `lhotse.dataset.K2SpeechRecognitionDataset`. See its documentation
-        for the format of the `batch`.
-      word_table:
-        The word symbol table.
-      sos_id:
-        The token ID of the SOS.
-      eos_id:
-        The token ID of the EOS.
-      G:
-        An LM. It is not None when params.method is "nbest-rescoring",
-        "whole-lattice-rescoring" or "attention-decoder". In general, the G
-        in HLG is a 3-gram LM, while this G is a 4-gram LM.
-    Returns:
-      Return the decoding result. See above description for the format of
-      the returned dict. Note: If it decodes to nothing, then return None.
-    """
-    # Exactly one of HLG / H is expected to be given (asserted further
-    # below); the computation runs on the device of that graph.
-    if HLG is not None:
-        device = HLG.device
-    else:
-        device = H.device
-    feature = batch["inputs"]
-    assert feature.ndim == 3
-    feature = feature.to(device)
-    # at entry, feature is (N, T, C)
-
-    supervisions = batch["supervisions"]
-
-    nnet_output, memory, memory_key_padding_mask = model(feature, supervisions)
-    # nnet_output is (N, T, C)
-
-    # One row per supervision: (sequence index, start frame, number of
-    # frames), with frame values divided by the subsampling factor.
-    supervision_segments = torch.stack(
-        (
-            supervisions["sequence_idx"],
-            supervisions["start_frame"] // params.subsampling_factor,
-            supervisions["num_frames"] // params.subsampling_factor,
-        ),
-        1,
-    ).to(torch.int32)
-
-    if H is None:
-        assert HLG is not None
-        decoding_graph = HLG
-    else:
-        assert HLG is None
-        assert bpe_model is not None
-        decoding_graph = H
-
-    lattice = get_lattice(
-        nnet_output=nnet_output,
-        decoding_graph=decoding_graph,
-        supervision_segments=supervision_segments,
-        search_beam=params.search_beam,
-        output_beam=params.output_beam,
-        min_active_states=params.min_active_states,
-        max_active_states=params.max_active_states,
-        subsampling_factor=params.subsampling_factor,
-    )
-
-    if params.method == "ctc-decoding":
-        best_path = one_best_decoding(
-            lattice=lattice, use_double_scores=params.use_double_scores
-        )
-        # Note: `best_path.aux_labels` contains token IDs, not word IDs
-        # since we are using H, not HLG here.
-        #
-        # token_ids is a list-of-lists of IDs
-        token_ids = get_texts(best_path)
-
-        # hyps is a list of str, e.g., ['xxx yyy zzz', ...]
-        hyps = bpe_model.decode(token_ids)
-
-        # hyps is a list of list of str, e.g., [['xxx', 'yyy', 'zzz'], ... ]
-        hyps = [s.split() for s in hyps]
-        key = "ctc-decoding"
-        return {key: hyps}
-
-    if params.method == "nbest-oracle":
-        # Note: You can also pass rescored lattices to it.
-        # We choose the HLG decoded lattice for speed reasons
-        # as HLG decoding is faster and the oracle WER
-        # is only slightly worse than that of rescored lattices.
-        best_path = nbest_oracle(
-            lattice=lattice,
-            num_paths=params.num_paths,
-            ref_texts=supervisions["text"],
-            word_table=word_table,
-            nbest_scale=params.nbest_scale,
-            oov="<UNK>",
-        )
-        hyps = get_texts(best_path)
-        hyps = [[word_table[i] for i in ids] for ids in hyps]
-        key = f"oracle_{params.num_paths}_nbest_scale_{params.nbest_scale}"  # noqa
-        return {key: hyps}
-
-    if params.method in ["1best", "nbest"]:
-        if params.method == "1best":
-            best_path = one_best_decoding(
-                lattice=lattice, use_double_scores=params.use_double_scores
-            )
-            key = "no_rescore"
-        else:
-            best_path = nbest_decoding(
-                lattice=lattice,
-                num_paths=params.num_paths,
-                use_double_scores=params.use_double_scores,
-                nbest_scale=params.nbest_scale,
-            )
-            key = f"no_rescore-nbest-scale-{params.nbest_scale}-{params.num_paths}"  # noqa
-
-        hyps = get_texts(best_path)
-        hyps = [[word_table[i] for i in ids] for ids in hyps]
-        return {key: hyps}
-
-    # Only the methods that rescore with G remain from here on.
-    assert params.method in [
-        "nbest-rescoring",
-        "whole-lattice-rescoring",
-        "attention-decoder",
-    ]
-
-    # LM scales 0.1, 0.2, ..., 2.0 tried by the n-gram rescoring methods.
-    lm_scale_list = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7]
-    lm_scale_list += [0.8, 0.9, 1.0, 1.1, 1.2, 1.3]
-    lm_scale_list += [1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0]
-
-    if params.method == "nbest-rescoring":
-        best_path_dict = rescore_with_n_best_list(
-            lattice=lattice,
-            G=G,
-            num_paths=params.num_paths,
-            lm_scale_list=lm_scale_list,
-            nbest_scale=params.nbest_scale,
-        )
-    elif params.method == "whole-lattice-rescoring":
-        best_path_dict = rescore_with_whole_lattice(
-            lattice=lattice,
-            G_with_epsilon_loops=G,
-            lm_scale_list=lm_scale_list,
-        )
-    elif params.method == "attention-decoder":
-        # lattice uses a 3-gram LM. We rescore it with a 4-gram LM.
-        # NOTE(review): main() in this file loads G_3_gram, so the G passed
-        # here looks like a 3-gram LM as well -- confirm.
-        rescored_lattice = rescore_with_whole_lattice(
-            lattice=lattice,
-            G_with_epsilon_loops=G,
-            lm_scale_list=None,
-        )
-        # TODO: pass `lattice` instead of `rescored_lattice` to
-        # `rescore_with_attention_decoder`
-
-        best_path_dict = rescore_with_attention_decoder(
-            lattice=rescored_lattice,
-            num_paths=params.num_paths,
-            model=model,
-            memory=memory,
-            memory_key_padding_mask=memory_key_padding_mask,
-            sos_id=sos_id,
-            eos_id=eos_id,
-            nbest_scale=params.nbest_scale,
-        )
-    else:
-        # Not reachable after the assert above unless asserts are disabled.
-        assert False, f"Unsupported decoding method: {params.method}"
-
-    # Map every best path from word IDs to word strings; a None
-    # best_path_dict is passed on to the caller as None.
-    ans = dict()
-    if best_path_dict is not None:
-        for lm_scale_str, best_path in best_path_dict.items():
-            hyps = get_texts(best_path)
-            hyps = [[word_table[i] for i in ids] for ids in hyps]
-            ans[lm_scale_str] = hyps
-    else:
-        ans = None
-    return ans
-
-
-def decode_dataset(
-    dl: torch.utils.data.DataLoader,
-    params: AttributeDict,
-    model: nn.Module,
-    HLG: Optional[k2.Fsa],
-    H: Optional[k2.Fsa],
-    bpe_model: Optional[spm.SentencePieceProcessor],
-    word_table: k2.SymbolTable,
-    sos_id: int,
-    eos_id: int,
-    G: Optional[k2.Fsa] = None,
-) -> Dict[str, List[Tuple[List[str], List[str]]]]:
-    """Decode every batch of a dataloader and collect ref/hyp pairs.
-
-    Args:
-      dl:
-        PyTorch's dataloader containing the dataset to decode.
-      params:
-        It is returned by :func:`get_params`.
-      model:
-        The neural model.
-      HLG:
-        The decoding graph. Used only when params.method is NOT ctc-decoding.
-      H:
-        The ctc topo. Used only when params.method is ctc-decoding.
-      bpe_model:
-        The BPE model. Used only when params.method is ctc-decoding.
-      word_table:
-        It is the word symbol table.
-      sos_id:
-        The token ID for SOS.
-      eos_id:
-        The token ID for EOS.
-      G:
-        An LM, forwarded unchanged to :func:`decode_one_batch`.
-    Returns:
-      Return a dict keyed by the decoding setting reported by
-      :func:`decode_one_batch`. Each value is a list of tuples
-      ``(ref_words, hyp_words)``: the reference transcript split on
-      whitespace and the predicted words.
-    """
-    num_cuts = 0
-
-    # Not every dataloader knows its length; fall back to "?" for logging.
-    try:
-        num_batches = len(dl)
-    except TypeError:
-        num_batches = "?"
-
-    results = defaultdict(list)
-    for batch_idx, batch in enumerate(dl):
-        texts = batch["supervisions"]["text"]
-
-        hyps_dict = decode_one_batch(
-            params=params,
-            model=model,
-            HLG=HLG,
-            H=H,
-            bpe_model=bpe_model,
-            batch=batch,
-            word_table=word_table,
-            G=G,
-            sos_id=sos_id,
-            eos_id=eos_id,
-        )
-
-        if hyps_dict is None:
-            # Nothing was decoded for this batch: record an empty hypothesis
-            # for each utterance under every setting seen so far.
-            assert len(results) > 0, "It should not decode to empty in the first batch!"
-            empty_hyp = []
-            padded = [(ref_text.split(), empty_hyp) for ref_text in texts]
-            for setting in results.keys():
-                results[setting].extend(padded)
-        else:
-            for setting, hyps in hyps_dict.items():
-                assert len(hyps) == len(texts)
-                pairs = [
-                    (ref_text.split(), hyp_words)
-                    for hyp_words, ref_text in zip(hyps, texts)
-                ]
-                results[setting].extend(pairs)
-
-        num_cuts += len(texts)
-
-        if batch_idx % 100 == 0:
-            batch_str = f"{batch_idx}/{num_batches}"
-
-            logging.info(f"batch {batch_str}, cuts processed until now is {num_cuts}")
-    return results
-
-
-def save_results(
-    params: AttributeDict,
-    test_set_name: str,
-    results_dict: Dict[str, List[Tuple[List[int], List[int]]]],
-):
-    """Write transcripts, error statistics and a WER summary for one test set.
-
-    For every key of ``results_dict`` a ``recogs-*.txt`` and an ``errs-*.txt``
-    file are written under ``params.exp_dir``. A ``wer-summary-*.txt`` file
-    then lists all settings sorted by increasing WER, and the same table is
-    logged.
-
-    Args:
-      params:
-        Provides ``exp_dir`` (output directory) and ``method``.
-      test_set_name:
-        Name of the test set; it becomes part of every file name.
-      results_dict:
-        Maps a decoding setting to its list of (reference, hypothesis) pairs.
-    """
-    # Per-file logging is turned off for attention-decoder since there are
-    # too many logs.
-    enable_log = params.method != "attention-decoder"
-
-    wer_by_setting = dict()
-    for key, results in results_dict.items():
-        recog_path = params.exp_dir / f"recogs-{test_set_name}-{key}.txt"
-        store_transcripts(filename=recog_path, texts=results)
-        if enable_log:
-            logging.info(f"The transcripts are stored in {recog_path}")
-
-        # The following prints out WERs, per-word error statistics and aligned
-        # ref/hyp pairs.
-        errs_filename = params.exp_dir / f"errs-{test_set_name}-{key}.txt"
-        with open(errs_filename, "w") as f:
-            wer_by_setting[key] = write_error_stats(
-                f, f"{test_set_name}-{key}", results, enable_log=enable_log
-            )
-
-        if enable_log:
-            logging.info("Wrote detailed error stats to {}".format(errs_filename))
-
-    # Lowest WER first.
-    ranked = sorted(wer_by_setting.items(), key=lambda item: item[1])
-
-    errs_info = params.exp_dir / f"wer-summary-{test_set_name}.txt"
-    with open(errs_info, "w") as f:
-        print("settings\tWER", file=f)
-        for key, val in ranked:
-            print("{}\t{}".format(key, val), file=f)
-
-    s = "\nFor {}, WER of different settings are:\n".format(test_set_name)
-    for rank, (key, val) in enumerate(ranked):
-        # Only the first (lowest-WER) row is tagged as the best one.
-        note = "\tbest for {}".format(test_set_name) if rank == 0 else ""
-        s += "{}\t{}{}\n".format(key, val, note)
-    logging.info(s)
-
-
-@torch.no_grad()
-def main():
-    """Decode the SPGISpeech dev and val sets with a trained conformer model.
-
-    Steps: parse the command line, build the decoding graph (H for
-    ctc-decoding, HLG otherwise), load G when the method rescores with an
-    LM, load one checkpoint or an average of checkpoints into the model,
-    decode both test sets and write the results with :func:`save_results`.
-    """
-    parser = get_parser()
-    SPGISpeechAsrDataModule.add_arguments(parser)
-    args = parser.parse_args()
-    args.exp_dir = Path(args.exp_dir)
-    args.lang_dir = Path(args.lang_dir)
-    args.lm_dir = Path(args.lm_dir)
-
-    params = get_params()
-    params.update(vars(args))
-
-    setup_logger(f"{params.exp_dir}/log-{params.method}/log-decode")
-    logging.info("Decoding started")
-    logging.info(params)
-
-    lexicon = Lexicon(params.lang_dir)
-    max_token_id = max(lexicon.tokens)
-    num_classes = max_token_id + 1  # +1 for the blank
-
-    device = torch.device("cpu")
-    if torch.cuda.is_available():
-        device = torch.device("cuda", 0)
-
-    logging.info(f"device: {device}")
-
-    # Only used here to look up the SOS/EOS token IDs.
-    graph_compiler = BpeCtcTrainingGraphCompiler(
-        params.lang_dir,
-        device=device,
-        sos_token="<sos/eos>",
-        eos_token="<sos/eos>",
-    )
-    sos_id = graph_compiler.sos_id
-    eos_id = graph_compiler.eos_id
-
-    if params.method == "ctc-decoding":
-        HLG = None
-        H = k2.ctc_topo(
-            max_token=max_token_id,
-            modified=True,  # Use modified topology since vocab size is large
-            device=device,
-        )
-        bpe_model = spm.SentencePieceProcessor()
-        bpe_model.load(str(params.lang_dir / "bpe.model"))
-    else:
-        H = None
-        bpe_model = None
-        HLG = k2.Fsa.from_dict(
-            torch.load(f"{params.lang_dir}/HLG.pt", map_location=device)
-        )
-        assert HLG.requires_grad is False
-
-        if not hasattr(HLG, "lm_scores"):
-            HLG.lm_scores = HLG.scores.clone()
-
-    if params.method in (
-        "nbest-rescoring",
-        "whole-lattice-rescoring",
-        "attention-decoder",
-    ):
-        if not (params.lm_dir / "G_3_gram.pt").is_file():
-            logging.info("Loading G_3_gram.fst.txt")
-            logging.warning("It may take 8 minutes.")
-            with open(params.lm_dir / "G_3_gram.fst.txt") as f:
-                first_word_disambig_id = lexicon.word_table["#0"]
-
-                G = k2.Fsa.from_openfst(f.read(), acceptor=False)
-                # G.aux_labels is not needed in later computations, so
-                # remove it here.
-                del G.aux_labels
-                # CAUTION: The following line is crucial.
-                # Arcs entering the back-off state have label equal to #0.
-                # We have to change it to 0 here.
-                G.labels[G.labels >= first_word_disambig_id] = 0
-                # See https://github.com/k2-fsa/k2/issues/874
-                # for why we need to set G.properties to None
-                G.__dict__["_properties"] = None
-                G = k2.Fsa.from_fsas([G]).to(device)
-                G = k2.arc_sort(G)
-                # Save a dummy value so that it can be loaded in C++.
-                # See https://github.com/pytorch/pytorch/issues/67902
-                # for why we need to do this.
-                G.dummy = 1
-
-                torch.save(G.as_dict(), params.lm_dir / "G_3_gram.pt")
-        else:
-            logging.info("Loading pre-compiled G_3_gram.pt")
-            d = torch.load(params.lm_dir / "G_3_gram.pt", map_location=device)
-            G = k2.Fsa.from_dict(d)
-
-        if params.method in ["whole-lattice-rescoring", "attention-decoder"]:
-            # Add epsilon self-loops to G as we will compose
-            # it with the whole lattice later
-            G = k2.add_epsilon_self_loops(G)
-            G = k2.arc_sort(G)
-            G = G.to(device)
-
-        # G.lm_scores is used to replace HLG.lm_scores during
-        # LM rescoring.
-        G.lm_scores = G.scores.clone()
-    else:
-        G = None
-
-    model = Conformer(
-        num_features=params.feature_dim,
-        nhead=params.nhead,
-        d_model=params.attention_dim,
-        num_classes=num_classes,
-        subsampling_factor=params.subsampling_factor,
-        num_decoder_layers=params.num_decoder_layers,
-        vgg_frontend=params.vgg_frontend,
-        use_feat_batchnorm=params.use_feat_batchnorm,
-    )
-
-    if params.avg == 1:
-        load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
-    else:
-        start = params.epoch - params.avg + 1
-        filenames = []
-        for i in range(start, params.epoch + 1):
-            # Epochs count from 0. Test each index, not the loop-invariant
-            # `start`: otherwise a negative `start` drops every checkpoint
-            # instead of only the non-existent negative epochs.
-            if i >= 0:
-                filenames.append(f"{params.exp_dir}/epoch-{i}.pt")
-        logging.info(f"averaging {filenames}")
-        model.to(device)
-        model.load_state_dict(average_checkpoints(filenames, device=device))
-
-    model.to(device)
-    model.eval()
-    num_param = sum([p.numel() for p in model.parameters()])
-    logging.info(f"Number of model parameters: {num_param}")
-
-    spgispeech = SPGISpeechAsrDataModule(args)
-
-    dev_cuts = spgispeech.dev_cuts()
-    val_cuts = spgispeech.val_cuts()
-
-    dev_dl = spgispeech.test_dataloaders(dev_cuts)
-    val_dl = spgispeech.test_dataloaders(val_cuts)
-
-    test_sets = ["dev", "val"]
-    test_dls = [dev_dl, val_dl]
-
-    # Use a separate name for the per-set dataloader so the loop does not
-    # rebind the list it is iterating over.
-    for test_set, dl in zip(test_sets, test_dls):
-        results_dict = decode_dataset(
-            dl=dl,
-            params=params,
-            model=model,
-            HLG=HLG,
-            H=H,
-            bpe_model=bpe_model,
-            word_table=lexicon.word_table,
-            G=G,
-            sos_id=sos_id,
-            eos_id=eos_id,
-        )
-
-        save_results(params=params, test_set_name=test_set, results_dict=results_dict)
-
-    logging.info("Done!")
-
-
-# Limit PyTorch to one intra-op thread and one inter-op thread. These calls
-# run at import time, before main() is invoked.
-torch.set_num_threads(1)
-torch.set_num_interop_threads(1)
-
-# Run the decoding only when this file is executed as a script.
-if __name__ == "__main__":
-    main()
--- a/egs/spgispeech/ASR/conformer_ctc/label_smoothing.py
+++ b/egs/spgispeech/ASR/conformer_ctc/label_smoothing.py
@ -1,101 +0,0 @@
-# Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-
-
-class LabelSmoothingLoss(torch.nn.Module):
-    """
-    Cross entropy against label-smoothed targets, as proposed in
-    https://arxiv.org/pdf/1512.00567.pdf
-    (Rethinking the Inception Architecture for Computer Vision)
-
-    Positions whose target equals ``ignore_index`` contribute zero loss.
-    """
-
-    def __init__(
-        self,
-        ignore_index: int = -1,
-        label_smoothing: float = 0.1,
-        reduction: str = "sum",
-    ) -> None:
-        """
-        Args:
-          ignore_index:
-            ignored class id
-          label_smoothing:
-            smoothing rate (0.0 means the conventional cross entropy loss);
-            it must lie in [0, 1).
-          reduction:
-            It has the same meaning as the reduction in
-            `torch.nn.CrossEntropyLoss`. "sum" and "mean" are handled
-            explicitly; any other value is treated like "none" and yields
-            one loss value per position.
-        """
-        super().__init__()
-        assert 0.0 <= label_smoothing < 1.0
-        self.ignore_index = ignore_index
-        self.label_smoothing = label_smoothing
-        self.reduction = reduction
-
-    def forward(self, x: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
-        """
-        Compute loss between x and target.
-
-        Args:
-          x:
-            prediction of dimension
-            (batch_size, input_length, number_of_classes).
-          target:
-            target masked with self.ignore_index of
-            dimension (batch_size, input_length).
-
-        Returns:
-          For reduction "sum", a scalar tensor with the total loss. For
-          "mean", that total divided by the number of non-ignored positions.
-          Otherwise a tensor of shape (batch_size * input_length,) holding
-          the loss of every position.
-        """
-        assert x.ndim == 3
-        assert target.ndim == 2
-        assert x.shape[:2] == target.shape
-        num_classes = x.size(-1)
-
-        # Flatten to (N*T, C) scores and (N*T,) labels.
-        logits = x.reshape(-1, num_classes)
-        labels = target.reshape(-1)
-
-        ignored = labels == self.ignore_index
-        # Out-of-place replacement (the caller's `target` is never modified);
-        # ignored labels become class 0 so that one_hot accepts them.
-        labels = torch.where(ignored, torch.zeros_like(labels), labels)
-
-        one_hot = torch.nn.functional.one_hot(labels, num_classes=num_classes)
-        true_dist = one_hot.to(logits)
-        true_dist = (
-            true_dist * (1 - self.label_smoothing)
-            + self.label_smoothing / num_classes
-        )
-        # Zero the whole row of every ignored position; the (N*T, 1) mask
-        # broadcasts over the class axis.
-        true_dist = torch.where(
-            ignored.unsqueeze(1), torch.zeros_like(true_dist), true_dist
-        )
-
-        loss = -1 * (torch.log_softmax(logits, dim=1) * true_dist)
-        if self.reduction == "sum":
-            return loss.sum()
-        if self.reduction == "mean":
-            return loss.sum() / (~ignored).sum()
-        return loss.sum(dim=-1)
--- a/egs/spgispeech/ASR/conformer_ctc/pretrained.py
+++ b/egs/spgispeech/ASR/conformer_ctc/pretrained.py
@ -1,435 +0,0 @@
-#!/usr/bin/env python3
-# Copyright      2021  Xiaomi Corp.        (authors: Fangjun Kuang,
-#                                                    Mingshuang Luo)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import argparse
-import logging
-import math
-from typing import List
-
-import k2
-import kaldifeat
-import sentencepiece as spm
-import torch
-import torchaudio
-from conformer import Conformer
-from torch.nn.utils.rnn import pad_sequence
-
-from icefall.decode import (
-    get_lattice,
-    one_best_decoding,
-    rescore_with_attention_decoder,
-    rescore_with_whole_lattice,
-)
-from icefall.utils import AttributeDict, get_texts
-
-
-def get_parser():
-    parser = argparse.ArgumentParser(
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter
-    )
-
-    parser.add_argument(
-        "--checkpoint",
-        type=str,
-        required=True,
-        help="Path to the checkpoint. "
-        "The checkpoint is assumed to be saved by "
-        "icefall.checkpoint.save_checkpoint().",
-    )
-
-    parser.add_argument(
-        "--words-file",
-        type=str,
-        help="""Path to words.txt.
-        Used only when method is not ctc-decoding.
-        """,
-    )
-
-    parser.add_argument(
-        "--HLG",
-        type=str,
-        help="""Path to HLG.pt.
-        Used only when method is not ctc-decoding.
-        """,
-    )
-
-    parser.add_argument(
-        "--bpe-model",
-        type=str,
-        help="""Path to bpe.model.
-        Used only when method is ctc-decoding.
-        """,
-    )
-
-    parser.add_argument(
-        "--method",
-        type=str,
-        default="1best",
-        help="""Decoding method.
-        Possible values are:
-        (0) ctc-decoding - Use CTC decoding. It uses a sentence
-            piece model, i.e., lang_dir/bpe.model, to convert
-            word pieces to words. It needs neither a lexicon
-            nor an n-gram LM.
-        (1) 1best - Use the best path as decoding output. Only
-            the transformer encoder output is used for decoding.
-            We call it HLG decoding.
-        (2) whole-lattice-rescoring - Use an LM to rescore the
-            decoding lattice and then use 1best to decode the
-            rescored lattice.
-            We call it HLG decoding + n-gram LM rescoring.
-        (3) attention-decoder - Extract n paths from the rescored
-            lattice and use the transformer attention decoder for
-            rescoring.
-            We call it HLG decoding + n-gram LM rescoring + attention
-            decoder rescoring.
-        """,
-    )
-
-    parser.add_argument(
-        "--G",
-        type=str,
-        help="""An LM for rescoring.
-        Used only when method is
-        whole-lattice-rescoring or attention-decoder.
-        It's usually a 4-gram LM.
-        """,
-    )
-
-    parser.add_argument(
-        "--num-paths",
-        type=int,
-        default=100,
-        help="""
-        Used only when method is attention-decoder.
-        It specifies the size of n-best list.""",
-    )
-
-    parser.add_argument(
-        "--ngram-lm-scale",
-        type=float,
-        default=1.3,
-        help="""
-        Used only when method is whole-lattice-rescoring and attention-decoder.
-        It specifies the scale for n-gram LM scores.
-        (Note: You need to tune it on a dataset.)
-        """,
-    )
-
-    parser.add_argument(
-        "--attention-decoder-scale",
-        type=float,
-        default=1.2,
-        help="""
-        Used only when method is attention-decoder.
-        It specifies the scale for attention decoder scores.
-        (Note: You need to tune it on a dataset.)
-        """,
-    )
-
-    parser.add_argument(
-        "--nbest-scale",
-        type=float,
-        default=0.5,
-        help="""
-        Used only when method is attention-decoder.
-        It specifies the scale for lattice.scores when
-        extracting n-best lists. A smaller value results in
-        more unique number of paths with the risk of missing
-        the best path.
-        """,
-    )
-
-    parser.add_argument(
-        "--sos-id",
-        type=int,
-        default=1,
-        help="""
-        Used only when method is attention-decoder.
-        It specifies ID for the SOS token.
-        """,
-    )
-
-    parser.add_argument(
-        "--num-classes",
-        type=int,
-        default=500,
-        help="""
-        Vocab size in the BPE model.
-        """,
-    )
-
-    parser.add_argument(
-        "--eos-id",
-        type=int,
-        default=1,
-        help="""
-        Used only when method is attention-decoder.
-        It specifies ID for the EOS token.
-        """,
-    )
-
-    parser.add_argument(
-        "sound_files",
-        type=str,
-        nargs="+",
-        help="The input sound file(s) to transcribe. "
-        "Supported formats are those supported by torchaudio.load(). "
-        "For example, wav and flac are supported. "
-        "The sample rate has to be 16kHz.",
-    )
-
-    return parser
-
-
-def get_params() -> AttributeDict:
-    params = AttributeDict(
-        {
-            "sample_rate": 16000,
-            # parameters for conformer
-            "subsampling_factor": 4,
-            "vgg_frontend": False,
-            "use_feat_batchnorm": True,
-            "feature_dim": 80,
-            "nhead": 8,
-            "attention_dim": 512,
-            "num_decoder_layers": 6,
-            # parameters for decoding
-            "search_beam": 20,
-            "output_beam": 8,
-            "min_active_states": 30,
-            "max_active_states": 10000,
-            "use_double_scores": True,
-        }
-    )
-    return params
-
-
-def read_sound_files(
-    filenames: List[str], expected_sample_rate: float
-) -> List[torch.Tensor]:
-    """Read a list of sound files into a list 1-D float32 torch tensors.
-    Args:
-      filenames:
-        A list of sound filenames.
-      expected_sample_rate:
-        The expected sample rate of the sound files.
-    Returns:
-      Return a list of 1-D float32 torch tensors.
-    """
-    ans = []
-    for f in filenames:
-        wave, sample_rate = torchaudio.load(f)
-        assert sample_rate == expected_sample_rate, (
-            f"expected sample rate: {expected_sample_rate}. "
-            f"Given: {sample_rate}"
-        )
-        # We use only the first channel
-        ans.append(wave[0])
-    return ans
-
-
-def main():
-    parser = get_parser()
-    args = parser.parse_args()
-
-    params = get_params()
-    if args.method != "attention-decoder":
-        # to save memory as the attention decoder
-        # will not be used
-        params.num_decoder_layers = 0
-
-    params.update(vars(args))
-    logging.info(f"{params}")
-
-    device = torch.device("cpu")
-    if torch.cuda.is_available():
-        device = torch.device("cuda", 0)
-
-    logging.info(f"device: {device}")
-
-    logging.info("Creating model")
-    model = Conformer(
-        num_features=params.feature_dim,
-        nhead=params.nhead,
-        d_model=params.attention_dim,
-        num_classes=params.num_classes,
-        subsampling_factor=params.subsampling_factor,
-        num_decoder_layers=params.num_decoder_layers,
-        vgg_frontend=params.vgg_frontend,
-        use_feat_batchnorm=params.use_feat_batchnorm,
-    )
-
-    checkpoint = torch.load(args.checkpoint, map_location="cpu")
-    model.load_state_dict(checkpoint["model"], strict=False)
-    model.to(device)
-    model.eval()
-
-    logging.info("Constructing Fbank computer")
-    opts = kaldifeat.FbankOptions()
-    opts.device = device
-    opts.frame_opts.dither = 0
-    opts.frame_opts.snip_edges = False
-    opts.frame_opts.samp_freq = params.sample_rate
-    opts.mel_opts.num_bins = params.feature_dim
-
-    fbank = kaldifeat.Fbank(opts)
-
-    logging.info(f"Reading sound files: {params.sound_files}")
-    waves = read_sound_files(
-        filenames=params.sound_files, expected_sample_rate=params.sample_rate
-    )
-    waves = [w.to(device) for w in waves]
-
-    logging.info("Decoding started")
-    features = fbank(waves)
-
-    features = pad_sequence(
-        features, batch_first=True, padding_value=math.log(1e-10)
-    )
-
-    # Note: We don't use key padding mask for attention during decoding
-    with torch.no_grad():
-        nnet_output, memory, memory_key_padding_mask = model(features)
-
-    batch_size = nnet_output.shape[0]
-    supervision_segments = torch.tensor(
-        [[i, 0, nnet_output.shape[1]] for i in range(batch_size)],
-        dtype=torch.int32,
-    )
-
-    if params.method == "ctc-decoding":
-        logging.info("Use CTC decoding")
-        bpe_model = spm.SentencePieceProcessor()
-        bpe_model.load(params.bpe_model)
-        max_token_id = params.num_classes - 1
-
-        H = k2.ctc_topo(
-            max_token=max_token_id,
-            modified=False,
-            device=device,
-        )
-
-        lattice = get_lattice(
-            nnet_output=nnet_output,
-            decoding_graph=H,
-            supervision_segments=supervision_segments,
-            search_beam=params.search_beam,
-            output_beam=params.output_beam,
-            min_active_states=params.min_active_states,
-            max_active_states=params.max_active_states,
-            subsampling_factor=params.subsampling_factor,
-        )
-
-        best_path = one_best_decoding(
-            lattice=lattice, use_double_scores=params.use_double_scores
-        )
-        token_ids = get_texts(best_path)
-        hyps = bpe_model.decode(token_ids)
-        hyps = [s.split() for s in hyps]
-    elif params.method in [
-        "1best",
-        "whole-lattice-rescoring",
-        "attention-decoder",
-    ]:
-        logging.info(f"Loading HLG from {params.HLG}")
-        HLG = k2.Fsa.from_dict(torch.load(params.HLG, map_location="cpu"))
-        HLG = HLG.to(device)
-        if not hasattr(HLG, "lm_scores"):
-            # For whole-lattice-rescoring and attention-decoder
-            HLG.lm_scores = HLG.scores.clone()
-
-        if params.method in [
-            "whole-lattice-rescoring",
-            "attention-decoder",
-        ]:
-            logging.info(f"Loading G from {params.G}")
-            G = k2.Fsa.from_dict(torch.load(params.G, map_location="cpu"))
-            # Add epsilon self-loops to G as we will compose
-            # it with the whole lattice later
-            G = G.to(device)
-            G = k2.add_epsilon_self_loops(G)
-            G = k2.arc_sort(G)
-            G.lm_scores = G.scores.clone()
-
-        lattice = get_lattice(
-            nnet_output=nnet_output,
-            decoding_graph=HLG,
-            supervision_segments=supervision_segments,
-            search_beam=params.search_beam,
-            output_beam=params.output_beam,
-            min_active_states=params.min_active_states,
-            max_active_states=params.max_active_states,
-            subsampling_factor=params.subsampling_factor,
-        )
-
-        if params.method == "1best":
-            logging.info("Use HLG decoding")
-            best_path = one_best_decoding(
-                lattice=lattice, use_double_scores=params.use_double_scores
-            )
-        elif params.method == "whole-lattice-rescoring":
-            logging.info("Use HLG decoding + LM rescoring")
-            best_path_dict = rescore_with_whole_lattice(
-                lattice=lattice,
-                G_with_epsilon_loops=G,
-                lm_scale_list=[params.ngram_lm_scale],
-            )
-            best_path = next(iter(best_path_dict.values()))
-        elif params.method == "attention-decoder":
-            logging.info("Use HLG + LM rescoring + attention decoder rescoring")
-            rescored_lattice = rescore_with_whole_lattice(
-                lattice=lattice, G_with_epsilon_loops=G, lm_scale_list=None
-            )
-            best_path_dict = rescore_with_attention_decoder(
-                lattice=rescored_lattice,
-                num_paths=params.num_paths,
-                model=model,
-                memory=memory,
-                memory_key_padding_mask=memory_key_padding_mask,
-                sos_id=params.sos_id,
-                eos_id=params.eos_id,
-                nbest_scale=params.nbest_scale,
-                ngram_lm_scale=params.ngram_lm_scale,
-                attention_scale=params.attention_decoder_scale,
-            )
-            best_path = next(iter(best_path_dict.values()))
-
-        hyps = get_texts(best_path)
-        word_sym_table = k2.SymbolTable.from_file(params.words_file)
-        hyps = [[word_sym_table[i] for i in ids] for ids in hyps]
-    else:
-        raise ValueError(f"Unsupported decoding method: {params.method}")
-
-    s = "\n"
-    for filename, hyp in zip(params.sound_files, hyps):
-        words = " ".join(hyp)
-        s += f"{filename}:\n{words}\n\n"
-    logging.info(s)
-
-    logging.info("Decoding Done")
-
-
-if __name__ == "__main__":
-    formatter = (
-        "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
-    )
-
-    logging.basicConfig(format=formatter, level=logging.INFO)
-    main()
--- a/egs/spgispeech/ASR/conformer_ctc/subsampling.py
+++ b/egs/spgispeech/ASR/conformer_ctc/subsampling.py
@ -1,161 +0,0 @@
-# Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import torch
-import torch.nn as nn
-
-
-class Conv2dSubsampling(nn.Module):
-    """Convolutional 2D subsampling (to 1/4 length).
-
-    Convert an input of shape (N, T, idim) to an output
-    with shape (N, T', odim), where
-    T' = ((T-1)//2 - 1)//2, which approximates T' == T//4
-
-    It is based on
-    https://github.com/espnet/espnet/blob/master/espnet/nets/pytorch_backend/transformer/subsampling.py  # noqa
-    """
-
-    def __init__(self, idim: int, odim: int) -> None:
-        """
-        Args:
-          idim:
-            Input dim. The input shape is (N, T, idim).
-            Caution: It requires: T >=7, idim >=7
-          odim:
-            Output dim. The output shape is (N, ((T-1)//2 - 1)//2, odim)
-        """
-        assert idim >= 7
-        super().__init__()
-        self.conv = nn.Sequential(
-            nn.Conv2d(
-                in_channels=1, out_channels=odim, kernel_size=3, stride=2
-            ),
-            nn.ReLU(),
-            nn.Conv2d(
-                in_channels=odim, out_channels=odim, kernel_size=3, stride=2
-            ),
-            nn.ReLU(),
-        )
-        self.out = nn.Linear(odim * (((idim - 1) // 2 - 1) // 2), odim)
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        """Subsample x.
-
-        Args:
-          x:
-            Its shape is (N, T, idim).
-
-        Returns:
-          Return a tensor of shape (N, ((T-1)//2 - 1)//2, odim)
-        """
-        # On entry, x is (N, T, idim)
-        x = x.unsqueeze(1)  # (N, T, idim) -> (N, 1, T, idim) i.e., (N, C, H, W)
-        x = self.conv(x)
-        # Now x is of shape (N, odim, ((T-1)//2 - 1)//2, ((idim-1)//2 - 1)//2)
-        b, c, t, f = x.size()
-        x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f))
-        # Now x is of shape (N, ((T-1)//2 - 1))//2, odim)
-        return x
-
-
-class VggSubsampling(nn.Module):
-    """Trying to follow the setup described in the following paper:
-    https://arxiv.org/pdf/1910.09799.pdf
-
-    This paper is not 100% explicit so I am guessing to some extent,
-    and trying to compare with other VGG implementations.
-
-    Convert an input of shape (N, T, idim) to an output
-    with shape (N, T', odim), where
-    T' = ((T-1)//2 - 1)//2, which approximates T' = T//4
-    """
-
-    def __init__(self, idim: int, odim: int) -> None:
-        """Construct a VggSubsampling object.
-
-        This uses 2 VGG blocks with 2 Conv2d layers each,
-        subsampling its input by a factor of 4 in the time dimensions.
-
-        Args:
-          idim:
-            Input dim. The input shape is (N, T, idim).
-            Caution: It requires: T >=7, idim >=7
-          odim:
-            Output dim. The output shape is (N, ((T-1)//2 - 1)//2, odim)
-        """
-        super().__init__()
-
-        cur_channels = 1
-        layers = []
-        block_dims = [32, 64]
-
-        # The decision to use padding=1 for the 1st convolution, then padding=0
-        # for the 2nd and for the max-pooling, and ceil_mode=True, was driven by
-        # a back-compatibility concern so that the number of frames at the
-        # output would be equal to:
-        #  (((T-1)//2)-1)//2.
-        # We can consider changing this by using padding=1 on the
-        # 2nd convolution, so the num-frames at the output would be T//4.
-        for block_dim in block_dims:
-            layers.append(
-                torch.nn.Conv2d(
-                    in_channels=cur_channels,
-                    out_channels=block_dim,
-                    kernel_size=3,
-                    padding=1,
-                    stride=1,
-                )
-            )
-            layers.append(torch.nn.ReLU())
-            layers.append(
-                torch.nn.Conv2d(
-                    in_channels=block_dim,
-                    out_channels=block_dim,
-                    kernel_size=3,
-                    padding=0,
-                    stride=1,
-                )
-            )
-            layers.append(
-                torch.nn.MaxPool2d(
-                    kernel_size=2, stride=2, padding=0, ceil_mode=True
-                )
-            )
-            cur_channels = block_dim
-
-        self.layers = nn.Sequential(*layers)
-
-        self.out = nn.Linear(
-            block_dims[-1] * (((idim - 1) // 2 - 1) // 2), odim
-        )
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        """Subsample x.
-
-        Args:
-          x:
-            Its shape is (N, T, idim).
-
-        Returns:
-          Return a tensor of shape (N, ((T-1)//2 - 1)//2, odim)
-        """
-        x = x.unsqueeze(1)
-        x = self.layers(x)
-        b, c, t, f = x.size()
-        x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f))
-        return x
--- a/egs/spgispeech/ASR/conformer_ctc/train.py
+++ b/egs/spgispeech/ASR/conformer_ctc/train.py
@ -1,780 +0,0 @@
-#!/usr/bin/env python3
-# Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang,
-#                                                  Wei Kang
-#                                                  Mingshuang Luo)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import argparse
-import logging
-from pathlib import Path
-from shutil import copyfile
-from typing import Optional, Tuple
-
-import k2
-import torch
-import torch.multiprocessing as mp
-import torch.nn as nn
-from asr_datamodule import SPGISpeechAsrDataModule
-from conformer import Conformer
-from lhotse.utils import fix_random_seed
-from torch import Tensor
-from torch.nn.parallel import DistributedDataParallel as DDP
-from torch.nn.utils import clip_grad_norm_
-from torch.utils.tensorboard import SummaryWriter
-from transformer import Noam
-
-from icefall.bpe_graph_compiler import BpeCtcTrainingGraphCompiler
-from icefall.checkpoint import load_checkpoint
-from icefall.checkpoint import save_checkpoint as save_checkpoint_impl
-from icefall.dist import cleanup_dist, setup_dist
-from icefall.env import get_env_info
-from icefall.graph_compiler import CtcTrainingGraphCompiler
-from icefall.lexicon import Lexicon
-from icefall.utils import (
-    AttributeDict,
-    MetricsTracker,
-    encode_supervisions,
-    setup_logger,
-    str2bool,
-)
-
-
-def get_parser():
-    parser = argparse.ArgumentParser(
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter
-    )
-
-    parser.add_argument(
-        "--world-size",
-        type=int,
-        default=1,
-        help="Number of GPUs for DDP training.",
-    )
-
-    parser.add_argument(
-        "--master-port",
-        type=int,
-        default=12354,
-        help="Master port to use for DDP training.",
-    )
-
-    parser.add_argument(
-        "--tensorboard",
-        type=str2bool,
-        default=True,
-        help="Should various information be logged in tensorboard.",
-    )
-
-    parser.add_argument(
-        "--num-epochs",
-        type=int,
-        default=20,
-        help="Number of epochs to train.",
-    )
-
-    parser.add_argument(
-        "--start-epoch",
-        type=int,
-        default=0,
-        help="""Resume training from from this epoch.
-        If it is positive, it will load checkpoint from
-        conformer_ctc/exp/epoch-{start_epoch-1}.pt
-        """,
-    )
-
-    parser.add_argument(
-        "--exp-dir",
-        type=str,
-        default="conformer_ctc/exp",
-        help="""The experiment dir.
-        It specifies the directory where all training related
-        files, e.g., checkpoints, log, etc, are saved
-        """,
-    )
-
-    parser.add_argument(
-        "--lang-dir",
-        type=str,
-        default="data/lang_bpe_500",
-        help="""The lang dir
-        It contains language related input files such as
-        "lexicon.txt"
-        """,
-    )
-
-    parser.add_argument(
-        "--att-rate",
-        type=float,
-        default=0.8,
-        help="""The attention rate.
-        The total loss is (1 -  att_rate) * ctc_loss + att_rate * att_loss
-        """,
-    )
-
-    parser.add_argument(
-        "--num-decoder-layers",
-        type=int,
-        default=6,
-        help="""Number of decoder layer of transformer decoder.
-        Setting this to 0 will not create the decoder at all (pure CTC model)
-        """,
-    )
-
-    parser.add_argument(
-        "--lr-factor",
-        type=float,
-        default=5.0,
-        help="The lr_factor for Noam optimizer",
-    )
-
-    parser.add_argument(
-        "--seed",
-        type=int,
-        default=42,
-        help="The seed for random generators intended for reproducibility",
-    )
-
-    return parser
-
-
-def get_params() -> AttributeDict:
-    """Return a dict containing training parameters.
-
-    All training related parameters that are not passed from the commandline
-    are saved in the variable `params`.
-
-    Commandline options are merged into `params` after they are parsed, so
-    you can also access them via `params`.
-
-    Explanation of options saved in `params`:
-
-        - best_train_loss: Best training loss so far. It is used to select
-                           the model that has the lowest training loss. It is
-                           updated during the training.
-
-        - best_valid_loss: Best validation loss so far. It is used to select
-                           the model that has the lowest validation loss. It is
-                           updated during the training.
-
-        - best_train_epoch: It is the epoch that has the best training loss.
-
-        - best_valid_epoch: It is the epoch that has the best validation loss.
-
-        - batch_idx_train: Used to writing statistics to tensorboard. It
-                           contains number of batches trained so far across
-                           epochs.
-
-        - log_interval:  Print training loss if batch_idx % log_interval` is 0
-
-        - reset_interval: Reset statistics if batch_idx % reset_interval is 0
-
-        - valid_interval:  Run validation if batch_idx % valid_interval is 0
-
-        - feature_dim: The model input dim. It has to match the one used
-                       in computing features.
-
-        - subsampling_factor:  The subsampling factor for the model.
-
-        - use_feat_batchnorm: Normalization for the input features, can be a
-                              boolean indicating whether to do batch
-                              normalization, or a float which means just scaling
-                              the input features with this float value.
-                              If given a float value, we will remove batchnorm
-                              layer in `ConvolutionModule` as well.
-
-        - attention_dim: Hidden dim for multi-head attention model.
-
-        - head: Number of heads of multi-head attention model.
-
-        - num_decoder_layers: Number of decoder layer of transformer decoder.
-
-        - beam_size: It is used in k2.ctc_loss
-
-        - reduction: It is used in k2.ctc_loss
-
-        - use_double_scores: It is used in k2.ctc_loss
-
-        - weight_decay:  The weight_decay for the optimizer.
-
-        - warm_step: The warm_step for Noam optimizer.
-    """
-    params = AttributeDict(
-        {
-            "best_train_loss": float("inf"),
-            "best_valid_loss": float("inf"),
-            "best_train_epoch": -1,
-            "best_valid_epoch": -1,
-            "batch_idx_train": 0,
-            "log_interval": 100,
-            "reset_interval": 500,
-            "valid_interval": 25000,
-            # parameters for conformer
-            "feature_dim": 40,
-            "subsampling_factor": 4,
-            "use_feat_batchnorm": True,
-            "attention_dim": 512,
-            "nhead": 8,
-            # parameters for loss
-            "beam_size": 10,
-            "reduction": "sum",
-            "use_double_scores": True,
-            # parameters for Noam
-            "weight_decay": 1e-6,
-            "warm_step": 80000,
-            "env_info": get_env_info(),
-        }
-    )
-
-    return params
-
-
-def load_checkpoint_if_available(
-    params: AttributeDict,
-    model: nn.Module,
-    optimizer: Optional[torch.optim.Optimizer] = None,
-    scheduler: Optional[torch.optim.lr_scheduler._LRScheduler] = None,
-) -> None:
-    """Load checkpoint from file.
-
-    If params.start_epoch is positive, it will load the checkpoint from
-    `params.start_epoch - 1`. Otherwise, this function does nothing.
-
-    Apart from loading state dict for `model`, `optimizer` and `scheduler`,
-    it also updates `best_train_epoch`, `best_train_loss`, `best_valid_epoch`,
-    and `best_valid_loss` in `params`.
-
-    Args:
-      params:
-        The return value of :func:`get_params`.
-      model:
-        The training model.
-      optimizer:
-        The optimizer that we are using.
-      scheduler:
-        The learning rate scheduler we are using.
-    Returns:
-      Return None.
-    """
-    if params.start_epoch <= 0:
-        return
-
-    filename = params.exp_dir / f"epoch-{params.start_epoch-1}.pt"
-    saved_params = load_checkpoint(
-        filename,
-        model=model,
-        optimizer=optimizer,
-        scheduler=scheduler,
-    )
-
-    keys = [
-        "best_train_epoch",
-        "best_valid_epoch",
-        "batch_idx_train",
-        "best_train_loss",
-        "best_valid_loss",
-    ]
-    for k in keys:
-        params[k] = saved_params[k]
-
-    return saved_params
-
-
-def save_checkpoint(
-    params: AttributeDict,
-    model: nn.Module,
-    optimizer: Optional[torch.optim.Optimizer] = None,
-    scheduler: Optional[torch.optim.lr_scheduler._LRScheduler] = None,
-    rank: int = 0,
-) -> None:
-    """Save model, optimizer, scheduler and training stats to file.
-
-    Args:
-      params:
-        It is returned by :func:`get_params`.
-      model:
-        The training model.
-    """
-    if rank != 0:
-        return
-    filename = params.exp_dir / f"epoch-{params.cur_epoch}.pt"
-    save_checkpoint_impl(
-        filename=filename,
-        model=model,
-        params=params,
-        optimizer=optimizer,
-        scheduler=scheduler,
-        rank=rank,
-    )
-
-    if params.best_train_epoch == params.cur_epoch:
-        best_train_filename = params.exp_dir / "best-train-loss.pt"
-        copyfile(src=filename, dst=best_train_filename)
-
-    if params.best_valid_epoch == params.cur_epoch:
-        best_valid_filename = params.exp_dir / "best-valid-loss.pt"
-        copyfile(src=filename, dst=best_valid_filename)
-
-
-def compute_loss(
-    params: AttributeDict,
-    model: nn.Module,
-    batch: dict,
-    graph_compiler: BpeCtcTrainingGraphCompiler,
-    is_training: bool,
-) -> Tuple[Tensor, MetricsTracker]:
-    """
-    Compute CTC loss given the model and its inputs.
-
-    Args:
-      params:
-        Parameters for training. See :func:`get_params`.
-      model:
-        The model for training. It is an instance of Conformer in our case.
-      batch:
-        A batch of data. See `lhotse.dataset.K2SpeechRecognitionDataset()`
-        for the content in it.
-      graph_compiler:
-        It is used to build a decoding graph from a ctc topo and training
-        transcript. The training transcript is contained in the given `batch`,
-        while the ctc topo is built when this compiler is instantiated.
-      is_training:
-        True for training. False for validation. When it is True, this
-        function enables autograd during computation; when it is False, it
-        disables autograd.
-    """
-    device = graph_compiler.device
-    feature = batch["inputs"]
-    # at entry, feature is (N, T, C)
-    assert feature.ndim == 3
-    feature = feature.to(device)
-
-    supervisions = batch["supervisions"]
-    with torch.set_grad_enabled(is_training):
-        nnet_output, encoder_memory, memory_mask = model(feature, supervisions)
-        # nnet_output is (N, T, C)
-
-    # NOTE: We need `encode_supervisions` to sort sequences with
-    # different duration in decreasing order, required by
-    # `k2.intersect_dense` called in `k2.ctc_loss`
-    supervision_segments, texts = encode_supervisions(
-        supervisions, subsampling_factor=params.subsampling_factor
-    )
-
-    if isinstance(graph_compiler, BpeCtcTrainingGraphCompiler):
-        # Works with a BPE model
-        token_ids = graph_compiler.texts_to_ids(texts)
-        decoding_graph = graph_compiler.compile(token_ids)
-    elif isinstance(graph_compiler, CtcTrainingGraphCompiler):
-        # Works with a phone lexicon
-        decoding_graph = graph_compiler.compile(texts)
-    else:
-        raise ValueError(f"Unsupported type of graph compiler: {type(graph_compiler)}")
-
-    dense_fsa_vec = k2.DenseFsaVec(
-        nnet_output,
-        supervision_segments,
-        allow_truncate=params.subsampling_factor - 1,
-    )
-
-    ctc_loss = k2.ctc_loss(
-        decoding_graph=decoding_graph,
-        dense_fsa_vec=dense_fsa_vec,
-        output_beam=params.beam_size,
-        reduction=params.reduction,
-        use_double_scores=params.use_double_scores,
-    )
-
-    if params.att_rate != 0.0:
-        with torch.set_grad_enabled(is_training):
-            mmodel = model.module if hasattr(model, "module") else model
-            # Note: We need to generate an unsorted version of token_ids
-            # `encode_supervisions()` called above sorts text, but
-            # encoder_memory and memory_mask are not sorted, so we
-            # use an unsorted version `supervisions["text"]` to regenerate
-            # the token_ids
-            #
-            # See https://github.com/k2-fsa/icefall/issues/97
-            # for more details
-            unsorted_token_ids = graph_compiler.texts_to_ids(supervisions["text"])
-            att_loss = mmodel.decoder_forward(
-                encoder_memory,
-                memory_mask,
-                token_ids=unsorted_token_ids,
-                sos_id=graph_compiler.sos_id,
-                eos_id=graph_compiler.eos_id,
-            )
-        loss = (1.0 - params.att_rate) * ctc_loss + params.att_rate * att_loss
-    else:
-        loss = ctc_loss
-        att_loss = torch.tensor([0])
-
-    assert loss.requires_grad == is_training
-
-    info = MetricsTracker()
-    info["frames"] = supervision_segments[:, 2].sum().item()
-    info["ctc_loss"] = ctc_loss.detach().cpu().item()
-    if params.att_rate != 0.0:
-        info["att_loss"] = att_loss.detach().cpu().item()
-
-    info["loss"] = loss.detach().cpu().item()
-
-    return loss, info
-
-
-def compute_validation_loss(
-    params: AttributeDict,
-    model: nn.Module,
-    graph_compiler: BpeCtcTrainingGraphCompiler,
-    valid_dl: torch.utils.data.DataLoader,
-    world_size: int = 1,
-) -> MetricsTracker:
-    """Run the validation process."""
-    model.eval()
-
-    tot_loss = MetricsTracker()
-
-    for batch_idx, batch in enumerate(valid_dl):
-        loss, loss_info = compute_loss(
-            params=params,
-            model=model,
-            batch=batch,
-            graph_compiler=graph_compiler,
-            is_training=False,
-        )
-        assert loss.requires_grad is False
-        tot_loss = tot_loss + loss_info
-
-    if world_size > 1:
-        tot_loss.reduce(loss.device)
-
-    loss_value = tot_loss["loss"] / tot_loss["frames"]
-    if loss_value < params.best_valid_loss:
-        params.best_valid_epoch = params.cur_epoch
-        params.best_valid_loss = loss_value
-
-    return tot_loss
-
-
-def train_one_epoch(
-    params: AttributeDict,
-    model: nn.Module,
-    optimizer: torch.optim.Optimizer,
-    graph_compiler: BpeCtcTrainingGraphCompiler,
-    train_dl: torch.utils.data.DataLoader,
-    valid_dl: torch.utils.data.DataLoader,
-    tb_writer: Optional[SummaryWriter] = None,
-    world_size: int = 1,
-) -> None:
-    """Train the model for one epoch.
-
-    The training loss from the mean of all frames is saved in
-    `params.train_loss`. It runs the validation process every
-    `params.valid_interval` batches.
-
-    Args:
-      params:
-        It is returned by :func:`get_params`.
-      model:
-        The model for training.
-      optimizer:
-        The optimizer we are using.
-      graph_compiler:
-        It is used to convert transcripts to FSAs.
-      train_dl:
-        Dataloader for the training dataset.
-      valid_dl:
-        Dataloader for the validation dataset.
-      tb_writer:
-        Writer to write log messages to tensorboard.
-      world_size:
-        Number of nodes in DDP training. If it is 1, DDP is disabled.
-    """
-    model.train()
-
-    tot_loss = MetricsTracker()
-
-    for batch_idx, batch in enumerate(train_dl):
-        params.batch_idx_train += 1
-        batch_size = len(batch["supervisions"]["text"])
-
-        loss, loss_info = compute_loss(
-            params=params,
-            model=model,
-            batch=batch,
-            graph_compiler=graph_compiler,
-            is_training=True,
-        )
-        # summary stats
-        tot_loss = (tot_loss * (1 - 1 / params.reset_interval)) + loss_info
-
-        # NOTE: We use reduction==sum and loss is computed over utterances
-        # in the batch and there is no normalization to it so far.
-
-        optimizer.zero_grad()
-        loss.backward()
-        clip_grad_norm_(model.parameters(), 5.0, 2.0)
-        optimizer.step()
-
-        if batch_idx % params.log_interval == 0:
-            logging.info(
-                f"Epoch {params.cur_epoch}, "
-                f"batch {batch_idx}, loss[{loss_info}], "
-                f"tot_loss[{tot_loss}], batch size: {batch_size}"
-            )
-
-        if batch_idx % params.log_interval == 0:
-
-            if tb_writer is not None:
-                loss_info.write_summary(
-                    tb_writer, "train/current_", params.batch_idx_train
-                )
-                tot_loss.write_summary(tb_writer, "train/tot_", params.batch_idx_train)
-
-        if batch_idx > 0 and batch_idx % params.valid_interval == 0:
-            logging.info("Computing validation loss")
-            valid_info = compute_validation_loss(
-                params=params,
-                model=model,
-                graph_compiler=graph_compiler,
-                valid_dl=valid_dl,
-                world_size=world_size,
-            )
-            model.train()
-            logging.info(f"Epoch {params.cur_epoch}, validation: {valid_info}")
-            if tb_writer is not None:
-                valid_info.write_summary(
-                    tb_writer, "train/valid_", params.batch_idx_train
-                )
-
-    loss_value = tot_loss["loss"] / tot_loss["frames"]
-    params.train_loss = loss_value
-    if params.train_loss < params.best_train_loss:
-        params.best_train_epoch = params.cur_epoch
-        params.best_train_loss = params.train_loss
-
-
-def run(rank, world_size, args):
-    """
-    Args:
-      rank:
-        It is a value between 0 and `world_size-1`, which is
-        passed automatically by `mp.spawn()` in :func:`main`.
-        The node with rank 0 is responsible for saving checkpoint.
-      world_size:
-        Number of GPUs for DDP training.
-      args:
-        The return value of get_parser().parse_args()
-    """
-    params = get_params()
-    params.update(vars(args))
-
-    fix_random_seed(params.seed)
-    if world_size > 1:
-        setup_dist(rank, world_size, params.master_port)
-
-    setup_logger(f"{params.exp_dir}/log/log-train")
-    logging.info("Training started")
-    logging.info(params)
-
-    if args.tensorboard and rank == 0:
-        tb_writer = SummaryWriter(log_dir=f"{params.exp_dir}/tensorboard")
-    else:
-        tb_writer = None
-
-    lexicon = Lexicon(params.lang_dir)
-    max_token_id = max(lexicon.tokens)
-    num_classes = max_token_id + 1  # +1 for the blank
-
-    device = torch.device("cpu")
-    if torch.cuda.is_available():
-        device = torch.device("cuda", rank)
-
-    if "lang_bpe" in str(params.lang_dir):
-        graph_compiler = BpeCtcTrainingGraphCompiler(
-            params.lang_dir,
-            device=device,
-            sos_token="<sos/eos>",
-            eos_token="<sos/eos>",
-        )
-    elif "lang_phone" in str(params.lang_dir):
-        assert params.att_rate == 0, (
-            "Attention decoder training does not support phone lang dirs "
-            "at this time due to a missing <sos/eos> symbol. Set --att-rate=0 "
-            "for pure CTC training when using a phone-based lang dir."
-        )
-        assert params.num_decoder_layers == 0, (
-            "Attention decoder training does not support phone lang dirs "
-            "at this time due to a missing <sos/eos> symbol. "
-            "Set --num-decoder-layers=0 for pure CTC training when using "
-            "a phone-based lang dir."
-        )
-        graph_compiler = CtcTrainingGraphCompiler(
-            lexicon,
-            device=device,
-        )
-        # Manually add the sos/eos ID with their default values
-        # from the BPE recipe which we're adapting here.
-        graph_compiler.sos_id = 1
-        graph_compiler.eos_id = 1
-    else:
-        raise ValueError(
-            f"Unsupported type of lang dir (we expected it to have "
-            f"'lang_bpe' or 'lang_phone' in its name): {params.lang_dir}"
-        )
-
-    logging.info("About to create model")
-    model = Conformer(
-        num_features=params.feature_dim,
-        nhead=params.nhead,
-        d_model=params.attention_dim,
-        num_classes=num_classes,
-        subsampling_factor=params.subsampling_factor,
-        num_decoder_layers=params.num_decoder_layers,
-        vgg_frontend=False,
-        use_feat_batchnorm=params.use_feat_batchnorm,
-    )
-
-    checkpoints = load_checkpoint_if_available(params=params, model=model)
-
-    model.to(device)
-    if world_size > 1:
-        model = DDP(model, device_ids=[rank])
-
-    optimizer = Noam(
-        model.parameters(),
-        model_size=params.attention_dim,
-        factor=params.lr_factor,
-        warm_step=params.warm_step,
-        weight_decay=params.weight_decay,
-    )
-
-    if checkpoints:
-        optimizer.load_state_dict(checkpoints["optimizer"])
-
-    spgispeech = SPGISpeechAsrDataModule(args)
-
-    train_cuts = spgispeech.train_cuts()
-    train_dl = spgispeech.train_dataloaders(train_cuts)
-
-    valid_cuts = spgispeech.dev_cuts()
-    valid_dl = spgispeech.valid_dataloaders(valid_cuts)
-
-    scan_pessimistic_batches_for_oom(
-        model=model,
-        train_dl=train_dl,
-        optimizer=optimizer,
-        graph_compiler=graph_compiler,
-        params=params,
-    )
-
-    for epoch in range(params.start_epoch, params.num_epochs):
-        fix_random_seed(params.seed + epoch)
-        train_dl.sampler.set_epoch(epoch)
-
-        cur_lr = optimizer._rate
-        if tb_writer is not None:
-            tb_writer.add_scalar("train/learning_rate", cur_lr, params.batch_idx_train)
-            tb_writer.add_scalar("train/epoch", epoch, params.batch_idx_train)
-
-        if rank == 0:
-            logging.info("epoch {}, learning rate {}".format(epoch, cur_lr))
-
-        params.cur_epoch = epoch
-
-        train_one_epoch(
-            params=params,
-            model=model,
-            optimizer=optimizer,
-            graph_compiler=graph_compiler,
-            train_dl=train_dl,
-            valid_dl=valid_dl,
-            tb_writer=tb_writer,
-            world_size=world_size,
-        )
-
-        save_checkpoint(
-            params=params,
-            model=model,
-            optimizer=optimizer,
-            rank=rank,
-        )
-
-    logging.info("Done!")
-
-    if world_size > 1:
-        torch.distributed.barrier()
-        cleanup_dist()
-
-
-def scan_pessimistic_batches_for_oom(
-    model: nn.Module,
-    train_dl: torch.utils.data.DataLoader,
-    optimizer: torch.optim.Optimizer,
-    graph_compiler: BpeCtcTrainingGraphCompiler,
-    params: AttributeDict,
-):
-    from lhotse.dataset import find_pessimistic_batches
-
-    logging.info(
-        "Sanity check -- see if any of the batches in epoch 0 would cause OOM."
-    )
-    batches, crit_values = find_pessimistic_batches(train_dl.sampler)
-    for criterion, cuts in batches.items():
-        batch = train_dl.dataset[cuts]
-        try:
-            optimizer.zero_grad()
-            loss, _ = compute_loss(
-                params=params,
-                model=model,
-                batch=batch,
-                graph_compiler=graph_compiler,
-                is_training=True,
-            )
-            loss.backward()
-            clip_grad_norm_(model.parameters(), 5.0, 2.0)
-            optimizer.step()
-        except RuntimeError as e:
-            if "CUDA out of memory" in str(e):
-                logging.error(
-                    "Your GPU ran out of memory with the current "
-                    "max_duration setting. We recommend decreasing "
-                    "max_duration and trying again.\n"
-                    f"Failing criterion: {criterion} "
-                    f"(={crit_values[criterion]}) ..."
-                )
-            raise
-
-
-def main():
-    parser = get_parser()
-    SPGISpeechAsrDataModule.add_arguments(parser)
-    args = parser.parse_args()
-    args.exp_dir = Path(args.exp_dir)
-    args.lang_dir = Path(args.lang_dir)
-
-    world_size = args.world_size
-    assert world_size >= 1
-    if world_size > 1:
-        mp.spawn(run, args=(world_size, args), nprocs=world_size, join=True)
-    else:
-        run(rank=0, world_size=1, args=args)
-
-
-torch.set_num_threads(1)
-torch.set_num_interop_threads(1)
-
-if __name__ == "__main__":
-    main()
--- a/egs/spgispeech/ASR/conformer_ctc/transformer.py
+++ b/egs/spgispeech/ASR/conformer_ctc/transformer.py
@ -1,928 +0,0 @@
-# Copyright    2021 University of Chinese Academy of Sciences (author: Han Zhu)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import math
-from typing import Dict, List, Optional, Tuple, Union
-
-import torch
-import torch.nn as nn
-from label_smoothing import LabelSmoothingLoss
-from subsampling import Conv2dSubsampling, VggSubsampling
-from torch.nn.utils.rnn import pad_sequence
-
-# Note: TorchScript requires Dict/List/etc. to be fully typed.
-Supervisions = Dict[str, torch.Tensor]
-
-
-class Transformer(nn.Module):
-    def __init__(
-        self,
-        num_features: int,
-        num_classes: int,
-        subsampling_factor: int = 4,
-        d_model: int = 256,
-        nhead: int = 4,
-        dim_feedforward: int = 2048,
-        num_encoder_layers: int = 12,
-        num_decoder_layers: int = 6,
-        dropout: float = 0.1,
-        normalize_before: bool = True,
-        vgg_frontend: bool = False,
-        use_feat_batchnorm: Union[float, bool] = 0.1,
-    ) -> None:
-        """
-        Args:
-          num_features:
-            The input dimension of the model.
-          num_classes:
-            The output dimension of the model.
-          subsampling_factor:
-            Number of output frames is num_in_frames // subsampling_factor.
-            Currently, subsampling_factor MUST be 4.
-          d_model:
-            Attention dimension.
-          nhead:
-            Number of heads in multi-head attention.
-            Must satisfy d_model // nhead == 0.
-          dim_feedforward:
-            The output dimension of the feedforward layers in encoder/decoder.
-          num_encoder_layers:
-            Number of encoder layers.
-          num_decoder_layers:
-            Number of decoder layers.
-          dropout:
-            Dropout in encoder/decoder.
-          normalize_before:
-            If True, use pre-layer norm; False to use post-layer norm.
-          vgg_frontend:
-            True to use vgg style frontend for subsampling.
-          use_feat_batchnorm:
-            True to use batchnorm for the input layer.
-            Float value to scale the input layer.
-            False to do nothing.
-        """
-        super().__init__()
-        self.use_feat_batchnorm = use_feat_batchnorm
-        assert isinstance(use_feat_batchnorm, (float, bool))
-        if isinstance(use_feat_batchnorm, bool) and use_feat_batchnorm:
-            self.feat_batchnorm = nn.BatchNorm1d(num_features)
-
-        self.num_features = num_features
-        self.num_classes = num_classes
-        self.subsampling_factor = subsampling_factor
-        if subsampling_factor != 4:
-            raise NotImplementedError("Support only 'subsampling_factor=4'.")
-
-        # self.encoder_embed converts the input of shape (N, T, num_classes)
-        # to the shape (N, T//subsampling_factor, d_model).
-        # That is, it does two things simultaneously:
-        #   (1) subsampling: T -> T//subsampling_factor
-        #   (2) embedding: num_classes -> d_model
-        if vgg_frontend:
-            self.encoder_embed = VggSubsampling(num_features, d_model)
-        else:
-            self.encoder_embed = Conv2dSubsampling(num_features, d_model)
-
-        self.encoder_pos = PositionalEncoding(d_model, dropout)
-
-        encoder_layer = TransformerEncoderLayer(
-            d_model=d_model,
-            nhead=nhead,
-            dim_feedforward=dim_feedforward,
-            dropout=dropout,
-            normalize_before=normalize_before,
-        )
-
-        if normalize_before:
-            encoder_norm = nn.LayerNorm(d_model)
-        else:
-            encoder_norm = None
-
-        self.encoder = nn.TransformerEncoder(
-            encoder_layer=encoder_layer,
-            num_layers=num_encoder_layers,
-            norm=encoder_norm,
-        )
-
-        # TODO(fangjun): remove dropout
-        self.encoder_output_layer = nn.Sequential(
-            nn.Dropout(p=dropout), nn.Linear(d_model, num_classes)
-        )
-
-        if num_decoder_layers > 0:
-            self.decoder_num_class = (
-                self.num_classes
-            )  # bpe model already has sos/eos symbol
-
-            self.decoder_embed = nn.Embedding(
-                num_embeddings=self.decoder_num_class, embedding_dim=d_model
-            )
-            self.decoder_pos = PositionalEncoding(d_model, dropout)
-
-            decoder_layer = TransformerDecoderLayer(
-                d_model=d_model,
-                nhead=nhead,
-                dim_feedforward=dim_feedforward,
-                dropout=dropout,
-                normalize_before=normalize_before,
-            )
-
-            if normalize_before:
-                decoder_norm = nn.LayerNorm(d_model)
-            else:
-                decoder_norm = None
-
-            self.decoder = nn.TransformerDecoder(
-                decoder_layer=decoder_layer,
-                num_layers=num_decoder_layers,
-                norm=decoder_norm,
-            )
-
-            self.decoder_output_layer = torch.nn.Linear(d_model, self.decoder_num_class)
-
-            self.decoder_criterion = LabelSmoothingLoss()
-        else:
-            self.decoder_criterion = None
-
-    def forward(
-        self, x: torch.Tensor, supervision: Optional[Supervisions] = None
-    ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
-        """
-        Args:
-          x:
-            The input tensor. Its shape is (N, T, C).
-          supervision:
-            Supervision in lhotse format.
-            See https://github.com/lhotse-speech/lhotse/blob/master/lhotse/dataset/speech_recognition.py#L32  # noqa
-            (CAUTION: It contains length information, i.e., start and number of
-             frames, before subsampling)
-
-        Returns:
-          Return a tuple containing 3 tensors:
-            - CTC output for ctc decoding. Its shape is (N, T, C)
-            - Encoder output with shape (T, N, C). It can be used as key and
-              value for the decoder.
-            - Encoder output padding mask. It can be used as
-              memory_key_padding_mask for the decoder. Its shape is (N, T).
-              It is None if `supervision` is None.
-        """
-        if isinstance(self.use_feat_batchnorm, bool) and self.use_feat_batchnorm:
-            x = x.permute(0, 2, 1)  # (N, T, C) -> (N, C, T)
-            x = self.feat_batchnorm(x)
-            x = x.permute(0, 2, 1)  # (N, C, T) -> (N, T, C)
-        if isinstance(self.use_feat_batchnorm, float):
-            x *= self.use_feat_batchnorm
-        encoder_memory, memory_key_padding_mask = self.run_encoder(x, supervision)
-        x = self.ctc_output(encoder_memory)
-        return x, encoder_memory, memory_key_padding_mask
-
-    def run_encoder(
-        self, x: torch.Tensor, supervisions: Optional[Supervisions] = None
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
-        """Run the transformer encoder.
-
-        Args:
-          x:
-            The model input. Its shape is (N, T, C).
-          supervisions:
-            Supervision in lhotse format.
-            See https://github.com/lhotse-speech/lhotse/blob/master/lhotse/dataset/speech_recognition.py#L32  # noqa
-            CAUTION: It contains length information, i.e., start and number of
-            frames, before subsampling
-            It is read directly from the batch, without any sorting. It is used
-            to compute the encoder padding mask, which is used as memory key
-            padding mask for the decoder.
-        Returns:
-          Return a tuple with two tensors:
-            - The encoder output, with shape (T, N, C)
-            - encoder padding mask, with shape (N, T).
-              The mask is None if `supervisions` is None.
-              It is used as memory key padding mask in the decoder.
-        """
-        x = self.encoder_embed(x)
-        x = self.encoder_pos(x)
-        x = x.permute(1, 0, 2)  # (N, T, C) -> (T, N, C)
-        mask = encoder_padding_mask(x.size(0), supervisions)
-        mask = mask.to(x.device) if mask is not None else None
-        x = self.encoder(x, src_key_padding_mask=mask)  # (T, N, C)
-
-        return x, mask
-
-    def ctc_output(self, x: torch.Tensor) -> torch.Tensor:
-        """
-        Args:
-          x:
-            The output tensor from the transformer encoder.
-            Its shape is (T, N, C)
-
-        Returns:
-          Return a tensor that can be used for CTC decoding.
-          Its shape is (N, T, C)
-        """
-        x = self.encoder_output_layer(x)
-        x = x.permute(1, 0, 2)  # (T, N, C) ->(N, T, C)
-        x = nn.functional.log_softmax(x, dim=-1)  # (N, T, C)
-        return x
-
-    @torch.jit.export
-    def decoder_forward(
-        self,
-        memory: torch.Tensor,
-        memory_key_padding_mask: torch.Tensor,
-        token_ids: List[List[int]],
-        sos_id: int,
-        eos_id: int,
-    ) -> torch.Tensor:
-        """
-        Args:
-          memory:
-            It's the output of the encoder with shape (T, N, C)
-          memory_key_padding_mask:
-            The padding mask from the encoder.
-          token_ids:
-            A list-of-list IDs. Each sublist contains IDs for an utterance.
-            The IDs can be either phone IDs or word piece IDs.
-          sos_id:
-            sos token id
-          eos_id:
-            eos token id
-
-        Returns:
-            A scalar, the **sum** of label smoothing loss over utterances
-            in the batch without any normalization.
-        """
-        ys_in = add_sos(token_ids, sos_id=sos_id)
-        ys_in = [torch.tensor(y) for y in ys_in]
-        ys_in_pad = pad_sequence(ys_in, batch_first=True, padding_value=float(eos_id))
-
-        ys_out = add_eos(token_ids, eos_id=eos_id)
-        ys_out = [torch.tensor(y) for y in ys_out]
-        ys_out_pad = pad_sequence(ys_out, batch_first=True, padding_value=float(-1))
-
-        device = memory.device
-        ys_in_pad = ys_in_pad.to(device)
-        ys_out_pad = ys_out_pad.to(device)
-
-        tgt_mask = generate_square_subsequent_mask(ys_in_pad.shape[-1]).to(device)
-
-        tgt_key_padding_mask = decoder_padding_mask(ys_in_pad, ignore_id=eos_id)
-        # TODO: Use length information to create the decoder padding mask
-        # We set the first column to False since the first column in ys_in_pad
-        # contains sos_id, which is the same as eos_id in our current setting.
-        tgt_key_padding_mask[:, 0] = False
-
-        tgt = self.decoder_embed(ys_in_pad)  # (N, T) -> (N, T, C)
-        tgt = self.decoder_pos(tgt)
-        tgt = tgt.permute(1, 0, 2)  # (N, T, C) -> (T, N, C)
-        pred_pad = self.decoder(
-            tgt=tgt,
-            memory=memory,
-            tgt_mask=tgt_mask,
-            tgt_key_padding_mask=tgt_key_padding_mask,
-            memory_key_padding_mask=memory_key_padding_mask,
-        )  # (T, N, C)
-        pred_pad = pred_pad.permute(1, 0, 2)  # (T, N, C) -> (N, T, C)
-        pred_pad = self.decoder_output_layer(pred_pad)  # (N, T, C)
-
-        decoder_loss = self.decoder_criterion(pred_pad, ys_out_pad)
-
-        return decoder_loss
-
-    @torch.jit.export
-    def decoder_nll(
-        self,
-        memory: torch.Tensor,
-        memory_key_padding_mask: torch.Tensor,
-        token_ids: List[torch.Tensor],
-        sos_id: int,
-        eos_id: int,
-    ) -> torch.Tensor:
-        """
-        Args:
-          memory:
-            It's the output of the encoder with shape (T, N, C)
-          memory_key_padding_mask:
-            The padding mask from the encoder.
-          token_ids:
-            A list-of-list IDs (e.g., word piece IDs).
-            Each sublist represents an utterance.
-          sos_id:
-            The token ID for SOS.
-          eos_id:
-            The token ID for EOS.
-        Returns:
-            A 2-D tensor of shape (len(token_ids), max_token_length)
-            representing the cross entropy loss (i.e., negative log-likelihood).
-        """
-        # The common part between this function and decoder_forward could be
-        # extracted as a separate function.
-        if isinstance(token_ids[0], torch.Tensor):
-            # This branch is executed by torchscript in C++.
-            # See https://github.com/k2-fsa/k2/pull/870
-            # https://github.com/k2-fsa/k2/blob/3c1c18400060415b141ccea0115fd4bf0ad6234e/k2/torch/bin/attention_rescore.cu#L286
-            token_ids = [tolist(t) for t in token_ids]
-
-        ys_in = add_sos(token_ids, sos_id=sos_id)
-        ys_in = [torch.tensor(y) for y in ys_in]
-        ys_in_pad = pad_sequence(ys_in, batch_first=True, padding_value=float(eos_id))
-
-        ys_out = add_eos(token_ids, eos_id=eos_id)
-        ys_out = [torch.tensor(y) for y in ys_out]
-        ys_out_pad = pad_sequence(ys_out, batch_first=True, padding_value=float(-1))
-
-        device = memory.device
-        ys_in_pad = ys_in_pad.to(device, dtype=torch.int64)
-        ys_out_pad = ys_out_pad.to(device, dtype=torch.int64)
-
-        tgt_mask = generate_square_subsequent_mask(ys_in_pad.shape[-1]).to(device)
-
-        tgt_key_padding_mask = decoder_padding_mask(ys_in_pad, ignore_id=eos_id)
-        # TODO: Use length information to create the decoder padding mask
-        # We set the first column to False since the first column in ys_in_pad
-        # contains sos_id, which is the same as eos_id in our current setting.
-        tgt_key_padding_mask[:, 0] = False
-
-        tgt = self.decoder_embed(ys_in_pad)  # (B, T) -> (B, T, F)
-        tgt = self.decoder_pos(tgt)
-        tgt = tgt.permute(1, 0, 2)  # (B, T, F) -> (T, B, F)
-        pred_pad = self.decoder(
-            tgt=tgt,
-            memory=memory,
-            tgt_mask=tgt_mask,
-            tgt_key_padding_mask=tgt_key_padding_mask,
-            memory_key_padding_mask=memory_key_padding_mask,
-        )  # (T, B, F)
-        pred_pad = pred_pad.permute(1, 0, 2)  # (T, B, F) -> (B, T, F)
-        pred_pad = self.decoder_output_layer(pred_pad)  # (B, T, F)
-        # nll: negative log-likelihood
-        nll = torch.nn.functional.cross_entropy(
-            pred_pad.view(-1, self.decoder_num_class),
-            ys_out_pad.view(-1),
-            ignore_index=-1,
-            reduction="none",
-        )
-
-        nll = nll.view(pred_pad.shape[0], -1)
-
-        return nll
-
-
-class TransformerEncoderLayer(nn.Module):
-    """
-    Modified from torch.nn.TransformerEncoderLayer.
-    Add support of normalize_before,
-    i.e., use layer_norm before the first block.
-
-    Args:
-      d_model:
-        the number of expected features in the input (required).
-      nhead:
-        the number of heads in the multiheadattention models (required).
-      dim_feedforward:
-        the dimension of the feedforward network model (default=2048).
-      dropout:
-        the dropout value (default=0.1).
-      activation:
-        the activation function of intermediate layer, relu or
-        gelu (default=relu).
-      normalize_before:
-        whether to use layer_norm before the first block.
-
-    Examples::
-        >>> encoder_layer = TransformerEncoderLayer(d_model=512, nhead=8)
-        >>> src = torch.rand(10, 32, 512)
-        >>> out = encoder_layer(src)
-    """
-
-    def __init__(
-        self,
-        d_model: int,
-        nhead: int,
-        dim_feedforward: int = 2048,
-        dropout: float = 0.1,
-        activation: str = "relu",
-        normalize_before: bool = True,
-    ) -> None:
-        super(TransformerEncoderLayer, self).__init__()
-        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=0.0)
-        # Implementation of Feedforward model
-        self.linear1 = nn.Linear(d_model, dim_feedforward)
-        self.dropout = nn.Dropout(dropout)
-        self.linear2 = nn.Linear(dim_feedforward, d_model)
-
-        self.norm1 = nn.LayerNorm(d_model)
-        self.norm2 = nn.LayerNorm(d_model)
-        self.dropout1 = nn.Dropout(dropout)
-        self.dropout2 = nn.Dropout(dropout)
-
-        self.activation = _get_activation_fn(activation)
-
-        self.normalize_before = normalize_before
-
-    def __setstate__(self, state):
-        if "activation" not in state:
-            state["activation"] = nn.functional.relu
-        super(TransformerEncoderLayer, self).__setstate__(state)
-
-    def forward(
-        self,
-        src: torch.Tensor,
-        src_mask: Optional[torch.Tensor] = None,
-        src_key_padding_mask: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        """
-        Pass the input through the encoder layer.
-
-        Args:
-            src: the sequence to the encoder layer (required).
-            src_mask: the mask for the src sequence (optional).
-            src_key_padding_mask: the mask for the src keys per batch (optional)
-
-        Shape:
-            src: (S, N, E).
-            src_mask: (S, S).
-            src_key_padding_mask: (N, S).
-            S is the source sequence length, T is the target sequence length,
-            N is the batch size, E is the feature number
-        """
-        residual = src
-        if self.normalize_before:
-            src = self.norm1(src)
-        src2 = self.self_attn(
-            src,
-            src,
-            src,
-            attn_mask=src_mask,
-            key_padding_mask=src_key_padding_mask,
-        )[0]
-        src = residual + self.dropout1(src2)
-        if not self.normalize_before:
-            src = self.norm1(src)
-
-        residual = src
-        if self.normalize_before:
-            src = self.norm2(src)
-        src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))
-        src = residual + self.dropout2(src2)
-        if not self.normalize_before:
-            src = self.norm2(src)
-        return src
-
-
-class TransformerDecoderLayer(nn.Module):
-    """
-    Modified from torch.nn.TransformerDecoderLayer.
-    Add support of normalize_before,
-    i.e., use layer_norm before the first block.
-
-    Args:
-      d_model:
-        the number of expected features in the input (required).
-      nhead:
-        the number of heads in the multiheadattention models (required).
-      dim_feedforward:
-        the dimension of the feedforward network model (default=2048).
-      dropout:
-        the dropout value (default=0.1).
-      activation:
-        the activation function of intermediate layer, relu or
-        gelu (default=relu).
-
-    Examples::
-        >>> decoder_layer = nn.TransformerDecoderLayer(d_model=512, nhead=8)
-        >>> memory = torch.rand(10, 32, 512)
-        >>> tgt = torch.rand(20, 32, 512)
-        >>> out = decoder_layer(tgt, memory)
-    """
-
-    def __init__(
-        self,
-        d_model: int,
-        nhead: int,
-        dim_feedforward: int = 2048,
-        dropout: float = 0.1,
-        activation: str = "relu",
-        normalize_before: bool = True,
-    ) -> None:
-        super(TransformerDecoderLayer, self).__init__()
-        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=0.0)
-        self.src_attn = nn.MultiheadAttention(d_model, nhead, dropout=0.0)
-        # Implementation of Feedforward model
-        self.linear1 = nn.Linear(d_model, dim_feedforward)
-        self.dropout = nn.Dropout(dropout)
-        self.linear2 = nn.Linear(dim_feedforward, d_model)
-
-        self.norm1 = nn.LayerNorm(d_model)
-        self.norm2 = nn.LayerNorm(d_model)
-        self.norm3 = nn.LayerNorm(d_model)
-        self.dropout1 = nn.Dropout(dropout)
-        self.dropout2 = nn.Dropout(dropout)
-        self.dropout3 = nn.Dropout(dropout)
-
-        self.activation = _get_activation_fn(activation)
-
-        self.normalize_before = normalize_before
-
-    def __setstate__(self, state):
-        if "activation" not in state:
-            state["activation"] = nn.functional.relu
-        super(TransformerDecoderLayer, self).__setstate__(state)
-
-    def forward(
-        self,
-        tgt: torch.Tensor,
-        memory: torch.Tensor,
-        tgt_mask: Optional[torch.Tensor] = None,
-        memory_mask: Optional[torch.Tensor] = None,
-        tgt_key_padding_mask: Optional[torch.Tensor] = None,
-        memory_key_padding_mask: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        """Pass the inputs (and mask) through the decoder layer.
-
-        Args:
-          tgt:
-            the sequence to the decoder layer (required).
-          memory:
-            the sequence from the last layer of the encoder (required).
-          tgt_mask:
-            the mask for the tgt sequence (optional).
-          memory_mask:
-            the mask for the memory sequence (optional).
-          tgt_key_padding_mask:
-            the mask for the tgt keys per batch (optional).
-          memory_key_padding_mask:
-            the mask for the memory keys per batch (optional).
-
-        Shape:
-            tgt: (T, N, E).
-            memory: (S, N, E).
-            tgt_mask: (T, T).
-            memory_mask: (T, S).
-            tgt_key_padding_mask: (N, T).
-            memory_key_padding_mask: (N, S).
-            S is the source sequence length, T is the target sequence length,
-            N is the batch size, E is the feature number
-        """
-        residual = tgt
-        if self.normalize_before:
-            tgt = self.norm1(tgt)
-        tgt2 = self.self_attn(
-            tgt,
-            tgt,
-            tgt,
-            attn_mask=tgt_mask,
-            key_padding_mask=tgt_key_padding_mask,
-        )[0]
-        tgt = residual + self.dropout1(tgt2)
-        if not self.normalize_before:
-            tgt = self.norm1(tgt)
-
-        residual = tgt
-        if self.normalize_before:
-            tgt = self.norm2(tgt)
-        tgt2 = self.src_attn(
-            tgt,
-            memory,
-            memory,
-            attn_mask=memory_mask,
-            key_padding_mask=memory_key_padding_mask,
-        )[0]
-        tgt = residual + self.dropout2(tgt2)
-        if not self.normalize_before:
-            tgt = self.norm2(tgt)
-
-        residual = tgt
-        if self.normalize_before:
-            tgt = self.norm3(tgt)
-        tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt))))
-        tgt = residual + self.dropout3(tgt2)
-        if not self.normalize_before:
-            tgt = self.norm3(tgt)
-        return tgt
-
-
-def _get_activation_fn(activation: str):
-    if activation == "relu":
-        return nn.functional.relu
-    elif activation == "gelu":
-        return nn.functional.gelu
-
-    raise RuntimeError("activation should be relu/gelu, not {}".format(activation))
-
-
-class PositionalEncoding(nn.Module):
-    """This class implements the positional encoding
-    proposed in the following paper:
-
-    - Attention Is All You Need: https://arxiv.org/pdf/1706.03762.pdf
-
-        PE(pos, 2i) = sin(pos / (10000^(2i/d_modle))
-        PE(pos, 2i+1) = cos(pos / (10000^(2i/d_modle))
-
-    Note::
-
-      1 / (10000^(2i/d_model)) = exp(-log(10000^(2i/d_model)))
-                               = exp(-1* 2i / d_model * log(100000))
-                               = exp(2i * -(log(10000) / d_model))
-    """
-
-    def __init__(self, d_model: int, dropout: float = 0.1) -> None:
-        """
-        Args:
-          d_model:
-            Embedding dimension.
-          dropout:
-            Dropout probability to be applied to the output of this module.
-        """
-        super().__init__()
-        self.d_model = d_model
-        self.xscale = math.sqrt(self.d_model)
-        self.dropout = nn.Dropout(p=dropout)
-        # not doing: self.pe = None because of errors thrown by torchscript
-        self.pe = torch.zeros(1, 0, self.d_model, dtype=torch.float32)
-
-    def extend_pe(self, x: torch.Tensor) -> None:
-        """Extend the time t in the positional encoding if required.
-
-        The shape of `self.pe` is (1, T1, d_model). The shape of the input x
-        is (N, T, d_model). If T > T1, then we change the shape of self.pe
-        to (N, T, d_model). Otherwise, nothing is done.
-
-        Args:
-          x:
-            It is a tensor of shape (N, T, C).
-        Returns:
-          Return None.
-        """
-        if self.pe is not None:
-            if self.pe.size(1) >= x.size(1):
-                self.pe = self.pe.to(dtype=x.dtype, device=x.device)
-                return
-        pe = torch.zeros(x.size(1), self.d_model, dtype=torch.float32)
-        position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1)
-        div_term = torch.exp(
-            torch.arange(0, self.d_model, 2, dtype=torch.float32)
-            * -(math.log(10000.0) / self.d_model)
-        )
-        pe[:, 0::2] = torch.sin(position * div_term)
-        pe[:, 1::2] = torch.cos(position * div_term)
-        pe = pe.unsqueeze(0)
-        # Now pe is of shape (1, T, d_model), where T is x.size(1)
-        self.pe = pe.to(device=x.device, dtype=x.dtype)
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        """
-        Add positional encoding.
-
-        Args:
-          x:
-            Its shape is (N, T, C)
-
-        Returns:
-          Return a tensor of shape (N, T, C)
-        """
-        self.extend_pe(x)
-        x = x * self.xscale + self.pe[:, : x.size(1), :]
-        return self.dropout(x)
-
-
-class Noam(object):
-    """
-    Implements Noam optimizer.
-
-    Proposed in
-    "Attention Is All You Need", https://arxiv.org/pdf/1706.03762.pdf
-
-    Modified from
-    https://github.com/espnet/espnet/blob/master/espnet/nets/pytorch_backend/transformer/optimizer.py  # noqa
-
-    Args:
-      params:
-        iterable of parameters to optimize or dicts defining parameter groups
-      model_size:
-        attention dimension of the transformer model
-      factor:
-        learning rate factor
-      warm_step:
-        warmup steps
-    """
-
-    def __init__(
-        self,
-        params,
-        model_size: int = 256,
-        factor: float = 10.0,
-        warm_step: int = 25000,
-        weight_decay=0,
-    ) -> None:
-        """Construct an Noam object."""
-        self.optimizer = torch.optim.Adam(
-            params, lr=0, betas=(0.9, 0.98), eps=1e-9, weight_decay=weight_decay
-        )
-        self._step = 0
-        self.warmup = warm_step
-        self.factor = factor
-        self.model_size = model_size
-        self._rate = 0
-
-    @property
-    def param_groups(self):
-        """Return param_groups."""
-        return self.optimizer.param_groups
-
-    def step(self):
-        """Update parameters and rate."""
-        self._step += 1
-        rate = self.rate()
-        for p in self.optimizer.param_groups:
-            p["lr"] = rate
-        self._rate = rate
-        self.optimizer.step()
-
-    def rate(self, step=None):
-        """Implement `lrate` above."""
-        if step is None:
-            step = self._step
-        return (
-            self.factor
-            * self.model_size ** (-0.5)
-            * min(step ** (-0.5), step * self.warmup ** (-1.5))
-        )
-
-    def zero_grad(self):
-        """Reset gradient."""
-        self.optimizer.zero_grad()
-
-    def state_dict(self):
-        """Return state_dict."""
-        return {
-            "_step": self._step,
-            "warmup": self.warmup,
-            "factor": self.factor,
-            "model_size": self.model_size,
-            "_rate": self._rate,
-            "optimizer": self.optimizer.state_dict(),
-        }
-
-    def load_state_dict(self, state_dict):
-        """Load state_dict."""
-        for key, value in state_dict.items():
-            if key == "optimizer":
-                self.optimizer.load_state_dict(state_dict["optimizer"])
-            else:
-                setattr(self, key, value)
-
-
-def encoder_padding_mask(
-    max_len: int, supervisions: Optional[Supervisions] = None
-) -> Optional[torch.Tensor]:
-    """Make mask tensor containing indexes of padded part.
-
-    TODO::
-      This function **assumes** that the model uses
-      a subsampling factor of 4. We should remove that
-      assumption later.
-
-    Args:
-      max_len:
-        Maximum length of input features.
-        CAUTION: It is the length after subsampling.
-      supervisions:
-        Supervision in lhotse format.
-        See https://github.com/lhotse-speech/lhotse/blob/master/lhotse/dataset/speech_recognition.py#L32  # noqa
-        (CAUTION: It contains length information, i.e., start and number of
-         frames, before subsampling)
-
-    Returns:
-        Tensor: Mask tensor of dimension (batch_size, input_length),
-        True denote the masked indices.
-    """
-    if supervisions is None:
-        return None
-
-    supervision_segments = torch.stack(
-        (
-            supervisions["sequence_idx"],
-            supervisions["start_frame"],
-            supervisions["num_frames"],
-        ),
-        1,
-    ).to(torch.int32)
-
-    lengths = [0 for _ in range(int(supervision_segments[:, 0].max().item()) + 1)]
-    for idx in range(supervision_segments.size(0)):
-        # Note: TorchScript doesn't allow to unpack tensors as tuples
-        sequence_idx = supervision_segments[idx, 0].item()
-        start_frame = supervision_segments[idx, 1].item()
-        num_frames = supervision_segments[idx, 2].item()
-        lengths[sequence_idx] = start_frame + num_frames
-
-    lengths = [((i - 1) // 2 - 1) // 2 for i in lengths]
-    bs = int(len(lengths))
-    seq_range = torch.arange(0, max_len, dtype=torch.int64)
-    seq_range_expand = seq_range.unsqueeze(0).expand(bs, max_len)
-    # Note: TorchScript doesn't implement Tensor.new()
-    seq_length_expand = torch.tensor(
-        lengths, device=seq_range_expand.device, dtype=seq_range_expand.dtype
-    ).unsqueeze(-1)
-    mask = seq_range_expand >= seq_length_expand
-
-    return mask
-
-
-def decoder_padding_mask(ys_pad: torch.Tensor, ignore_id: int = -1) -> torch.Tensor:
-    """Generate a length mask for input.
-
-    The masked position are filled with True,
-    Unmasked positions are filled with False.
-
-    Args:
-      ys_pad:
-        padded tensor of dimension (batch_size, input_length).
-      ignore_id:
-        the ignored number (the padding number) in ys_pad
-
-    Returns:
-      Tensor:
-        a bool tensor of the same shape as the input tensor.
-    """
-    ys_mask = ys_pad == ignore_id
-    return ys_mask
-
-
-def generate_square_subsequent_mask(sz: int) -> torch.Tensor:
-    """Generate a square mask for the sequence. The masked positions are
-    filled with float('-inf'). Unmasked positions are filled with float(0.0).
-    The mask can be used for masked self-attention.
-
-    For instance, if sz is 3, it returns::
-
-        tensor([[0., -inf, -inf],
-                [0., 0., -inf],
-                [0., 0., 0]])
-
-    Args:
-      sz: mask size
-
-    Returns:
-      A square mask of dimension (sz, sz)
-    """
-    mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
-    mask = (
-        mask.float()
-        .masked_fill(mask == 0, float("-inf"))
-        .masked_fill(mask == 1, float(0.0))
-    )
-    return mask
-
-
-def add_sos(token_ids: List[List[int]], sos_id: int) -> List[List[int]]:
-    """Prepend sos_id to each utterance.
-
-    Args:
-      token_ids:
-        A list-of-list of token IDs. Each sublist contains
-        token IDs (e.g., word piece IDs) of an utterance.
-      sos_id:
-        The ID of the SOS token.
-
-    Return:
-      Return a new list-of-list, where each sublist starts
-      with SOS ID.
-    """
-    return [[sos_id] + utt for utt in token_ids]
-
-
-def add_eos(token_ids: List[List[int]], eos_id: int) -> List[List[int]]:
-    """Append eos_id to each utterance.
-
-    Args:
-      token_ids:
-        A list-of-list of token IDs. Each sublist contains
-        token IDs (e.g., word piece IDs) of an utterance.
-      eos_id:
-        The ID of the EOS token.
-
-    Return:
-      Return a new list-of-list, where each sublist ends
-      with EOS ID.
-    """
-    return [utt + [eos_id] for utt in token_ids]
-
-
-def tolist(t: torch.Tensor) -> List[int]:
-    """Used by jit"""
-    return torch.jit.annotate(List[int], t.tolist())
--- a/egs/spgispeech/ASR/local/display_manifest_statistics.py
+++ b/egs/spgispeech/ASR/local/display_manifest_statistics.py
@ -1,215 +0,0 @@
-#!/usr/bin/env python3
-# Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""
-This file displays duration statistics of utterances in a manifest.
-You can use the displayed value to choose minimum/maximum duration
-to remove short and long utterances during the training.
-
-See the function `remove_short_and_long_utt()` in transducer/train.py
-for usage.
-"""
-
-
-from lhotse import load_manifest
-
-
-def main():
-    path = "./data/fbank/cuts_train-clean-100.json.gz"
-    path = "./data/fbank/cuts_train-clean-360.json.gz"
-    path = "./data/fbank/cuts_train-other-500.json.gz"
-    path = "./data/fbank/cuts_dev-clean.json.gz"
-    path = "./data/fbank/cuts_dev-other.json.gz"
-    path = "./data/fbank/cuts_test-clean.json.gz"
-    path = "./data/fbank/cuts_test-other.json.gz"
-
-    cuts = load_manifest(path)
-    cuts.describe()
-
-
-if __name__ == "__main__":
-    main()
-
-"""
-## train-clean-100
-Cuts count: 85617
-Total duration (hours): 303.8
-Speech duration (hours): 303.8 (100.0%)
-***
-Duration statistics (seconds):
-mean    12.8
-std     3.8
-min     1.3
-0.1%    1.9
-0.5%    2.2
-1%      2.5
-5%      4.2
-10%     6.4
-25%     11.4
-50%     13.8
-75%     15.3
-90%     16.7
-95%     17.3
-99%     18.1
-99.5%   18.4
-99.9%   18.8
-max     27.2
-
-## train-clean-360
-Cuts count: 312042
-Total duration (hours): 1098.2
-Speech duration (hours): 1098.2 (100.0%)
-***
-Duration statistics (seconds):
-mean    12.7
-std     3.8
-min     1.0
-0.1%    1.8
-0.5%    2.2
-1%      2.5
-5%      4.2
-10%     6.2
-25%     11.2
-50%     13.7
-75%     15.3
-90%     16.6
-95%     17.3
-99%     18.1
-99.5%   18.4
-99.9%   18.8
-max     33.0
-
-## train-other 500
-Cuts count: 446064
-Total duration (hours): 1500.6
-Speech duration (hours): 1500.6 (100.0%)
-***
-Duration statistics (seconds):
-mean    12.1
-std     4.2
-min     0.8
-0.1%    1.7
-0.5%    2.1
-1%      2.3
-5%      3.5
-10%     5.0
-25%     9.8
-50%     13.4
-75%     15.1
-90%     16.5
-95%     17.2
-99%     18.1
-99.5%   18.4
-99.9%   18.9
-max     31.0
-
-## dev-clean
-Cuts count: 2703
-Total duration (hours): 5.4
-Speech duration (hours): 5.4 (100.0%)
-***
-Duration statistics (seconds):
-mean    7.2
-std     4.7
-min     1.4
-0.1%    1.6
-0.5%    1.8
-1%      1.9
-5%      2.4
-10%     2.7
-25%     3.8
-50%     5.9
-75%     9.3
-90%     13.3
-95%     16.4
-99%     23.8
-99.5%   28.5
-99.9%   32.3
-max     32.6
-
-## dev-other
-Cuts count: 2864
-Total duration (hours): 5.1
-Speech duration (hours): 5.1 (100.0%)
-***
-Duration statistics (seconds):
-mean    6.4
-std     4.3
-min     1.1
-0.1%    1.3
-0.5%    1.7
-1%      1.8
-5%      2.2
-10%     2.6
-25%     3.5
-50%     5.3
-75%     7.9
-90%     12.0
-95%     15.0
-99%     22.2
-99.5%   27.1
-99.9%   32.4
-max     35.2
-
-## test-clean
-Cuts count: 2620
-Total duration (hours): 5.4
-Speech duration (hours): 5.4 (100.0%)
-***
-Duration statistics (seconds):
-mean    7.4
-std     5.2
-min     1.3
-0.1%    1.6
-0.5%    1.8
-1%      2.0
-5%      2.3
-10%     2.7
-25%     3.7
-50%     5.8
-75%     9.6
-90%     14.6
-95%     17.8
-99%     25.5
-99.5%   28.4
-99.9%   32.8
-max     35.0
-
-## test-other
-Cuts count: 2939
-Total duration (hours): 5.3
-Speech duration (hours): 5.3 (100.0%)
-***
-Duration statistics (seconds):
-mean    6.5
-std     4.4
-min     1.2
-0.1%    1.5
-0.5%    1.8
-1%      1.9
-5%      2.3
-10%     2.6
-25%     3.4
-50%     5.2
-75%     8.2
-90%     12.6
-95%     15.8
-99%     21.4
-99.5%   23.8
-99.9%   33.5
-max     34.5
-"""
--- a/egs/spgispeech/ASR/prepare.sh
+++ b/egs/spgispeech/ASR/prepare.sh
@ -14,17 +14,6 @@ stop_stage=100
 #      You can find train.csv, val.csv, train, and val in this directory, which belong
 #      to the SPGISpeech dataset.
 #
-#  - $dl_dir/lm
-#      This directory contains the following files downloaded from
-#       http://www.openslr.org/resources/11
-#
-#        - 3-gram.pruned.1e-7.arpa.gz
-#        - 3-gram.pruned.1e-7.arpa
-#        - 4-gram.arpa.gz
-#        - 4-gram.arpa
-#        - librispeech-vocab.txt
-#        - librispeech-lexicon.txt
-#
 #  - $dl_dir/musan
 #      This directory contains the following directories downloaded from
 #       http://www.openslr.org/17/
@ -55,12 +44,6 @@ log() {

 log "dl_dir: $dl_dir"

-if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
-  log "Stage -1: Download LM"
-  [ ! -e $dl_dir/lm ] && mkdir -p $dl_dir/lm
-  ./local/download_lm.py --out-dir=$dl_dir/lm
-fi
-
 if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
  log "Stage 0: Download data"

@ -71,7 +54,6 @@ if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
  #
  if [ ! -d $dl_dir/spgispeech/train.csv ]; then
    lhotse download spgispeech $dl_dir
-    exit 1
  fi

  # If you have pre-downloaded it to /path/to/musan,
@ -110,14 +92,12 @@ fi
 if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
  log "Stage 4: Compute fbank features for spgispeech dev and val"
  mkdir -p data/fbank
-  queue-freegpu.pl --mem 2G --gpu 1 --config conf/gpu.conf exp/extract_feats_dev_val.log \
  python local/compute_fbank_spgispeech.py --test
 fi

 if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
  log "Stage 5: Compute fbank features for train"
  mkdir -p data/fbank
-  queue-freegpu.pl --mem 2G --gpu 1  --config conf/gpu.conf exp/extract_feats_train.log \
  python local/compute_fbank_spgispeech.py --train --num-splits 20

  log "Combine features from train splits (may take ~1h)"
@ -125,12 +105,12 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
    pieces=$(find data/manifests -name "cuts_train_[0-9]*.jsonl.gz")
    lhotse combine $pieces data/manifests/cuts_train.jsonl.gz
  fi
+  gunzip -c data/manifests/cuts_train.jsonl.gz | shuf | gzip -c > data/manifests/cuts_train_shuf.jsonl.gz
 fi

 if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
  log "Stage 6: Compute fbank features for musan"
  mkdir -p data/fbank
-  queue-freegpu.pl --mem 2G --gpu 1 --config conf/gpu.conf exp/extract_feats_musan.log \
  python local/compute_fbank_musan.py
 fi

--- a/egs/spgispeech/ASR/pruned_transducer_stateless2/asr_datamodule.py
+++ b/egs/spgispeech/ASR/pruned_transducer_stateless2/asr_datamodule.py
@ -1 +0,0 @@
-../conformer_ctc/asr_datamodule.py
--- a/egs/spgispeech/ASR/pruned_transducer_stateless2/asr_datamodule.py
+++ b/egs/spgispeech/ASR/pruned_transducer_stateless2/asr_datamodule.py
@ -0,0 +1,355 @@
+# Copyright      2021  Piotr Żelasko
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import argparse
+import logging
+from functools import lru_cache
+from pathlib import Path
+from typing import Optional, Dict, Any
+
+from tqdm import tqdm
+
+from lhotse import CutSet, Fbank, FbankConfig, load_manifest, load_manifest_lazy
+from lhotse.dataset import (
+    CutMix,
+    CutConcatenate,
+    DynamicBucketingSampler,
+    K2SpeechRecognitionDataset,
+    PrecomputedFeatures,
+    SpecAugment,
+)
+from lhotse.dataset.input_strategies import OnTheFlyFeatures
+from lhotse.utils import fix_random_seed
+from torch.utils.data import DataLoader
+import torch
+
+from icefall.utils import str2bool
+
+
+class _SeedWorkers:
+    def __init__(self, seed: int):
+        self.seed = seed
+
+    def __call__(self, worker_id: int):
+        fix_random_seed(self.seed + worker_id)
+
+
+class SPGISpeechAsrDataModule:
+    """
+    DataModule for k2 ASR experiments.
+    It assumes there is always one train and valid dataloader,
+    but there can be multiple test dataloaders (e.g. the SPGISpeech dev
+    and val sets).
+    It contains all the common data pipeline modules used in ASR
+    experiments, e.g.:
+    - dynamic batch size,
+    - bucketing samplers,
+    - cut concatenation,
+    - augmentation,
+    - on-the-fly feature extraction
+    This class should be derived for specific corpora used in ASR tasks.
+    """
+
+    def __init__(self, args: argparse.Namespace):
+        self.args = args
+
+    @classmethod
+    def add_arguments(cls, parser: argparse.ArgumentParser):
+        group = parser.add_argument_group(
+            title="ASR data related options",
+            description="These options are used for the preparation of "
+            "PyTorch DataLoaders from Lhotse CutSet's -- they control the "
+            "effective batch sizes, sampling strategies, applied data "
+            "augmentations, etc.",
+        )
+        group.add_argument(
+            "--manifest-dir",
+            type=Path,
+            default=Path("data/manifests"),
+            help="Path to directory with train/valid/test cuts.",
+        )
+        group.add_argument(
+            "--enable-musan",
+            type=str2bool,
+            default=True,
+            help="When enabled, select noise from MUSAN and mix it "
+            "with training dataset. ",
+        )
+        group.add_argument(
+            "--concatenate-cuts",
+            type=str2bool,
+            default=False,
+            help="When enabled, utterances (cuts) will be concatenated "
+            "to minimize the amount of padding.",
+        )
+        group.add_argument(
+            "--duration-factor",
+            type=float,
+            default=1.0,
+            help="Determines the maximum duration of a concatenated cut "
+            "relative to the duration of the longest cut in a batch.",
+        )
+        group.add_argument(
+            "--gap",
+            type=float,
+            default=1.0,
+            help="The amount of padding (in seconds) inserted between "
+            "concatenated cuts. This padding is filled with noise when "
+            "noise augmentation is used.",
+        )
+        group.add_argument(
+            "--max-duration",
+            type=int,
+            default=100.0,
+            help="Maximum pooled recordings duration (seconds) in a "
+            "single batch. You can reduce it if it causes CUDA OOM.",
+        )
+        group.add_argument(
+            "--num-buckets",
+            type=int,
+            default=30,
+            help="The number of buckets for the DynamicBucketingSampler "
+            "(you might want to increase it for larger datasets).",
+        )
+        group.add_argument(
+            "--on-the-fly-feats",
+            type=str2bool,
+            default=False,
+            help="When enabled, use on-the-fly cut mixing and feature "
+            "extraction. Will drop existing precomputed feature manifests "
+            "if available.",
+        )
+        group.add_argument(
+            "--shuffle",
+            type=str2bool,
+            default=True,
+            help="When enabled (=default), the examples will be "
+            "shuffled for each epoch.",
+        )
+
+        group.add_argument(
+            "--num-workers",
+            type=int,
+            default=8,
+            help="The number of training dataloader workers that "
+            "collect the batches.",
+        )
+        group.add_argument(
+            "--enable-spec-aug",
+            type=str2bool,
+            default=True,
+            help="When enabled, use SpecAugment for training dataset.",
+        )
+        group.add_argument(
+            "--spec-aug-time-warp-factor",
+            type=int,
+            default=80,
+            help="Used only when --enable-spec-aug is True. "
+            "It specifies the factor for time warping in SpecAugment. "
+            "Larger values mean more warping. "
+            "A value less than 1 means to disable time warp.",
+        )
+
+    def train_dataloaders(
+        self,
+        cuts_train: CutSet,
+        sampler_state_dict: Optional[Dict[str, Any]] = None,
+    ) -> DataLoader:
+        """
+        Args:
+          cuts_train:
+            CutSet for training.
+          sampler_state_dict:
+            The state dict for the training sampler.
+        """
+        logging.info("About to get Musan cuts")
+        cuts_musan = load_manifest(self.args.manifest_dir / "cuts_musan.jsonl.gz")
+
+        transforms = []
+        if self.args.enable_musan:
+            logging.info("Enable MUSAN")
+            transforms.append(
+                CutMix(cuts=cuts_musan, prob=0.5, snr=(10, 20), preserve_id=True)
+            )
+        else:
+            logging.info("Disable MUSAN")
+
+        if self.args.concatenate_cuts:
+            logging.info(
+                f"Using cut concatenation with duration factor "
+                f"{self.args.duration_factor} and gap {self.args.gap}."
+            )
+            # Cut concatenation should be the first transform in the list,
+            # so that if we e.g. mix noise in, it will fill the gaps between
+            # different utterances.
+            transforms = [
+                CutConcatenate(
+                    duration_factor=self.args.duration_factor, gap=self.args.gap
+                )
+            ] + transforms
+
+        input_transforms = []
+        if self.args.enable_spec_aug:
+            logging.info("Enable SpecAugment")
+            logging.info(f"Time warp factor: {self.args.spec_aug_time_warp_factor}")
+            input_transforms.append(
+                SpecAugment(
+                    time_warp_factor=self.args.spec_aug_time_warp_factor,
+                    num_frame_masks=2,
+                    features_mask_size=27,
+                    num_feature_masks=2,
+                    frames_mask_size=100,
+                )
+            )
+        else:
+            logging.info("Disable SpecAugment")
+
+        logging.info("About to create train dataset")
+        if self.args.on_the_fly_feats:
+            train = K2SpeechRecognitionDataset(
+                cut_transforms=transforms,
+                input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80))),
+                input_transforms=input_transforms,
+            )
+        else:
+            train = K2SpeechRecognitionDataset(
+                cut_transforms=transforms,
+                input_transforms=input_transforms,
+            )
+
+        logging.info("Using DynamicBucketingSampler.")
+        train_sampler = DynamicBucketingSampler(
+            cuts_train,
+            max_duration=self.args.max_duration,
+            shuffle=False,
+            num_buckets=self.args.num_buckets,
+            drop_last=True,
+        )
+        logging.info("About to create train dataloader")
+
+        if sampler_state_dict is not None:
+            logging.info("Loading sampler state dict")
+            train_sampler.load_state_dict(sampler_state_dict)
+
+        # 'seed' is derived from the current random state, which will have
+        # previously been set in the main process.
+        seed = torch.randint(0, 100000, ()).item()
+        worker_init_fn = _SeedWorkers(seed)
+
+        train_dl = DataLoader(
+            train,
+            sampler=train_sampler,
+            batch_size=None,
+            num_workers=self.args.num_workers,
+            persistent_workers=False,
+            worker_init_fn=worker_init_fn,
+        )
+
+        return train_dl
+
+    def valid_dataloaders(self, cuts_valid: CutSet) -> DataLoader:
+
+        transforms = []
+        if self.args.concatenate_cuts:
+            transforms = [
+                CutConcatenate(
+                    duration_factor=self.args.duration_factor, gap=self.args.gap
+                )
+            ] + transforms
+
+        logging.info("About to create dev dataset")
+        if self.args.on_the_fly_feats:
+            validate = K2SpeechRecognitionDataset(
+                cut_transforms=transforms,
+                input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80))),
+            )
+        else:
+            validate = K2SpeechRecognitionDataset(
+                cut_transforms=transforms,
+            )
+        valid_sampler = DynamicBucketingSampler(
+            cuts_valid,
+            max_duration=self.args.max_duration,
+            shuffle=False,
+        )
+        logging.info("About to create dev dataloader")
+        valid_dl = DataLoader(
+            validate,
+            sampler=valid_sampler,
+            batch_size=None,
+            num_workers=2,
+            persistent_workers=False,
+        )
+
+        return valid_dl
+
+    def test_dataloaders(self, cuts: CutSet) -> DataLoader:
+        logging.debug("About to create test dataset")
+        test = K2SpeechRecognitionDataset(
+            input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80)))
+            if self.args.on_the_fly_feats
+            else PrecomputedFeatures(),
+        )
+        sampler = DynamicBucketingSampler(
+            cuts, max_duration=self.args.max_duration, shuffle=False
+        )
+        logging.debug("About to create test dataloader")
+        test_dl = DataLoader(
+            test,
+            batch_size=None,
+            sampler=sampler,
+            num_workers=self.args.num_workers,
+        )
+        return test_dl
+
+    @lru_cache()
+    def train_cuts(self) -> CutSet:
+        logging.info("About to get SPGISpeech train cuts")
+        return load_manifest_lazy(self.args.manifest_dir / "cuts_train_shuf.jsonl.gz")
+
+    @lru_cache()
+    def dev_cuts(self) -> CutSet:
+        logging.info("About to get SPGISpeech dev cuts")
+        return load_manifest_lazy(self.args.manifest_dir / "cuts_dev.jsonl.gz")
+
+    @lru_cache()
+    def val_cuts(self) -> CutSet:
+        logging.info("About to get SPGISpeech val cuts")
+        return load_manifest_lazy(self.args.manifest_dir / "cuts_val.jsonl.gz")
+
+
+def test():
+    parser = argparse.ArgumentParser()
+    SPGISpeechAsrDataModule.add_arguments(parser)
+    args = parser.parse_args()
+    adm = SPGISpeechAsrDataModule(args)
+
+    cuts = adm.train_cuts()
+    dl = adm.train_dataloaders(cuts)
+    for i, batch in tqdm(enumerate(dl)):
+        if i == 100:
+            break
+
+    cuts = adm.dev_cuts()
+    dl = adm.valid_dataloaders(cuts)
+    for i, batch in tqdm(enumerate(dl)):
+        if i == 100:
+            break
+
+
+if __name__ == "__main__":
+    test()
--- a/egs/spgispeech/ASR/pruned_transducer_stateless2/decode.py
+++ b/egs/spgispeech/ASR/pruned_transducer_stateless2/decode.py
@ -19,16 +19,14 @@
 Usage:
 (1) greedy search
 ./pruned_transducer_stateless2/decode.py \
-        --epoch 28 \
-        --avg 15 \
+        --avg-last-n 10 \
        --exp-dir ./pruned_transducer_stateless2/exp \
        --max-duration 100 \
        --decoding-method greedy_search

 (2) beam search
 ./pruned_transducer_stateless2/decode.py \
-        --epoch 28 \
-        --avg 15 \
+        --avg-last-n 10 \
        --exp-dir ./pruned_transducer_stateless2/exp \
        --max-duration 100 \
        --decoding-method beam_search \
@ -36,8 +34,7 @@ Usage:

 (3) modified beam search
 ./pruned_transducer_stateless2/decode.py \
-        --epoch 28 \
-        --avg 15 \
+        --avg-last-n 10 \
        --exp-dir ./pruned_transducer_stateless2/exp \
        --max-duration 100 \
        --decoding-method modified_beam_search \
@ -45,8 +42,7 @@ Usage:

 (4) fast beam search
 ./pruned_transducer_stateless2/decode.py \
-        --epoch 28 \
-        --avg 15 \
+        --avg-last-n 10 \
        --exp-dir ./pruned_transducer_stateless2/exp \
        --max-duration 1500 \
        --decoding-method fast_beam_search \
@ -66,7 +62,7 @@ import k2
 import sentencepiece as spm
 import torch
 import torch.nn as nn
-from asr_datamodule import LibriSpeechAsrDataModule
+from asr_datamodule import SPGISpeechAsrDataModule
 from beam_search import (
    beam_search,
    fast_beam_search,
@ -186,8 +182,7 @@ def get_parser():
        "--context-size",
        type=int,
        default=2,
-        help="The context size in the decoder. 1 means bigram; "
-        "2 means tri-gram",
+        help="The context size in the decoder. 1 means bigram; " "2 means tri-gram",
    )
    parser.add_argument(
        "--max-sym-per-frame",
@ -245,9 +240,7 @@ def decode_one_batch(
    supervisions = batch["supervisions"]
    feature_lens = supervisions["num_frames"].to(device)

-    encoder_out, encoder_out_lens = model.encoder(
-        x=feature, x_lens=feature_lens
-    )
+    encoder_out, encoder_out_lens = model.encoder(x=feature, x_lens=feature_lens)
    hyps = []

    if params.decoding_method == "fast_beam_search":
@ -262,10 +255,7 @@ def decode_one_batch(
        )
        for hyp in sp.decode(hyp_tokens):
            hyps.append(hyp.split())
-    elif (
-        params.decoding_method == "greedy_search"
-        and params.max_sym_per_frame == 1
-    ):
+    elif params.decoding_method == "greedy_search" and params.max_sym_per_frame == 1:
        hyp_tokens = greedy_search_batch(
            model=model,
            encoder_out=encoder_out,
@ -385,9 +375,7 @@ def decode_dataset(
        if batch_idx % log_interval == 0:
            batch_str = f"{batch_idx}/{num_batches}"

-            logging.info(
-                f"batch {batch_str}, cuts processed until now is {num_cuts}"
-            )
+            logging.info(f"batch {batch_str}, cuts processed until now is {num_cuts}")
    return results


@ -419,8 +407,7 @@ def save_results(

    test_set_wers = sorted(test_set_wers.items(), key=lambda x: x[1])
    errs_info = (
-        params.res_dir
-        / f"wer-summary-{test_set_name}-{key}-{params.suffix}.txt"
+        params.res_dir / f"wer-summary-{test_set_name}-{key}-{params.suffix}.txt"
    )
    with open(errs_info, "w") as f:
        print("settings\tWER", file=f)
@ -438,7 +425,7 @@ def save_results(
@torch.no_grad()
 def main():
    parser = get_parser()
-    LibriSpeechAsrDataModule.add_arguments(parser)
+    SPGISpeechAsrDataModule.add_arguments(parser)
    args = parser.parse_args()
    args.exp_dir = Path(args.exp_dir)

@ -514,16 +501,16 @@ def main():
    num_param = sum([p.numel() for p in model.parameters()])
    logging.info(f"Number of model parameters: {num_param}")

-    librispeech = LibriSpeechAsrDataModule(args)
+    spgispeech = SPGISpeechAsrDataModule(args)

-    test_clean_cuts = librispeech.test_clean_cuts()
-    test_other_cuts = librispeech.test_other_cuts()
+    dev_cuts = spgispeech.dev_cuts()
+    val_cuts = spgispeech.val_cuts()

-    test_clean_dl = librispeech.test_dataloaders(test_clean_cuts)
-    test_other_dl = librispeech.test_dataloaders(test_other_cuts)
+    dev_dl = spgispeech.test_dataloaders(dev_cuts)
+    val_dl = spgispeech.test_dataloaders(val_cuts)

-    test_sets = ["test-clean", "test-other"]
-    test_dl = [test_clean_dl, test_other_dl]
+    test_sets = ["dev", "val"]
+    test_dl = [dev_dl, val_dl]

    for test_set, test_dl in zip(test_sets, test_dl):
        results_dict = decode_dataset(
--- a/egs/spgispeech/ASR/pruned_transducer_stateless2/train.py
+++ b/egs/spgispeech/ASR/pruned_transducer_stateless2/train.py
@ -168,7 +168,7 @@ def get_parser():
    parser.add_argument(
        "--lr-epochs",
        type=float,
-        default=6,
+        default=4,
        help="""Number of epochs that affects how rapidly the learning rate decreases.
        """,
    )
@ -243,7 +243,7 @@ def get_parser():
    parser.add_argument(
        "--keep-last-k",
        type=int,
-        default=20,
+        default=10,
        help="""Only keep this number of checkpoints on disk.
        For instance, if it is 3, there are only 3 checkpoints
        in the exp-dir with filenames `checkpoint-xxx.pt`.
@ -820,7 +820,7 @@ def run(rank, world_size, args):

    if params.print_diagnostics:
        opts = diagnostics.TensorDiagnosticOptions(
-            2 ** 22
+            2**22
        )  # allow 4 megabytes per sub-module
        diagnostic = diagnostics.attach_diagnostics(model, opts)

@ -828,6 +828,26 @@ def run(rank, world_size, args):

    train_cuts = spgispeech.train_cuts()

+    # Ideally we should filter utterances that are too long or too short, but SPGISpeech
+    # utterances all have moderate durations (4.6 s to 16.7 s), so no filtering is needed.
+    # Here are the statistics of the training data (obtained by `train_cuts.describe()`):
+
+    # Cuts count: 5886320
+    # Total duration (hours): 15070.1
+    # Speech duration (hours): 15070.1 (100.0%)
+    # ***
+    # Duration statistics (seconds):
+    # mean    9.2
+    # std     2.8
+    # min     4.6
+    # 25%     6.9
+    # 50%     8.9
+    # 75%     11.2
+    # 99%     16.0
+    # 99.5%   16.3
+    # 99.9%   16.6
+    # max     16.7
+
    if params.start_batch > 0 and checkpoints and "sampler" in checkpoints:
        # We only load the sampler's state dict when it loads a checkpoint
        # saved in the middle of an epoch
@ -901,6 +921,37 @@ def run(rank, world_size, args):
        cleanup_dist()


+def display_and_save_batch(
+    batch: dict,
+    params: AttributeDict,
+    sp: spm.SentencePieceProcessor,
+) -> None:
+    """Save a batch to disk and log its feature shape and token count.
+
+    Args:
+      batch:
+        A batch of data. See `lhotse.dataset.K2SpeechRecognitionDataset()`
+        for the content in it.
+      params:
+        Parameters for training. See :func:`get_params`. Only `exp_dir` is
+        read here; the batch file is written under that directory.
+      sp:
+        The BPE model, used to encode the transcripts in the batch.
+    """
+    from lhotse.utils import uuid4
+
+    # The uuid4() suffix is presumably there so that repeated dumps get
+    # distinct file names.
+    dump_path = f"{params.exp_dir}/batch-{uuid4()}.pt"
+    logging.info(f"Saving batch to {dump_path}")
+    torch.save(batch, dump_path)
+
+    supervisions, features = batch["supervisions"], batch["inputs"]
+    logging.info(f"features shape: {features.shape}")
+
+    # Encode every transcript to token ids and count the tokens across the
+    # whole batch.
+    token_ids = sp.encode(supervisions["text"], out_type=int)
+    num_tokens = sum(map(len, token_ids))
+    logging.info(f"num tokens: {num_tokens}")
+
+
 def scan_pessimistic_batches_for_oom(
    model: nn.Module,
    train_dl: torch.utils.data.DataLoader,
@ -941,6 +992,7 @@ def scan_pessimistic_batches_for_oom(
                    f"Failing criterion: {criterion} "
                    f"(={crit_values[criterion]}) ..."
                )
+                display_and_save_batch(batch, params=params, sp=sp)
            raise