diff --git a/egs/librispeech/ASR/conv_emformer_transducer/asr_datamodule.py b/egs/librispeech/ASR/conv_emformer_transducer/asr_datamodule.py
deleted file mode 120000
index b4e5427e0..000000000
--- a/egs/librispeech/ASR/conv_emformer_transducer/asr_datamodule.py
+++ /dev/null
@@ -1 +0,0 @@
-../pruned_transducer_stateless/asr_datamodule.py
\ No newline at end of file
diff --git a/egs/librispeech/ASR/conv_emformer_transducer/beam_search.py b/egs/librispeech/ASR/conv_emformer_transducer/beam_search.py
deleted file mode 120000
index 227d2247c..000000000
--- a/egs/librispeech/ASR/conv_emformer_transducer/beam_search.py
+++ /dev/null
@@ -1 +0,0 @@
-../pruned_transducer_stateless/beam_search.py
\ No newline at end of file
diff --git a/egs/librispeech/ASR/conv_emformer_transducer/decode.py b/egs/librispeech/ASR/conv_emformer_transducer/decode.py
deleted file mode 100755
index 47b4f9fd0..000000000
--- a/egs/librispeech/ASR/conv_emformer_transducer/decode.py
+++ /dev/null
@@ -1,550 +0,0 @@
-#!/usr/bin/env python3
-#
-# Copyright 2021 Xiaomi Corporation (Author: Fangjun Kuang)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Usage:
-(1) greedy search
-./transducer_emformer/decode.py \
-        --epoch 28 \
-        --avg 15 \
-        --exp-dir ./transducer_emformer/exp \
-        --max-duration 100 \
-        --decoding-method greedy_search
-
-(2) beam search
-./transducer_emformer/decode.py \
-        --epoch 28 \
-        --avg 15 \
-        --exp-dir ./transducer_emformer/exp \
-        --max-duration 100 \
-        --decoding-method beam_search \
-        --beam-size 4
-
-(3) modified beam search
-./transducer_emformer/decode.py \
-        --epoch 28 \
-        --avg 15 \
-        --exp-dir ./transducer_emformer/exp \
-        --max-duration 100 \
-        --decoding-method modified_beam_search \
-        --beam-size 4
-
-(4) fast beam search
-./transducer_emformer/decode.py \
-        --epoch 28 \
-        --avg 15 \
-        --exp-dir ./transducer_emformer/exp \
-        --max-duration 1500 \
-        --decoding-method fast_beam_search \
-        --beam 4 \
-        --max-contexts 4 \
-        --max-states 8
-"""
-
-
-import argparse
-import logging
-from collections import defaultdict
-from pathlib import Path
-from typing import Dict, List, Optional, Tuple
-
-import k2
-import sentencepiece as spm
-import torch
-import torch.nn as nn
-from asr_datamodule import LibriSpeechAsrDataModule
-from beam_search import (
-    beam_search,
-    fast_beam_search,
-    greedy_search,
-    greedy_search_batch,
-    modified_beam_search,
-)
-from train import add_model_arguments, get_params, get_transducer_model
-
-from icefall.checkpoint import (
-    average_checkpoints,
-    find_checkpoints,
-    load_checkpoint,
-)
-from icefall.utils import (
-    AttributeDict,
-    setup_logger,
-    store_transcripts,
-    write_error_stats,
-)
-
-
-def get_parser():
-    parser = argparse.ArgumentParser(
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter
-    )
-
-    parser.add_argument(
-        "--epoch",
-        type=int,
-        default=28,
-        help="It specifies the checkpoint to use for decoding."
-        "Note: Epoch counts from 0.",
-    )
-    parser.add_argument(
-        "--avg",
-        type=int,
-        default=15,
-        help="Number of checkpoints to average. Automatically select "
-        "consecutive checkpoints before the checkpoint specified by "
-        "'--epoch'. ",
-    )
-
-    parser.add_argument(
-        "--avg-last-n",
-        type=int,
-        default=0,
-        help="""If positive, --epoch and --avg are ignored and it
-        will use the last n checkpoints exp_dir/checkpoint-xxx.pt
-        where xxx is the number of processed batches while
-        saving that checkpoint.
-        """,
-    )
-
-    parser.add_argument(
-        "--exp-dir",
-        type=str,
-        default="transducer_emformer/exp",
-        help="The experiment dir",
-    )
-
-    parser.add_argument(
-        "--bpe-model",
-        type=str,
-        default="data/lang_bpe_500/bpe.model",
-        help="Path to the BPE model",
-    )
-
-    parser.add_argument(
-        "--decoding-method",
-        type=str,
-        default="greedy_search",
-        help="""Possible values are:
-          - greedy_search
-          - beam_search
-          - modified_beam_search
-          - fast_beam_search
-        """,
-    )
-
-    parser.add_argument(
-        "--beam-size",
-        type=int,
-        default=4,
-        help="""An interger indicating how many candidates we will keep for each
-        frame. Used only when --decoding-method is beam_search or
-        modified_beam_search.""",
-    )
-
-    parser.add_argument(
-        "--beam",
-        type=float,
-        default=4,
-        help="""A floating point value to calculate the cutoff score during beam
-        search (i.e., `cutoff = max-score - beam`), which is the same as the
-        `beam` in Kaldi.
-        Used only when --decoding-method is fast_beam_search""",
-    )
-
-    parser.add_argument(
-        "--max-contexts",
-        type=int,
-        default=4,
-        help="""Used only when --decoding-method is
-        fast_beam_search""",
-    )
-
-    parser.add_argument(
-        "--max-states",
-        type=int,
-        default=8,
-        help="""Used only when --decoding-method is
-        fast_beam_search""",
-    )
-
-    parser.add_argument(
-        "--context-size",
-        type=int,
-        default=2,
-        help="The context size in the decoder. 1 means bigram; "
-        "2 means tri-gram",
-    )
-    parser.add_argument(
-        "--max-sym-per-frame",
-        type=int,
-        default=1,
-        help="""Maximum number of symbols per frame.
-        Used only when --decoding_method is greedy_search""",
-    )
-
-    add_model_arguments(parser)
-
-    return parser
-
-
-def decode_one_batch(
-    params: AttributeDict,
-    model: nn.Module,
-    sp: spm.SentencePieceProcessor,
-    batch: dict,
-    decoding_graph: Optional[k2.Fsa] = None,
-) -> Dict[str, List[List[str]]]:
-    """Decode one batch and return the result in a dict. The dict has the
-    following format:
-
-        - key: It indicates the setting used for decoding. For example,
-               if greedy_search is used, it would be "greedy_search"
-               If beam search with a beam size of 7 is used, it would be
-               "beam_7"
-        - value: It contains the decoding result. `len(value)` equals to
-                 batch size. `value[i]` is the decoding result for the i-th
-                 utterance in the given batch.
-    Args:
-      params:
-        It's the return value of :func:`get_params`.
-      model:
-        The neural model.
-      sp:
-        The BPE model.
-      batch:
-        It is the return value from iterating
-        `lhotse.dataset.K2SpeechRecognitionDataset`. See its documentation
-        for the format of the `batch`.
-      decoding_graph:
-        The decoding graph. Can be either a `k2.trivial_graph` or HLG, Used
-        only when --decoding_method is fast_beam_search.
-    Returns:
-      Return the decoding result. See above description for the format of
-      the returned dict.
-    """
-    device = model.device
-    feature = batch["inputs"]
-    assert feature.ndim == 3
-
-    feature = feature.to(device)
-    # at entry, feature is (N, T, C)
-
-    supervisions = batch["supervisions"]
-    feature_lens = supervisions["num_frames"].to(device)
-
-    encoder_out, encoder_out_lens = model.encoder(
-        x=feature, x_lens=feature_lens
-    )
-    hyps = []
-
-    if params.decoding_method == "fast_beam_search":
-        hyp_tokens = fast_beam_search(
-            model=model,
-            decoding_graph=decoding_graph,
-            encoder_out=encoder_out,
-            encoder_out_lens=encoder_out_lens,
-            beam=params.beam,
-            max_contexts=params.max_contexts,
-            max_states=params.max_states,
-        )
-        for hyp in sp.decode(hyp_tokens):
-            hyps.append(hyp.split())
-    elif (
-        params.decoding_method == "greedy_search"
-        and params.max_sym_per_frame == 1
-    ):
-        hyp_tokens = greedy_search_batch(
-            model=model,
-            encoder_out=encoder_out,
-        )
-        for hyp in sp.decode(hyp_tokens):
-            hyps.append(hyp.split())
-    elif params.decoding_method == "modified_beam_search":
-        hyp_tokens = modified_beam_search(
-            model=model,
-            encoder_out=encoder_out,
-            beam=params.beam_size,
-        )
-        for hyp in sp.decode(hyp_tokens):
-            hyps.append(hyp.split())
-    else:
-        batch_size = encoder_out.size(0)
-
-        for i in range(batch_size):
-            # fmt: off
-            encoder_out_i = encoder_out[i:i+1, :encoder_out_lens[i]]
-            # fmt: on
-            if params.decoding_method == "greedy_search":
-                hyp = greedy_search(
-                    model=model,
-                    encoder_out=encoder_out_i,
-                    max_sym_per_frame=params.max_sym_per_frame,
-                )
-            elif params.decoding_method == "beam_search":
-                hyp = beam_search(
-                    model=model,
-                    encoder_out=encoder_out_i,
-                    beam=params.beam_size,
-                )
-            else:
-                raise ValueError(
-                    f"Unsupported decoding method: {params.decoding_method}"
-                )
-            hyps.append(sp.decode(hyp).split())
-
-    if params.decoding_method == "greedy_search":
-        return {"greedy_search": hyps}
-    elif params.decoding_method == "fast_beam_search":
-        return {
-            (
-                f"beam_{params.beam}_"
-                f"max_contexts_{params.max_contexts}_"
-                f"max_states_{params.max_states}"
-            ): hyps
-        }
-    else:
-        return {f"beam_size_{params.beam_size}": hyps}
-
-
-def decode_dataset(
-    dl: torch.utils.data.DataLoader,
-    params: AttributeDict,
-    model: nn.Module,
-    sp: spm.SentencePieceProcessor,
-    decoding_graph: Optional[k2.Fsa] = None,
-) -> Dict[str, List[Tuple[List[str], List[str]]]]:
-    """Decode dataset.
-
-    Args:
-      dl:
-        PyTorch's dataloader containing the dataset to decode.
-      params:
-        It is returned by :func:`get_params`.
-      model:
-        The neural model.
-      sp:
-        The BPE model.
-      decoding_graph:
-        The decoding graph. Can be either a `k2.trivial_graph` or HLG, Used
-        only when --decoding_method is fast_beam_search.
-    Returns:
-      Return a dict, whose key may be "greedy_search" if greedy search
-      is used, or it may be "beam_7" if beam size of 7 is used.
-      Its value is a list of tuples. Each tuple contains two elements:
-      The first is the reference transcript, and the second is the
-      predicted result.
-    """
-    num_cuts = 0
-
-    try:
-        num_batches = len(dl)
-    except TypeError:
-        num_batches = "?"
-
-    if params.decoding_method == "greedy_search":
-        log_interval = 100
-    else:
-        log_interval = 2
-
-    results = defaultdict(list)
-    for batch_idx, batch in enumerate(dl):
-        texts = batch["supervisions"]["text"]
-
-        hyps_dict = decode_one_batch(
-            params=params,
-            model=model,
-            sp=sp,
-            decoding_graph=decoding_graph,
-            batch=batch,
-        )
-
-        for name, hyps in hyps_dict.items():
-            this_batch = []
-            assert len(hyps) == len(texts)
-            for hyp_words, ref_text in zip(hyps, texts):
-                ref_words = ref_text.split()
-                this_batch.append((ref_words, hyp_words))
-
-            results[name].extend(this_batch)
-
-        num_cuts += len(texts)
-
-        if batch_idx % log_interval == 0:
-            batch_str = f"{batch_idx}/{num_batches}"
-
-            logging.info(
-                f"batch {batch_str}, cuts processed until now is {num_cuts}"
-            )
-    return results
-
-
-def save_results(
-    params: AttributeDict,
-    test_set_name: str,
-    results_dict: Dict[str, List[Tuple[List[int], List[int]]]],
-):
-    test_set_wers = dict()
-    for key, results in results_dict.items():
-        recog_path = (
-            params.res_dir / f"recogs-{test_set_name}-{key}-{params.suffix}.txt"
-        )
-        store_transcripts(filename=recog_path, texts=results)
-        logging.info(f"The transcripts are stored in {recog_path}")
-
-        # The following prints out WERs, per-word error statistics and aligned
-        # ref/hyp pairs.
-        errs_filename = (
-            params.res_dir / f"errs-{test_set_name}-{key}-{params.suffix}.txt"
-        )
-        with open(errs_filename, "w") as f:
-            wer = write_error_stats(
-                f, f"{test_set_name}-{key}", results, enable_log=True
-            )
-            test_set_wers[key] = wer
-
-        logging.info("Wrote detailed error stats to {}".format(errs_filename))
-
-    test_set_wers = sorted(test_set_wers.items(), key=lambda x: x[1])
-    errs_info = (
-        params.res_dir
-        / f"wer-summary-{test_set_name}-{key}-{params.suffix}.txt"
-    )
-    with open(errs_info, "w") as f:
-        print("settings\tWER", file=f)
-        for key, val in test_set_wers:
-            print("{}\t{}".format(key, val), file=f)
-
-    s = "\nFor {}, WER of different settings are:\n".format(test_set_name)
-    note = "\tbest for {}".format(test_set_name)
-    for key, val in test_set_wers:
-        s += "{}\t{}{}\n".format(key, val, note)
-        note = ""
-    logging.info(s)
-
-
-@torch.no_grad()
-def main():
-    parser = get_parser()
-    LibriSpeechAsrDataModule.add_arguments(parser)
-    args = parser.parse_args()
-    args.exp_dir = Path(args.exp_dir)
-
-    params = get_params()
-    params.update(vars(args))
-
-    assert params.decoding_method in (
-        "greedy_search",
-        "beam_search",
-        "fast_beam_search",
-        "modified_beam_search",
-    )
-    params.res_dir = params.exp_dir / params.decoding_method
-
-    params.suffix = f"epoch-{params.epoch}-avg-{params.avg}"
-    if "fast_beam_search" in params.decoding_method:
-        params.suffix += f"-beam-{params.beam}"
-        params.suffix += f"-max-contexts-{params.max_contexts}"
-        params.suffix += f"-max-states-{params.max_states}"
-    elif "beam_search" in params.decoding_method:
-        params.suffix += f"-beam-{params.beam_size}"
-    else:
-        params.suffix += f"-context-{params.context_size}"
-        params.suffix += f"-max-sym-per-frame-{params.max_sym_per_frame}"
-
-    setup_logger(f"{params.res_dir}/log-decode-{params.suffix}")
-    logging.info("Decoding started")
-
-    device = torch.device("cpu")
-    if torch.cuda.is_available():
-        device = torch.device("cuda", 0)
-
-    logging.info(f"Device: {device}")
-
-    sp = spm.SentencePieceProcessor()
-    sp.load(params.bpe_model)
-
-    # <blk> is defined in local/train_bpe_model.py
-    params.blank_id = sp.piece_to_id("<blk>")
-    params.unk_id = sp.piece_to_id("<unk>")
-    params.vocab_size = sp.get_piece_size()
-
-    logging.info(params)
-
-    logging.info("About to create model")
-    model = get_transducer_model(params)
-
-    if params.avg_last_n > 0:
-        filenames = find_checkpoints(params.exp_dir)[: params.avg_last_n]
-        logging.info(f"averaging {filenames}")
-        model.to(device)
-        model.load_state_dict(average_checkpoints(filenames, device=device))
-    elif params.avg == 1:
-        load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
-    else:
-        start = params.epoch - params.avg + 1
-        filenames = []
-        for i in range(start, params.epoch + 1):
-            if start >= 0:
-                filenames.append(f"{params.exp_dir}/epoch-{i}.pt")
-        logging.info(f"averaging {filenames}")
-        model.to(device)
-        model.load_state_dict(average_checkpoints(filenames, device=device))
-
-    model.to(device)
-    model.eval()
-    model.device = device
-
-    if params.decoding_method == "fast_beam_search":
-        decoding_graph = k2.trivial_graph(params.vocab_size - 1, device=device)
-    else:
-        decoding_graph = None
-
-    num_param = sum([p.numel() for p in model.parameters()])
-    logging.info(f"Number of model parameters: {num_param}")
-
-    librispeech = LibriSpeechAsrDataModule(args)
-
-    test_clean_cuts = librispeech.test_clean_cuts()
-    test_other_cuts = librispeech.test_other_cuts()
-
-    test_clean_dl = librispeech.test_dataloaders(test_clean_cuts)
-    test_other_dl = librispeech.test_dataloaders(test_other_cuts)
-
-    test_sets = ["test-clean", "test-other"]
-    test_dl = [test_clean_dl, test_other_dl]
-
-    for test_set, test_dl in zip(test_sets, test_dl):
-        results_dict = decode_dataset(
-            dl=test_dl,
-            params=params,
-            model=model,
-            sp=sp,
-            decoding_graph=decoding_graph,
-        )
-
-        save_results(
-            params=params,
-            test_set_name=test_set,
-            results_dict=results_dict,
-        )
-
-    logging.info("Done!")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/egs/librispeech/ASR/conv_emformer_transducer/decoder.py b/egs/librispeech/ASR/conv_emformer_transducer/decoder.py
deleted file mode 120000
index 0d5f10dc0..000000000
--- a/egs/librispeech/ASR/conv_emformer_transducer/decoder.py
+++ /dev/null
@@ -1 +0,0 @@
-../pruned_transducer_stateless/decoder.py
\ No newline at end of file
diff --git a/egs/librispeech/ASR/conv_emformer_transducer/emformer.py b/egs/librispeech/ASR/conv_emformer_transducer/emformer.py
deleted file mode 100644
index 3c520f5c3..000000000
--- a/egs/librispeech/ASR/conv_emformer_transducer/emformer.py
+++ /dev/null
@@ -1,1770 +0,0 @@
-# Copyright      2022  Xiaomi Corporation     (Author: Zengwei Yao)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# It is modified based on https://github.com/pytorch/audio/blob/main/torchaudio/models/emformer.py.  # noqa
-
-import math
-import warnings
-from typing import List, Optional, Tuple
-
-import torch
-import torch.nn as nn
-from encoder_interface import EncoderInterface
-from subsampling import Conv2dSubsampling, VggSubsampling
-
-from icefall.utils import make_pad_mask
-
-
-def _gen_attention_mask_block(
-    col_widths: List[int],
-    col_mask: List[bool],
-    num_rows: int,
-    device: torch.device,
-) -> torch.Tensor:
-    assert len(col_widths) == len(
-        col_mask
-    ), "Length of col_widths must match that of col_mask"
-
-    mask_block = [
-        torch.ones(num_rows, col_width, device=device)
-        if is_ones_col
-        else torch.zeros(num_rows, col_width, device=device)
-        for col_width, is_ones_col in zip(col_widths, col_mask)
-    ]
-    return torch.cat(mask_block, dim=1)
-
-
-class EmformerAttention(nn.Module):
-    r"""Emformer layer attention module.
-
-    Args:
-      embed_dim (int):
-        Embedding dimension.
-      nhead (int):
-        Number of attention heads in each Emformer layer.
-      tanh_on_mem (bool, optional):
-        If ``True``, applies tanh to memory elements. (Default: ``False``)
-      negative_inf (float, optional):
-        Value to use for negative infinity in attention weights. (Default: -1e8)
-    """
-
-    def __init__(
-        self,
-        embed_dim: int,
-        nhead: int,
-        tanh_on_mem: bool = False,
-        negative_inf: float = -1e8,
-    ):
-        super().__init__()
-
-        if embed_dim % nhead != 0:
-            raise ValueError(
-                f"embed_dim ({embed_dim}) is not a multiple of"
-                f"nhead ({nhead})."
-            )
-
-        self.embed_dim = embed_dim
-        self.nhead = nhead
-        self.tanh_on_mem = tanh_on_mem
-        self.negative_inf = negative_inf
-        self.head_dim = embed_dim // nhead
-
-        self.scaling = self.head_dim ** -0.5
-
-        self.emb_to_key_value = nn.Linear(embed_dim, 2 * embed_dim, bias=True)
-        self.emb_to_query = nn.Linear(embed_dim, embed_dim, bias=True)
-        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=True)
-
-        # linear transformation for positional encoding.
-        self.linear_pos = nn.Linear(embed_dim, embed_dim, bias=False)
-
-        # these two learnable bias are used in matrix c and matrix d
-        # as described in "Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context" Section 3.3  # noqa
-        self.pos_bias_u = nn.Parameter(torch.Tensor(nhead, self.head_dim))
-        self.pos_bias_v = nn.Parameter(torch.Tensor(nhead, self.head_dim))
-
-        self._reset_parameters()
-
-    def _reset_parameters(self) -> None:
-        nn.init.xavier_uniform_(self.emb_to_key_value.weight)
-        nn.init.constant_(self.emb_to_key_value.bias, 0.0)
-
-        nn.init.xavier_uniform_(self.emb_to_query.weight)
-        nn.init.constant_(self.emb_to_query.bias, 0.0)
-
-        nn.init.xavier_uniform_(self.out_proj.weight)
-        nn.init.constant_(self.out_proj.bias, 0.0)
-
-        nn.init.xavier_uniform_(self.linear_pos.weight)
-
-        nn.init.xavier_uniform_(self.pos_bias_u)
-        nn.init.xavier_uniform_(self.pos_bias_v)
-
-    def _gen_attention_probs(
-        self,
-        attention_weights: torch.Tensor,
-        attention_mask: torch.Tensor,
-        padding_mask: Optional[torch.Tensor],
-    ) -> torch.Tensor:
-        """Given the entire attention weights, mask out unecessary connections
-        and optionally with padding positions, to obtain underlying chunk-wise
-        attention probabilities.
-
-        B: batch size;
-        Q: length of query;
-        KV: length of key and value.
-
-        Args:
-          attention_weights (torch.Tensor):
-            Attention weights computed on the entire concatenated tensor
-            with shape (B * nhead, Q, KV).
-          attention_mask (torch.Tensor):
-            Mask tensor where chunk-wise connections are filled with `False`,
-            and other unnecessary connections are filled with `True`,
-            with shape (Q, KV).
-          padding_mask (torch.Tensor, optional):
-            Mask tensor where the padding positions are fill with `True`,
-            and other positions are filled with `False`, with shapa `(B, KV)`.
-
-        Returns:
-          A tensor of shape (B * nhead, Q, KV).
-        """
-        attention_weights_float = attention_weights.float()
-        attention_weights_float = attention_weights_float.masked_fill(
-            attention_mask.unsqueeze(0), self.negative_inf
-        )
-        if padding_mask is not None:
-            Q = attention_weights.size(1)
-            B = attention_weights.size(0) // self.nhead
-            attention_weights_float = attention_weights_float.view(
-                B, self.nhead, Q, -1
-            )
-            attention_weights_float = attention_weights_float.masked_fill(
-                padding_mask.unsqueeze(1).unsqueeze(2).to(torch.bool),
-                self.negative_inf,
-            )
-            attention_weights_float = attention_weights_float.view(
-                B * self.nhead, Q, -1
-            )
-
-        attention_probs = nn.functional.softmax(
-            attention_weights_float, dim=-1
-        ).type_as(attention_weights)
-
-        return attention_probs
-
-    def _rel_shift(self, x: torch.Tensor) -> torch.Tensor:
-        """Compute relative positional encoding.
-
-        Args:
-          x: Input tensor, of shape (B, nhead, U, PE).
-             U is the length of query vector.
-             For non-infer mode, PE = 2 * U - 1;
-             for infer mode, PE = L + 2 * U - 1.
-
-        Returns:
-          A tensor of shape (B, nhead, U, out_len).
-          For non-infer mode, out_len = U;
-          for infer mode, out_len = L + U.
-        """
-        B, nhead, U, PE = x.size()
-        B_stride = x.stride(0)
-        nhead_stride = x.stride(1)
-        U_stride = x.stride(2)
-        PE_stride = x.stride(3)
-        out_len = PE - (U - 1)
-        return x.as_strided(
-            size=(B, nhead, U, out_len),
-            stride=(B_stride, nhead_stride, U_stride - PE_stride, PE_stride),
-            storage_offset=PE_stride * (U - 1),
-        )
-
-    def _forward_impl(
-        self,
-        utterance: torch.Tensor,
-        lengths: torch.Tensor,
-        right_context: torch.Tensor,
-        summary: torch.Tensor,
-        memory: torch.Tensor,
-        attention_mask: torch.Tensor,
-        pos_emb: torch.Tensor,
-        left_context_key: Optional[torch.Tensor] = None,
-        left_context_val: Optional[torch.Tensor] = None,
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
-        """Underlying chunk-wise attention implementation.
-
-        L: length of left_context;
-        S: length of summary;
-        M: length of memory;
-        Q: length of attention query;
-        KV: length of attention key and value.
-
-        1) Concat right_context, utterance, summary,
-        and compute query tensor with length Q = R + U + S.
-        2) Concat memory, right_context, utterance,
-        and compute key, value tensors with length KV = M + R + U;
-        optionally with left_context_key and left_context_val (inference mode),
-        then KV = M + R + L + U.
-        3) Compute entire attention scores with query, key, and value,
-        then apply attention_mask to get underlying chunk-wise attention scores.
-
-        Args:
-          utterance (torch.Tensor):
-            Utterance frames, with shape (U, B, D).
-          lengths (torch.Tensor):
-            With shape (B,) and i-th element representing
-            number of valid frames for i-th batch element in utterance.
-          right_context (torch.Tensor):
-            Right context frames, with shape (R, B, D).
-          summary (torch.Tensor):
-            Summary elements, with shape (S, B, D).
-          memory (torch.Tensor):
-            Memory elements, with shape (M, B, D).
-          attention_mask (torch.Tensor):
-            Attention mask for underlying attention, with shape (Q, KV).
-          pos_emb (torch.Tensor):
-            Position encoding embedding, with shape (PE, D).
-            For training mode, PE = 2*U-1;
-            For infer mode, PE = L+2*U-1.
-          left_context_key (torch,Tensor, optional):
-            Cached attention key of left context from preceding computation,
-            with shape (L, B, D).
-          left_context_val (torch.Tensor, optional):
-            Cached attention value of left context from preceding computation,
-            with shape (L, B, D).
-
-        Returns:
-          A tuple containing 4 tensors:
-            - output of right context and utterance, with shape (R + U, B, D).
-            - memory output, with shape (S, B, D).
-            - attention key, with shape (KV, B, D).
-            - attention value, with shape (KV, B, D).
-        """
-        U, B, _ = utterance.size()
-        R = right_context.size(0)
-        M = memory.size(0)
-
-        # Compute query with [right context, utterance, summary].
-        query = self.emb_to_query(
-            torch.cat([right_context, utterance, summary])
-        )
-        # Compute key and value with [mems, right context, utterance].
-        key, value = self.emb_to_key_value(
-            torch.cat([memory, right_context, utterance])
-        ).chunk(chunks=2, dim=2)
-
-        if left_context_key is not None and left_context_val is not None:
-            # This is for inference mode. Now compute key and value with
-            # [mems, right context, left context, uttrance]
-            key = torch.cat(
-                [key[: M + R], left_context_key, key[M + R :]]  # noqa
-            )
-            value = torch.cat(
-                [value[: M + R], left_context_val, value[M + R :]]  # noqa
-            )
-        Q = query.size(0)
-        KV = key.size(0)
-
-        reshaped_key, reshaped_value = [
-            tensor.contiguous()
-            .view(KV, B * self.nhead, self.head_dim)
-            .transpose(0, 1)
-            for tensor in [key, value]
-        ]  # (B * nhead, KV, head_dim)
-        reshaped_query = query.contiguous().view(
-            Q, B, self.nhead, self.head_dim
-        )
-
-        # compute attention matrix ac
-        query_with_bais_u = (
-            (reshaped_query + self.pos_bias_u)
-            .view(Q, B * self.nhead, self.head_dim)
-            .transpose(0, 1)
-        )
-        matrix_ac = torch.bmm(
-            query_with_bais_u, reshaped_key.transpose(1, 2)
-        )  # (B * nhead, Q, KV)
-
-        # compute attention matrix bd
-        utterance_with_bais_v = (
-            reshaped_query[R : R + U] + self.pos_bias_v
-        ).permute(1, 2, 0, 3)
-        # (B, nhead, U, head_dim)
-        PE = pos_emb.size(0)
-        if left_context_key is not None and left_context_val is not None:
-            L = left_context_key.size(0)
-            assert PE == L + 2 * U - 1
-        else:
-            assert PE == 2 * U - 1
-        pos_emb = (
-            self.linear_pos(pos_emb)
-            .view(PE, self.nhead, self.head_dim)
-            .transpose(0, 1)
-            .unsqueeze(0)
-        )  # (1, nhead, PE, head_dim)
-        matrix_bd_utterance = torch.matmul(
-            utterance_with_bais_v, pos_emb.transpose(-2, -1)
-        )  # (B, nhead, U, PE)
-        # rel-shift
-        matrix_bd_utterance = self._rel_shift(
-            matrix_bd_utterance
-        )  # (B, nhead, U, U or L + U)
-        matrix_bd_utterance = matrix_bd_utterance.contiguous().view(
-            B * self.nhead, U, -1
-        )
-        matrix_bd = torch.zeros_like(matrix_ac)
-        matrix_bd[:, R : R + U, M + R :] = matrix_bd_utterance
-
-        attention_weights = (matrix_ac + matrix_bd) * self.scaling
-
-        # Compute padding mask
-        if B == 1:
-            padding_mask = None
-        else:
-            padding_mask = make_pad_mask(KV - U + lengths)
-
-        # Compute attention probabilities.
-        attention_probs = self._gen_attention_probs(
-            attention_weights, attention_mask, padding_mask
-        )
-
-        # Compute attention.
-        attention = torch.bmm(attention_probs, reshaped_value)
-        assert attention.shape == (B * self.nhead, Q, self.head_dim)
-        attention = (
-            attention.transpose(0, 1).contiguous().view(Q, B, self.embed_dim)
-        )
-
-        # Apply output projection.
-        outputs = self.out_proj(attention)
-
-        output_right_context_utterance = outputs[: R + U]
-        output_memory = outputs[R + U :]
-        if self.tanh_on_mem:
-            output_memory = torch.tanh(output_memory)
-        else:
-            output_memory = torch.clamp(output_memory, min=-10, max=10)
-
-        return output_right_context_utterance, output_memory, key, value
-
-    def forward(
-        self,
-        utterance: torch.Tensor,
-        lengths: torch.Tensor,
-        right_context: torch.Tensor,
-        summary: torch.Tensor,
-        memory: torch.Tensor,
-        attention_mask: torch.Tensor,
-        pos_emb: torch.Tensor,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        # TODO: Modify docs.
-        """Forward pass for training.
-
-        B: batch size;
-        D: embedding dimension;
-        R: length of right_context;
-        U: length of utterance;
-        S: length of summary;
-        M: length of memory.
-
-        Args:
-          utterance (torch.Tensor):
-            Utterance frames, with shape (U, B, D).
-          lengths (torch.Tensor):
-            With shape (B,) and i-th element representing
-            number of valid frames for i-th batch element in utterance.
-          right_context (torch.Tensor):
-            Right context frames, with shape (R, B, D).
-          summary (torch.Tensor):
-            Summary elements, with shape (S, B, D).
-          memory (torch.Tensor):
-            Memory elements, with shape (M, B, D).
-          attention_mask (torch.Tensor):
-            Attention mask for underlying chunk-wise attention,
-            with shape (Q, KV), where Q = R + U + S, KV = M + R + U.
-          pos_emb (torch.Tensor):
-            Position encoding embedding, with shape (PE, D).
-            For training mode, P = 2*U-1.
-
-        Returns:
-          A tuple containing 2 tensors:
-            - output of right context and utterance, with shape (R + U, B, D).
-            - memory output, with shape (M, B, D), where M = S - 1 or M = 0.
-        """
-        (
-            output_right_context_utterance,
-            output_memory,
-            _,
-            _,
-        ) = self._forward_impl(
-            utterance,
-            lengths,
-            right_context,
-            summary,
-            memory,
-            attention_mask,
-            pos_emb,
-        )
-        return output_right_context_utterance, output_memory[:-1]
-
-    @torch.jit.export
-    def infer(
-        self,
-        utterance: torch.Tensor,
-        lengths: torch.Tensor,
-        right_context: torch.Tensor,
-        summary: torch.Tensor,
-        memory: torch.Tensor,
-        left_context_key: torch.Tensor,
-        left_context_val: torch.Tensor,
-        pos_emb: torch.Tensor,
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
-        """Forward pass for inference.
-
-        B: batch size;
-        D: embedding dimension;
-        R: length of right_context;
-        U: length of utterance;
-        L: length of left_context;
-        S: length of summary;
-        M: length of memory;
-
-        Args:
-          utterance (torch.Tensor):
-            Utterance frames, with shape (U, B, D).
-          lengths (torch.Tensor):
-            With shape (B,) and i-th element representing
-            number of valid frames for i-th batch element in utterance.
-          right_context (torch.Tensor):
-            Right context frames, with shape (R, B, D).
-          summary (torch.Tensor):
-            Summary element, with shape (1, B, D), or empty.
-          memory (torch.Tensor):
-            Memory elements, with shape (M, B, D).
-          left_context_key (torch,Tensor):
-            Cached attention key of left context from preceding computation,
-            with shape (L, B, D).
-          left_context_val (torch.Tensor):
-            Cached attention value of left context from preceding computation,
-            with shape (L, B, D).
-          pos_emb (torch.Tensor):
-            Position encoding embedding, with shape (PE, D).
-            For infer mode, PE = L+2*U-1.
-
-        Returns:
-          A tuple containing 4 tensors:
-            - output of right context and utterance, with shape (R + U, B, D).
-            - memory output, with shape (1, B, D) or (0, B, D).
-            - attention key of left context and utterance, which would be cached
-              for next computation, with shape (L + U, B, D).
-            - attention value of left context and utterance, which would be
-              cached for next computation, with shape (L + U, B, D).
-        """
-        # query: [right context, utterance, summary]
-        Q = right_context.size(0) + utterance.size(0) + summary.size(0)
-        # key, value: [memory, right context, left context, uttrance]
-        KV = (
-            memory.size(0)
-            + right_context.size(0)  # noqa
-            + left_context_key.size(0)  # noqa
-            + utterance.size(0)  # noqa
-        )
-        attention_mask = torch.zeros(Q, KV).to(
-            dtype=torch.bool, device=utterance.device
-        )
-        # Disallow attention bettween the summary vector with the memory bank
-        attention_mask[-1, : memory.size(0)] = True
-        (
-            output_right_context_utterance,
-            output_memory,
-            key,
-            value,
-        ) = self._forward_impl(
-            utterance,
-            lengths,
-            right_context,
-            summary,
-            memory,
-            attention_mask,
-            pos_emb,
-            left_context_key=left_context_key,
-            left_context_val=left_context_val,
-        )
-        right_context_end_idx = memory.size(0) + right_context.size(0)
-        return (
-            output_right_context_utterance,
-            output_memory,
-            key[right_context_end_idx:],
-            value[right_context_end_idx:],
-        )
-
-
-class EmformerLayer(nn.Module):
-    """Emformer layer that constitutes Emformer.
-
-    Args:
-      d_model (int):
-        Input dimension.
-      nhead (int):
-        Number of attention heads.
-      dim_feedforward (int):
-        Hidden layer dimension of feedforward network.
-      chunk_length (int):
-        Length of each input segment.
-      dropout (float, optional):
-        Dropout probability. (Default: 0.0)
-      cnn_module_kernel (int):
-        Kernel size of convolution module.
-      left_context_length (int, optional):
-        Length of left context. (Default: 0)
-      max_memory_size (int, optional):
-        Maximum number of memory elements to use. (Default: 0)
-      tanh_on_mem (bool, optional):
-        If ``True``, applies tanh to memory elements. (Default: ``False``)
-      negative_inf (float, optional):
-        Value to use for negative infinity in attention weights. (Default: -1e8)
-      causal (bool):
-        Whether use causal convolution (default=False).
-    """
-
-    def __init__(
-        self,
-        d_model: int,
-        nhead: int,
-        dim_feedforward: int,
-        chunk_length: int,
-        dropout: float = 0.0,
-        cnn_module_kernel: int = 3,
-        left_context_length: int = 0,
-        max_memory_size: int = 0,
-        tanh_on_mem: bool = False,
-        negative_inf: float = -1e8,
-        causal: bool = True,
-    ):
-        super().__init__()
-
-        self.attention = EmformerAttention(
-            embed_dim=d_model,
-            nhead=nhead,
-            tanh_on_mem=tanh_on_mem,
-            negative_inf=negative_inf,
-        )
-        self.summary_op = nn.AvgPool1d(
-            kernel_size=chunk_length, stride=chunk_length, ceil_mode=True
-        )
-
-        self.feed_forward_macaron = nn.Sequential(
-            nn.Linear(d_model, dim_feedforward),
-            Swish(),
-            nn.Dropout(dropout),
-            nn.Linear(dim_feedforward, d_model),
-        )
-
-        self.feed_forward = nn.Sequential(
-            nn.Linear(d_model, dim_feedforward),
-            Swish(),
-            nn.Dropout(dropout),
-            nn.Linear(dim_feedforward, d_model),
-        )
-
-        self.conv_module = ConvolutionModule(
-            d_model,
-            cnn_module_kernel,
-            causal=causal,
-        )
-
-        self.norm_ff_macaron = nn.LayerNorm(d_model)
-        self.norm_ff = nn.LayerNorm(d_model)
-        self.norm_mha = nn.LayerNorm(d_model)
-        self.norm_conv = nn.LayerNorm(d_model)
-        self.norm_final = nn.LayerNorm(d_model)
-
-        self.dropout = nn.Dropout(dropout)
-
-        self.ff_scale = 0.5
-        self.left_context_length = left_context_length
-        self.chunk_length = chunk_length
-        self.max_memory_size = max_memory_size
-        self.d_model = d_model
-        self.use_memory = max_memory_size > 0
-
-    def _init_state(
-        self, batch_size: int, device: Optional[torch.device]
-    ) -> List[torch.Tensor]:
-        """Initialize states with zeros."""
-        empty_memory = torch.zeros(
-            self.max_memory_size, batch_size, self.d_model, device=device
-        )
-        left_context_key = torch.zeros(
-            self.left_context_length, batch_size, self.d_model, device=device
-        )
-        left_context_val = torch.zeros(
-            self.left_context_length, batch_size, self.d_model, device=device
-        )
-        past_length = torch.zeros(
-            1, batch_size, dtype=torch.int32, device=device
-        )
-        return [empty_memory, left_context_key, left_context_val, past_length]
-
-    def _unpack_state(
-        self, state: List[torch.Tensor]
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        """Unpack cached states including:
-        1) output memory from previous chunks in the lower layer;
-        2) attention key and value of left context from proceeding chunk's
-        computation.
-        """
-        past_length = state[3][0][0].item()
-        past_left_context_length = min(self.left_context_length, past_length)
-        past_memory_length = min(
-            self.max_memory_size, math.ceil(past_length / self.chunk_length)
-        )
-        memory_start_idx = self.max_memory_size - past_memory_length
-        pre_memory = state[0][memory_start_idx:]
-        left_context_start_idx = (
-            self.left_context_length - past_left_context_length
-        )
-        left_context_key = state[1][left_context_start_idx:]
-        left_context_val = state[2][left_context_start_idx:]
-        return pre_memory, left_context_key, left_context_val
-
-    def _pack_state(
-        self,
-        next_key: torch.Tensor,
-        next_val: torch.Tensor,
-        update_length: int,
-        memory: torch.Tensor,
-        state: List[torch.Tensor],
-    ) -> List[torch.Tensor]:
-        """Pack updated states including:
-        1) output memory of current chunk in the lower layer;
-        2) attention key and value in current chunk's computation, which would
-        be resued in next chunk's computation.
-        3) length of current chunk.
-        """
-        new_memory = torch.cat([state[0], memory])
-        new_key = torch.cat([state[1], next_key])
-        new_val = torch.cat([state[2], next_val])
-        memory_start_idx = new_memory.size(0) - self.max_memory_size
-        state[0] = new_memory[memory_start_idx:]
-        key_start_idx = new_key.size(0) - self.left_context_length
-        state[1] = new_key[key_start_idx:]
-        val_start_idx = new_val.size(0) - self.left_context_length
-        state[2] = new_val[val_start_idx:]
-        state[3] = state[3] + update_length
-        return state
-
-    def _apply_macaron_feed_foward_module(
-        self, right_context_utterance: torch.Tensor
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        """Apply macaron style feed forward module."""
-        residual = right_context_utterance
-        right_context_utterance = self.norm_ff_macaron(right_context_utterance)
-        right_context_utterance = residual + self.ff_scale * self.dropout(
-            self.feed_forward_macaron(right_context_utterance)
-        )
-        return right_context_utterance
-
-    def _apply_feed_forward_module(
-        self, right_context_utterance: torch.Tensor
-    ) -> torch.Tensor:
-        """Apply feed forward module."""
-        residual = right_context_utterance
-        right_context_utterance = self.norm_ff(right_context_utterance)
-        right_context_utterance = residual + self.ff_scale * self.dropout(
-            self.feed_forward(right_context_utterance)
-        )
-        return right_context_utterance
-
-    def _apply_conv_module_forward(
-        self,
-        right_context_utterance: torch.Tensor,
-        right_context_end_idx: int,
-    ) -> torch.Tensor:
-        """Apply convolution module on utterance in non-infer mode."""
-        utterance = right_context_utterance[right_context_end_idx:]
-        right_context = right_context_utterance[:right_context_end_idx]
-
-        residual = utterance
-        utterance = self.norm_conv(utterance)
-        utterance, _ = self.conv_module(utterance)
-        utterance = residual + self.dropout(utterance)
-        right_context_utterance = torch.cat([right_context, utterance])
-        return right_context_utterance
-
-    def _apply_conv_module_infer(
-        self,
-        right_context_utterance: torch.Tensor,
-        right_context_end_idx: int,
-        conv_cache: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        """Apply convolution module on utterance in infer mode."""
-        utterance = right_context_utterance[right_context_end_idx:]
-        right_context = right_context_utterance[:right_context_end_idx]
-
-        residual = utterance
-        utterance = self.norm_conv(utterance)
-        utterance, conv_cache = self.conv_module(utterance, conv_cache)
-        utterance = residual + self.dropout(utterance)
-        right_context_utterance = torch.cat([right_context, utterance])
-        return right_context_utterance, conv_cache
-
-    def _apply_attention_module_forward(
-        self,
-        right_context_utterance: torch.Tensor,
-        right_context_end_idx: int,
-        lengths: torch.Tensor,
-        memory: torch.Tensor,
-        pos_emb: torch.Tensor,
-        attention_mask: Optional[torch.Tensor] = None,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        """Apply attention module in non-infer mode."""
-        if attention_mask is None:
-            raise ValueError(
-                "attention_mask must be not None in non-infer mode. "
-            )
-
-        residual = right_context_utterance
-        right_context_utterance = self.norm_mha(right_context_utterance)
-        utterance = right_context_utterance[right_context_end_idx:]
-        right_context = right_context_utterance[:right_context_end_idx]
-
-        if self.use_memory:
-            summary = self.summary_op(utterance.permute(1, 2, 0)).permute(
-                2, 0, 1
-            )
-        else:
-            summary = torch.empty(0).to(
-                dtype=utterance.dtype, device=utterance.device
-            )
-        output_right_context_utterance, output_memory = self.attention(
-            utterance=utterance,
-            lengths=lengths,
-            right_context=right_context,
-            summary=summary,
-            memory=memory,
-            attention_mask=attention_mask,
-            pos_emb=pos_emb,
-        )
-        right_context_utterance = residual + self.dropout(
-            output_right_context_utterance
-        )
-
-        return right_context_utterance, output_memory
-
-    def _apply_attention_module_infer(
-        self,
-        right_context_utterance: torch.Tensor,
-        right_context_end_idx: int,
-        lengths: torch.Tensor,
-        memory: torch.Tensor,
-        pos_emb: torch.Tensor,
-        state: Optional[List[torch.Tensor]] = None,
-    ) -> Tuple[torch.Tensor, torch.Tensor, List[torch.Tensor]]:
-        """Apply attention in infer mode.
-        1) Unpack cached states including:
-          - memory from previous chunks in the lower layer;
-          - attention key and value of left context from proceeding
-            chunk's compuation;
-        2) Apply attention computation;
-        3) Pack updated states including:
-          - output memory of current chunk in the lower layer;
-          - attention key and value in current chunk's computation, which would
-            be resued in next chunk's computation.
-          - length of current chunk.
-        """
-        residual = right_context_utterance
-        right_context_utterance = self.norm_mha(right_context_utterance)
-        utterance = right_context_utterance[right_context_end_idx:]
-        right_context = right_context_utterance[:right_context_end_idx]
-
-        if state is None:
-            state = self._init_state(utterance.size(1), device=utterance.device)
-        pre_memory, left_context_key, left_context_val = self._unpack_state(
-            state
-        )
-        if self.use_memory:
-            summary = self.summary_op(utterance.permute(1, 2, 0)).permute(
-                2, 0, 1
-            )
-            summary = summary[:1]
-        else:
-            summary = torch.empty(0).to(
-                dtype=utterance.dtype, device=utterance.device
-            )
-        # pos_emb is of shape [PE, D], PE = L + 2 * U - 1,
-        # the relative distance j - i of key(j) and query(i) is in range of [-(L + U - 1), (U - 1)]  # noqa
-        L = left_context_key.size(0)  # L <= left_context_length
-        U = utterance.size(0)
-        PE = L + 2 * U - 1
-        tot_PE = self.left_context_length + 2 * U - 1
-        assert pos_emb.size(0) == tot_PE
-        pos_emb = pos_emb[tot_PE - PE :]
-        (
-            output_right_context_utterance,
-            output_memory,
-            next_key,
-            next_val,
-        ) = self.attention.infer(
-            utterance=utterance,
-            lengths=lengths,
-            right_context=right_context,
-            summary=summary,
-            memory=pre_memory,
-            left_context_key=left_context_key,
-            left_context_val=left_context_val,
-            pos_emb=pos_emb,
-        )
-        right_context_utterance = residual + self.dropout(
-            output_right_context_utterance
-        )
-        state = self._pack_state(
-            next_key, next_val, utterance.size(0), memory, state
-        )
-        return right_context_utterance, output_memory, state
-
-    def forward(
-        self,
-        utterance: torch.Tensor,
-        lengths: torch.Tensor,
-        right_context: torch.Tensor,
-        memory: torch.Tensor,
-        attention_mask: torch.Tensor,
-        pos_emb: torch.Tensor,
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        r"""Forward pass for training.
-        1) Apply layer normalization on input utterance and right context
-        before attention;
-        2) Apply attention module, compute updated utterance, right context,
-        and memory;
-        3) Apply feed forward module and layer normalization on output utterance
-        and right context.
-
-        B: batch size;
-        D: embedding dimension;
-        R: length of right_context;
-        U: length of utterance;
-        M: length of memory.
-
-        Args:
-          utterance (torch.Tensor):
-            Utterance frames, with shape (U, B, D).
-          lengths (torch.Tensor):
-            With shape (B,) and i-th element representing
-            number of valid frames for i-th batch element in utterance.
-          right_context (torch.Tensor):
-            Right context frames, with shape (R, B, D).
-          memory (torch.Tensor):
-            Memory elements, with shape (M, B, D).
-          attention_mask (torch.Tensor):
-            Attention mask for underlying attention module,
-            with shape (Q, KV), where Q = R + U + S, KV = M + R + U.
-          pos_emb (torch.Tensor):
-            Position encoding embedding, with shape (PE, D).
-            For training mode, P = 2*U-1.
-
-        Returns:
-          A tuple containing 3 tensors:
-            - output utterance, with shape (U, B, D).
-            - output right context, with shape (R, B, D).
-            - output memory, with shape (M, B, D).
-        """
-        right_context_utterance = torch.cat([right_context, utterance])
-        right_context_end_idx = right_context.size(0)
-
-        right_context_utterance = self._apply_macaron_feed_foward_module(
-            right_context_utterance
-        )
-
-        (
-            right_context_utterance,
-            output_memory,
-        ) = self._apply_attention_module_forward(
-            right_context_utterance,
-            right_context_end_idx,
-            lengths,
-            memory,
-            pos_emb,
-            attention_mask,
-        )
-
-        right_context_utterance = self._apply_conv_module_forward(
-            right_context_utterance, right_context_end_idx
-        )
-
-        right_context_utterance = self._apply_feed_forward_module(
-            right_context_utterance
-        )
-
-        right_context_utterance = self.norm_final(right_context_utterance)
-
-        output_utterance = right_context_utterance[right_context_end_idx:]
-        output_right_context = right_context_utterance[:right_context_end_idx]
-        return output_utterance, output_right_context, output_memory
-
-    @torch.jit.export
-    def infer(
-        self,
-        utterance: torch.Tensor,
-        lengths: torch.Tensor,
-        right_context: torch.Tensor,
-        memory: torch.Tensor,
-        pos_emb: torch.Tensor,
-        state: Optional[List[torch.Tensor]] = None,
-        conv_cache: Optional[torch.Tensor] = None,
-    ) -> Tuple[torch.Tensor, torch.Tensor, List[torch.Tensor], torch.Tensor]:
-        """Forward pass for inference.
-
-         1) Apply layer normalization on input utterance and right context
-         before attention;
-         2) Apply attention module with cached state, compute updated utterance,
-         right context, and memory, and update state;
-         3) Apply feed forward module and layer normalization on output
-         utterance and right context.
-
-         B: batch size;
-         D: embedding dimension;
-         R: length of right_context;
-         U: length of utterance;
-         M: length of memory.
-
-        Args:
-           utterance (torch.Tensor):
-             Utterance frames, with shape (U, B, D).
-           lengths (torch.Tensor):
-             With shape (B,) and i-th element representing
-             number of valid frames for i-th batch element in utterance.
-           right_context (torch.Tensor):
-             Right context frames, with shape (R, B, D).
-           memory (torch.Tensor):
-             Memory elements, with shape (M, B, D).
-           state (List[torch.Tensor], optional):
-             List of tensors representing layer internal state generated in
-             preceding computation. (default=None)
-           pos_emb (torch.Tensor):
-             Position encoding embedding, with shape (PE, D).
-             For infer mode, PE = L+2*U-1.
-           conv_cache (torch.Tensor, optional):
-             Cache tensor of left context for causal convolution.
-
-         Returns:
-           (Tensor, Tensor, List[torch.Tensor], Tensor):
-             - output utterance, with shape (U, B, D);
-             - output right_context, with shape (R, B, D);
-             - output memory, with shape (1, B, D) or (0, B, D).
-             - output state.
-             - updated conv_cache.
-        """
-        right_context_utterance = torch.cat([right_context, utterance])
-        right_context_end_idx = right_context.size(0)
-
-        right_context_utterance = self._apply_macaron_feed_foward_module(
-            right_context_utterance
-        )
-
-        (
-            right_context_utterance,
-            output_memory,
-            output_state,
-        ) = self._apply_attention_module_infer(
-            right_context_utterance,
-            right_context_end_idx,
-            lengths,
-            memory,
-            pos_emb,
-            state,
-        )
-
-        right_context_utterance, conv_cache = self._apply_conv_module_infer(
-            right_context_utterance,
-            right_context_end_idx,
-            conv_cache,
-        )
-
-        right_context_utterance = self._apply_feed_forward_module(
-            right_context_utterance
-        )
-
-        right_context_utterance = self.norm_final(right_context_utterance)
-
-        output_utterance = right_context_utterance[right_context_end_idx:]
-        output_right_context = right_context_utterance[:right_context_end_idx]
-        return (
-            output_utterance,
-            output_right_context,
-            output_memory,
-            output_state,
-            conv_cache,
-        )
-
-
-class EmformerEncoder(nn.Module):
-    """Implements the Emformer architecture introduced in
-    *Emformer: Efficient Memory Transformer Based Acoustic Model for Low Latency
-    Streaming Speech Recognition*
-    [:footcite:`shi2021emformer`].
-
-    Args:
-      d_model (int):
-        Input dimension.
-      nhead (int):
-        Number of attention heads in each emformer layer.
-      dim_feedforward (int):
-        Hidden layer dimension of each emformer layer's feedforward network.
-      num_encoder_layers (int):
-        Number of emformer layers to instantiate.
-      chunk_length (int):
-        Length of each input segment.
-      dropout (float, optional):
-        Dropout probability. (default: 0.0)
-      left_context_length (int, optional):
-        Length of left context. (default: 0)
-      right_context_length (int, optional):
-        Length of right context. (default: 0)
-      max_memory_size (int, optional):
-        Maximum number of memory elements to use. (default: 0)
-      tanh_on_mem (bool, optional):
-        If ``true``, applies tanh to memory elements. (default: ``false``)
-      negative_inf (float, optional):
-        Value to use for negative infinity in attention weights. (default: -1e8)
-      causal (bool):
-        Whether use causal convolution (default=False).
-    """
-
-    def __init__(
-        self,
-        chunk_length: int,
-        d_model: int = 256,
-        nhead: int = 4,
-        dim_feedforward: int = 2048,
-        num_encoder_layers: int = 12,
-        dropout: float = 0.1,
-        cnn_module_kernel: int = 3,
-        left_context_length: int = 0,
-        right_context_length: int = 0,
-        max_memory_size: int = 0,
-        tanh_on_mem: bool = False,
-        negative_inf: float = -1e8,
-        causal: bool = True,
-    ):
-        super().__init__()
-
-        self.use_memory = max_memory_size > 0
-        self.init_memory_op = nn.AvgPool1d(
-            kernel_size=chunk_length,
-            stride=chunk_length,
-            ceil_mode=True,
-        )
-
-        self.emformer_layers = nn.ModuleList(
-            [
-                EmformerLayer(
-                    d_model,
-                    nhead,
-                    dim_feedforward,
-                    chunk_length,
-                    dropout=dropout,
-                    cnn_module_kernel=cnn_module_kernel,
-                    left_context_length=left_context_length,
-                    max_memory_size=max_memory_size,
-                    tanh_on_mem=tanh_on_mem,
-                    negative_inf=negative_inf,
-                    causal=causal,
-                )
-                for layer_idx in range(num_encoder_layers)
-            ]
-        )
-
-        self.encoder_pos = RelPositionalEncoding(d_model, dropout)
-
-        self.left_context_length = left_context_length
-        self.right_context_length = right_context_length
-        self.chunk_length = chunk_length
-        self.max_memory_size = max_memory_size
-
-    def _gen_right_context(self, x: torch.Tensor) -> torch.Tensor:
-        """Hard copy each chunk's right context and concat them."""
-        T = x.shape[0]
-        num_segs = math.ceil(
-            (T - self.right_context_length) / self.chunk_length
-        )
-        right_context_blocks = []
-        for seg_idx in range(num_segs - 1):
-            start = (seg_idx + 1) * self.chunk_length
-            end = start + self.right_context_length
-            right_context_blocks.append(x[start:end])
-        last_right_context_start_idx = T - self.right_context_length
-        right_context_blocks.append(x[last_right_context_start_idx:])
-        return torch.cat(right_context_blocks)
-
-    def _gen_attention_mask_col_widths(
-        self, chunk_idx: int, U: int
-    ) -> List[int]:
-        """Calculate column widths (key, value) in attention mask for the
-        chunk_idx chunk."""
-        num_chunks = math.ceil(U / self.chunk_length)
-        rc = self.right_context_length
-        lc = self.left_context_length
-        rc_start = chunk_idx * rc
-        rc_end = rc_start + rc
-        chunk_start = max(chunk_idx * self.chunk_length - lc, 0)
-        chunk_end = min((chunk_idx + 1) * self.chunk_length, U)
-        R = rc * num_chunks
-
-        if self.use_memory:
-            m_start = max(chunk_idx - self.max_memory_size, 0)
-            M = num_chunks - 1
-            col_widths = [
-                m_start,  # before memory
-                chunk_idx - m_start,  # memory
-                M - chunk_idx,  # after memory
-                rc_start,  # before right context
-                rc,  # right context
-                R - rc_end,  # after right context
-                chunk_start,  # before chunk
-                chunk_end - chunk_start,  # chunk
-                U - chunk_end,  # after chunk
-            ]
-        else:
-            col_widths = [
-                rc_start,  # before right context
-                rc,  # right context
-                R - rc_end,  # after right context
-                chunk_start,  # before chunk
-                chunk_end - chunk_start,  # chunk
-                U - chunk_end,  # after chunk
-            ]
-
-        return col_widths
-
-    def _gen_attention_mask(self, utterance: torch.Tensor) -> torch.Tensor:
-        """Generate attention mask for underlying chunk-wise attention
-        computation, where chunk-wise connections are filled with `False`,
-        and other unnecessary connections beyond chunk are filled with `True`.
-
-        R: length of right_context;
-        U: length of utterance;
-        S: length of summary;
-        M: length of memory;
-        Q: length of attention query;
-        KV: length of attention key and value.
-
-        The shape of attention mask is (Q, KV).
-        If self.use_memory is `True`:
-          query = [right_context, utterance, summary];
-          key, value = [memory, right_context, utterance];
-          Q = R + U + S, KV = M + R + U.
-        Otherwise:
-          query = [right_context, utterance]
-          key, value = [right_context, utterance]
-          Q = R + U, KV = R + U.
-        """
-        U = utterance.size(0)
-        num_chunks = math.ceil(U / self.chunk_length)
-
-        right_context_mask = []
-        utterance_mask = []
-        summary_mask = []
-
-        if self.use_memory:
-            num_cols = 9
-            # right context and utterance both attend to memory, right context,
-            # utterance
-            right_context_utterance_cols_mask = [
-                idx in [1, 4, 7] for idx in range(num_cols)
-            ]
-            # summary attends to right context, utterance
-            summary_cols_mask = [idx in [4, 7] for idx in range(num_cols)]
-            masks_to_concat = [right_context_mask, utterance_mask, summary_mask]
-        else:
-            num_cols = 6
-            # right context and utterance both attend to right context and
-            # utterance
-            right_context_utterance_cols_mask = [
-                idx in [1, 4] for idx in range(num_cols)
-            ]
-            summary_cols_mask = None
-            masks_to_concat = [right_context_mask, utterance_mask]
-
-        for chunk_idx in range(num_chunks):
-            col_widths = self._gen_attention_mask_col_widths(chunk_idx, U)
-
-            right_context_mask_block = _gen_attention_mask_block(
-                col_widths,
-                right_context_utterance_cols_mask,
-                self.right_context_length,
-                utterance.device,
-            )
-            right_context_mask.append(right_context_mask_block)
-
-            utterance_mask_block = _gen_attention_mask_block(
-                col_widths,
-                right_context_utterance_cols_mask,
-                min(
-                    self.chunk_length,
-                    U - chunk_idx * self.chunk_length,
-                ),
-                utterance.device,
-            )
-            utterance_mask.append(utterance_mask_block)
-
-            if summary_cols_mask is not None:
-                summary_mask_block = _gen_attention_mask_block(
-                    col_widths, summary_cols_mask, 1, utterance.device
-                )
-                summary_mask.append(summary_mask_block)
-
-        attention_mask = (
-            1 - torch.cat([torch.cat(mask) for mask in masks_to_concat])
-        ).to(torch.bool)
-        return attention_mask
-
-    def forward(
-        self, x: torch.Tensor, lengths: torch.Tensor
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        """Forward pass for training and non-streaming inference.
-
-        B: batch size;
-        D: input dimension;
-        U: length of utterance.
-
-        Args:
-          x (torch.Tensor):
-            Utterance frames right-padded with right context frames,
-            with shape (U + right_context_length, B, D).
-          lengths (torch.Tensor):
-            With shape (B,) and i-th element representing number of valid
-            utterance frames for i-th batch element in x, which contains the
-            right_context at the end.
-
-        Returns:
-          A tuple of 2 tensors:
-            - output utterance frames, with shape (U, B, D).
-            - output_lengths, with shape (B,), without containing the
-              right_context at the end.
-        """
-        U = x.size(0) - self.right_context_length
-        x, pos_emb = self.encoder_pos(x, pos_len=U, neg_len=U)
-
-        right_context = self._gen_right_context(x)
-        utterance = x[:U]
-        output_lengths = torch.clamp(lengths - self.right_context_length, min=0)
-        attention_mask = self._gen_attention_mask(utterance)
-        memory = (
-            self.init_memory_op(utterance.permute(1, 2, 0)).permute(2, 0, 1)[
-                :-1
-            ]
-            if self.use_memory
-            else torch.empty(0).to(dtype=x.dtype, device=x.device)
-        )
-
-        output = utterance
-        for layer in self.emformer_layers:
-            output, right_context, memory = layer(
-                output,
-                output_lengths,
-                right_context,
-                memory,
-                attention_mask,
-                pos_emb,
-            )
-
-        return output, output_lengths
-
-    @torch.jit.export
-    def infer(
-        self,
-        x: torch.Tensor,
-        lengths: torch.Tensor,
-        states: Optional[List[List[torch.Tensor]]] = None,
-        conv_caches: Optional[List[torch.Tensor]] = None,
-    ) -> Tuple[
-        torch.Tensor, torch.Tensor, List[List[torch.Tensor]], List[torch.Tensor]
-    ]:
-        """Forward pass for streaming inference.
-
-        B: batch size;
-        D: input dimension;
-        U: length of utterance.
-
-        Args:
-          x (torch.Tensor):
-            Utterance frames right-padded with right context frames,
-            with shape (U + right_context_length, B, D).
-          lengths (torch.Tensor):
-            With shape (B,) and i-th element representing number of valid
-            utterance frames for i-th batch element in x, which contains the
-            right_context at the end.
-          states (List[List[torch.Tensor]], optional):
-            Cached states from proceeding chunk's computation, where each
-            element (List[torch.Tensor]) corresponds to each emformer layer.
-            (default: None)
-          conv_caches (List[torch.Tensor], optional):
-            Cached tensors of left context for causal convolution, where each
-            element (Tensor) corresponds to each convolutional layer.
-        Returns:
-          (Tensor, Tensor, List[List[torch.Tensor]], List[torch.Tensor]):
-            - output utterance frames, with shape (U, B, D).
-            - output lengths, with shape (B,), without containing the
-              right_context at the end.
-            - updated states from current chunk's computation.
-            - updated convolution caches from current chunk.
-        """
-        assert x.size(0) == self.chunk_length + self.right_context_length, (
-            "Per configured chunk_length and right_context_length, "
-            f"expected size of {self.chunk_length + self.right_context_length} "
-            f"for dimension 1 of x, but got {x.size(1)}."
-        )
-
-        pos_len = self.chunk_length + self.left_context_length
-        neg_len = self.chunk_length
-        x, pos_emb = self.encoder_pos(x, pos_len=pos_len, neg_len=neg_len)
-
-        right_context_start_idx = x.size(0) - self.right_context_length
-        right_context = x[right_context_start_idx:]
-        utterance = x[:right_context_start_idx]
-        output_lengths = torch.clamp(lengths - self.right_context_length, min=0)
-        memory = (
-            self.init_memory_op(utterance.permute(1, 2, 0)).permute(2, 0, 1)
-            if self.use_memory
-            else torch.empty(0).to(dtype=x.dtype, device=x.device)
-        )
-        output = utterance
-        output_states: List[List[torch.Tensor]] = []
-        output_conv_caches: List[torch.Tensor] = []
-        for layer_idx, layer in enumerate(self.emformer_layers):
-            (
-                output,
-                right_context,
-                memory,
-                output_state,
-                output_conv_cache,
-            ) = layer.infer(
-                output,
-                output_lengths,
-                right_context,
-                memory,
-                pos_emb,
-                None if states is None else states[layer_idx],
-                None if conv_caches is None else conv_caches[layer_idx],
-            )
-            output_states.append(output_state)
-            output_conv_caches.append(output_conv_cache)
-
-        return output, output_lengths, output_states, output_conv_caches
-
-
-class Emformer(EncoderInterface):
-    def __init__(
-        self,
-        num_features: int,
-        output_dim: int,
-        chunk_length: int,
-        subsampling_factor: int = 4,
-        d_model: int = 256,
-        nhead: int = 4,
-        dim_feedforward: int = 2048,
-        num_encoder_layers: int = 12,
-        dropout: float = 0.1,
-        cnn_module_kernel: int = 3,
-        vgg_frontend: bool = False,
-        left_context_length: int = 0,
-        right_context_length: int = 0,
-        max_memory_size: int = 0,
-        tanh_on_mem: bool = False,
-        negative_inf: float = -1e8,
-        causal: bool = True,
-    ):
-        super().__init__()
-
-        self.subsampling_factor = subsampling_factor
-        self.right_context_length = right_context_length
-        if subsampling_factor != 4:
-            raise NotImplementedError("Support only 'subsampling_factor=4'.")
-        if chunk_length % 4 != 0:
-            raise NotImplementedError("chunk_length must be a mutiple of 4.")
-        if left_context_length != 0 and left_context_length % 4 != 0:
-            raise NotImplementedError(
-                "left_context_length must be 0 or a mutiple of 4."
-            )
-        if right_context_length != 0 and right_context_length % 4 != 0:
-            raise NotImplementedError(
-                "right_context_length must be 0 or a mutiple of 4."
-            )
-
-        # self.encoder_embed converts the input of shape (N, T, num_features)
-        # to the shape (N, T//subsampling_factor, d_model).
-        # That is, it does two things simultaneously:
-        #   (1) subsampling: T -> T//subsampling_factor
-        #   (2) embedding: num_features -> d_model
-        if vgg_frontend:
-            self.encoder_embed = VggSubsampling(num_features, d_model)
-        else:
-            self.encoder_embed = Conv2dSubsampling(num_features, d_model)
-
-        self.encoder = EmformerEncoder(
-            chunk_length // 4,
-            d_model,
-            nhead,
-            dim_feedforward,
-            num_encoder_layers,
-            dropout,
-            cnn_module_kernel,
-            left_context_length=left_context_length // 4,
-            right_context_length=right_context_length // 4,
-            max_memory_size=max_memory_size,
-            tanh_on_mem=tanh_on_mem,
-            negative_inf=negative_inf,
-            causal=causal,
-        )
-
-        # TODO(fangjun): remove dropout
-        self.encoder_output_layer = nn.Sequential(
-            nn.Dropout(p=dropout), nn.Linear(d_model, output_dim)
-        )
-
-    def forward(
-        self, x: torch.Tensor, x_lens: torch.Tensor
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        """Forward pass for training and non-streaming inference.
-
-        B: batch size;
-        D: feature dimension;
-        T: length of utterance.
-
-        Args:
-          x (torch.Tensor):
-            Utterance frames right-padded with right context frames,
-            with shape (B, T, D).
-          x_lens (torch.Tensor):
-            With shape (B,) and i-th element representing number of valid
-            utterance frames for i-th batch element in x, containing the
-            right_context at the end.
-
-        Returns:
-          (Tensor, Tensor):
-            - output logits, with shape (B, T', D), where
-              T' = ((T - 1) // 2 - 1) // 2 - self.right_context_length // 4.
-            - logits lengths, with shape (B,), without containing the
-              right_context at the end.
-        """
-        x = self.encoder_embed(x)
-        x = x.permute(1, 0, 2)  # (N, T, C) -> (T, N, C)
-
-        # Caution: We assume the subsampling factor is 4!
-        with warnings.catch_warnings():
-            warnings.simplefilter("ignore")
-            x_lens = ((x_lens - 1) // 2 - 1) // 2
-        assert x.size(0) == x_lens.max().item()
-
-        output, output_lengths = self.encoder(x, x_lens)  # (T, N, C)
-
-        logits = self.encoder_output_layer(output)
-        logits = logits.permute(1, 0, 2)  # (T, N, C) ->(N, T, C)
-
-        return logits, output_lengths
-
-    @torch.jit.export
-    def infer(
-        self,
-        x: torch.Tensor,
-        x_lens: torch.Tensor,
-        states: Optional[List[List[torch.Tensor]]] = None,
-        conv_caches: Optional[List[torch.Tensor]] = None,
-    ) -> Tuple[torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]:
-        """Forward pass for streaming inference.
-
-        B: batch size;
-        D: feature dimension;
-        T: length of utterance.
-
-        Args:
-          x (torch.Tensor):
-            Utterance frames right-padded with right context frames,
-            with shape (B, T, D).
-          lengths (torch.Tensor):
-            With shape (B,) and i-th element representing number of valid
-            utterance frames for i-th batch element in x, containing the
-            right_context at the end.
-          states (List[List[torch.Tensor]], optional):
-            Cached states from proceeding chunk's computation, where each
-            element (List[torch.Tensor]) corresponds to each emformer layer.
-            (default: None)
-          conv_caches (List[torch.Tensor], optional):
-            Cached tensors of left context for causal convolution, where each
-            element (Tensor) corresponds to each convolutional layer.
-        Returns:
-          (Tensor, Tensor):
-            - output logits, with shape (B, T', D), where
-              T' = ((T - 1) // 2 - 1) // 2 - self.right_context_length // 4.
-            - logits lengths, with shape (B,), without containing the
-              right_context at the end.
-            - updated states from current chunk's computation.
-            - updated convolution caches from current chunk.
-        """
-        x = self.encoder_embed(x)
-        x = x.permute(1, 0, 2)  # (N, T, C) -> (T, N, C)
-
-        # Caution: We assume the subsampling factor is 4!
-        with warnings.catch_warnings():
-            warnings.simplefilter("ignore")
-            x_lens = ((x_lens - 1) // 2 - 1) // 2
-        assert x.size(0) == x_lens.max().item()
-
-        (
-            output,
-            output_lengths,
-            output_states,
-            output_conv_caches,
-        ) = self.encoder.infer(x, x_lens, states, conv_caches)
-
-        logits = self.encoder_output_layer(output)
-        logits = logits.permute(1, 0, 2)  # (T, N, C) ->(N, T, C)
-
-        return logits, output_lengths, output_states, output_conv_caches
-
-
-class ConvolutionModule(nn.Module):
-    """ConvolutionModule in Conformer model.
-    Modified from https://github.com/espnet/espnet/blob/master/espnet/nets/pytorch_backend/conformer/convolution.py  # noqa
-
-    Args:
-      channels (int):
-        The number of channels of conv layers.
-      kernel_size (int):
-        Kernerl size of conv layers.
-      bias (bool):
-        Whether to use bias in conv layers (default=True).
-      causal (bool):
-        Whether use causal convolution (default=False).
-    """
-
-    def __init__(
-        self,
-        channels: int,
-        kernel_size: int,
-        bias: bool = True,
-        causal: bool = True,
-    ) -> None:
-        """Construct an ConvolutionModule object."""
-        super(ConvolutionModule, self).__init__()
-        # kernerl_size should be a odd number for 'SAME' padding
-        assert (kernel_size - 1) % 2 == 0
-
-        self.pointwise_conv1 = nn.Conv1d(
-            channels,
-            2 * channels,
-            kernel_size=1,
-            stride=1,
-            padding=0,
-            bias=bias,
-        )
-
-        # from https://github.com/wenet-e2e/wenet/blob/main/wenet/transformer/convolution.py  # noqa
-        if causal:
-            self.left_padding = kernel_size - 1
-            padding = 0
-        else:
-            self.left_padding = 0
-            padding = (kernel_size - 1) // 2
-        self.depthwise_conv = nn.Conv1d(
-            channels,
-            channels,
-            kernel_size,
-            stride=1,
-            padding=padding,
-            groups=channels,
-            bias=bias,
-        )
-        self.norm = nn.LayerNorm(channels)
-        self.pointwise_conv2 = nn.Conv1d(
-            channels,
-            channels,
-            kernel_size=1,
-            stride=1,
-            padding=0,
-            bias=bias,
-        )
-        self.activation = Swish()
-
-    def forward(
-        self,
-        x: torch.Tensor,
-        cache: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        """Compute convolution module.
-
-        Args:
-          x (torch.Tensor):
-            Input tensor (#time, batch, channels).
-          cache (torch.Tensor, optional):
-            Cached tensor for left padding (#batch, channels, cache_time).
-        Returns:
-          A tuple of 2 tensors:
-            - output tensor (#time, batch, channels).
-            - updated cache tensor (#batch, channels, cache_time).
-        """
-        # exchange the temporal dimension and the feature dimension
-        x = x.permute(1, 2, 0)  # (#batch, channels, time).
-
-        # 1D Depthwise Conv
-        if self.left_padding > 0:
-            # manualy padding self.lorder zeros to the left
-            # make depthwise_conv causal
-            if cache is None:
-                x = nn.functional.pad(
-                    x, (self.left_padding, 0), "constant", 0.0
-                )
-            else:
-                assert cache.size(0) == x.size(0)  # equal batch
-                assert cache.size(1) == x.size(1)  # equal channel
-                assert cache.size(2) == self.left_padding
-                x = torch.cat([cache, x], dim=2)
-            new_cache = x[:, :, x.size(2) - self.left_padding :]  # noqa
-        else:
-            # It's better we just return None if no cache is requried,
-            # However, for JIT export, here we just fake one tensor instead of
-            # None.
-            new_cache = None
-
-        # GLU mechanism
-        x = self.pointwise_conv1(x)  # (batch, 2*channels, time)
-        x = nn.functional.glu(x, dim=1)  # (batch, channels, time)
-
-        x = self.depthwise_conv(x)
-        # x is (batch, channels, time)
-        x = x.permute(0, 2, 1)
-        x = self.norm(x)
-        x = x.permute(0, 2, 1)
-
-        x = self.activation(x)
-
-        x = self.pointwise_conv2(x)  # (batch, channel, time)
-
-        return x.permute(2, 0, 1), new_cache
-
-
-class RelPositionalEncoding(torch.nn.Module):
-    """Relative positional encoding module.
-
-    See : Appendix B in "Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context"  # noqa
-    Modified from https://github.com/espnet/espnet/blob/master/espnet/nets/pytorch_backend/transformer/embedding.py  # noqa
-
-    Args:
-        d_model: Embedding dimension.
-        dropout_rate: Dropout rate.
-        max_len: Maximum input length.
-
-    """
-
-    def __init__(
-        self, d_model: int, dropout_rate: float, max_len: int = 5000
-    ) -> None:
-        """Construct an PositionalEncoding object."""
-        super(RelPositionalEncoding, self).__init__()
-        self.d_model = d_model
-        self.xscale = math.sqrt(self.d_model)
-        self.dropout = torch.nn.Dropout(p=dropout_rate)
-        self.pe = None
-        self.pos_len = max_len
-        self.neg_len = max_len
-        self.gen_pe()
-
-    def gen_pe(self) -> None:
-        """Generate the positional encodings."""
-        # Suppose `i` means to the position of query vecotr and `j` means the
-        # position of key vector. We use position relative positions when keys
-        # are to the left (i>j) and negative relative positions otherwise (i<j).
-        pe_positive = torch.zeros(self.pos_len, self.d_model)
-        pe_negative = torch.zeros(self.neg_len, self.d_model)
-        position_positive = torch.arange(
-            0, self.pos_len, dtype=torch.float32
-        ).unsqueeze(1)
-        position_negative = torch.arange(
-            0, self.neg_len, dtype=torch.float32
-        ).unsqueeze(1)
-        div_term = torch.exp(
-            torch.arange(0, self.d_model, 2, dtype=torch.float32)
-            * -(math.log(10000.0) / self.d_model)
-        )
-        pe_positive[:, 0::2] = torch.sin(position_positive * div_term)
-        pe_positive[:, 1::2] = torch.cos(position_positive * div_term)
-        pe_negative[:, 0::2] = torch.sin(-1 * position_negative * div_term)
-        pe_negative[:, 1::2] = torch.cos(-1 * position_negative * div_term)
-
-        # Reserve the order of positive indices and concat both positive and
-        # negative indices. This is used to support the shifting trick
-        # as in "Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context"  # noqa
-        self.pe_positive = torch.flip(pe_positive, [0])
-        self.pe_negative = pe_negative
-        # self.pe = torch.cat([pe_positive, pe_negative], dim=1)
-
-    def get_pe(
-        self,
-        pos_len: int,
-        neg_len: int,
-        device: torch.device,
-        dtype: torch.dtype,
-    ) -> torch.Tensor:
-        """Get positional encoding given positive length and negative length."""
-        if self.pe_positive.dtype != dtype or str(
-            self.pe_positive.device
-        ) != str(device):
-            self.pe_positive = self.pe_positive.to(dtype=dtype, device=device)
-        if self.pe_negative.dtype != dtype or str(
-            self.pe_negative.device
-        ) != str(device):
-            self.pe_negative = self.pe_negative.to(dtype=dtype, device=device)
-        pe = torch.cat(
-            [
-                self.pe_positive[self.pos_len - pos_len :],
-                self.pe_negative[1:neg_len],
-            ],
-            dim=0,
-        )
-        return pe
-
-    def forward(
-        self,
-        x: torch.Tensor,
-        pos_len: int,
-        neg_len: int,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        """Add positional encoding.
-
-        Args:
-            x (torch.Tensor): Input tensor (batch, time, `*`).
-
-        Returns:
-            torch.Tensor: Encoded tensor (batch, time, `*`).
-            torch.Tensor: Encoded tensor (batch, 2*time-1, `*`).
-
-        """
-        x = x * self.xscale
-        if pos_len > self.pos_len or neg_len > self.neg_len:
-            self.pos_len = pos_len
-            self.neg_len = neg_len
-            self.gen_pe()
-        pos_emb = self.get_pe(pos_len, neg_len, x.device, x.dtype)
-        return self.dropout(x), self.dropout(pos_emb)
-
-
-class Swish(torch.nn.Module):
-    """Construct an Swish object."""
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        """Return Swich activation function."""
-        return x * torch.sigmoid(x)
diff --git a/egs/librispeech/ASR/conv_emformer_transducer/encoder_interface.py b/egs/librispeech/ASR/conv_emformer_transducer/encoder_interface.py
deleted file mode 120000
index aa5d0217a..000000000
--- a/egs/librispeech/ASR/conv_emformer_transducer/encoder_interface.py
+++ /dev/null
@@ -1 +0,0 @@
-../transducer_stateless/encoder_interface.py
\ No newline at end of file
diff --git a/egs/librispeech/ASR/conv_emformer_transducer/joiner.py b/egs/librispeech/ASR/conv_emformer_transducer/joiner.py
deleted file mode 120000
index 81ad47c55..000000000
--- a/egs/librispeech/ASR/conv_emformer_transducer/joiner.py
+++ /dev/null
@@ -1 +0,0 @@
-../pruned_transducer_stateless/joiner.py
\ No newline at end of file
diff --git a/egs/librispeech/ASR/conv_emformer_transducer/model.py b/egs/librispeech/ASR/conv_emformer_transducer/model.py
deleted file mode 120000
index a61a0a23f..000000000
--- a/egs/librispeech/ASR/conv_emformer_transducer/model.py
+++ /dev/null
@@ -1 +0,0 @@
-../pruned_transducer_stateless/model.py
\ No newline at end of file
diff --git a/egs/librispeech/ASR/conv_emformer_transducer/noam.py b/egs/librispeech/ASR/conv_emformer_transducer/noam.py
deleted file mode 100644
index e46bf35fb..000000000
--- a/egs/librispeech/ASR/conv_emformer_transducer/noam.py
+++ /dev/null
@@ -1,104 +0,0 @@
-# Copyright    2021 University of Chinese Academy of Sciences (author: Han Zhu)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-
-
-class Noam(object):
-    """
-    Implements Noam optimizer.
-
-    Proposed in
-    "Attention Is All You Need", https://arxiv.org/pdf/1706.03762.pdf
-
-    Modified from
-    https://github.com/espnet/espnet/blob/master/espnet/nets/pytorch_backend/transformer/optimizer.py  # noqa
-
-    Args:
-      params:
-        iterable of parameters to optimize or dicts defining parameter groups
-      model_size:
-        attention dimension of the transformer model
-      factor:
-        learning rate factor
-      warm_step:
-        warmup steps
-    """
-
-    def __init__(
-        self,
-        params,
-        model_size: int = 256,
-        factor: float = 10.0,
-        warm_step: int = 25000,
-        weight_decay=0,
-    ) -> None:
-        """Construct an Noam object."""
-        self.optimizer = torch.optim.Adam(
-            params, lr=0, betas=(0.9, 0.98), eps=1e-9, weight_decay=weight_decay
-        )
-        self._step = 0
-        self.warmup = warm_step
-        self.factor = factor
-        self.model_size = model_size
-        self._rate = 0
-
-    @property
-    def param_groups(self):
-        """Return param_groups."""
-        return self.optimizer.param_groups
-
-    def step(self):
-        """Update parameters and rate."""
-        self._step += 1
-        rate = self.rate()
-        for p in self.optimizer.param_groups:
-            p["lr"] = rate
-        self._rate = rate
-        self.optimizer.step()
-
-    def rate(self, step=None):
-        """Implement `lrate` above."""
-        if step is None:
-            step = self._step
-        return (
-            self.factor
-            * self.model_size ** (-0.5)
-            * min(step ** (-0.5), step * self.warmup ** (-1.5))
-        )
-
-    def zero_grad(self):
-        """Reset gradient."""
-        self.optimizer.zero_grad()
-
-    def state_dict(self):
-        """Return state_dict."""
-        return {
-            "_step": self._step,
-            "warmup": self.warmup,
-            "factor": self.factor,
-            "model_size": self.model_size,
-            "_rate": self._rate,
-            "optimizer": self.optimizer.state_dict(),
-        }
-
-    def load_state_dict(self, state_dict):
-        """Load state_dict."""
-        for key, value in state_dict.items():
-            if key == "optimizer":
-                self.optimizer.load_state_dict(state_dict["optimizer"])
-            else:
-                setattr(self, key, value)
diff --git a/egs/librispeech/ASR/conv_emformer_transducer/subsampling.py b/egs/librispeech/ASR/conv_emformer_transducer/subsampling.py
deleted file mode 120000
index 6fee09e58..000000000
--- a/egs/librispeech/ASR/conv_emformer_transducer/subsampling.py
+++ /dev/null
@@ -1 +0,0 @@
-../conformer_ctc/subsampling.py
\ No newline at end of file
diff --git a/egs/librispeech/ASR/conv_emformer_transducer/test_emformer.py b/egs/librispeech/ASR/conv_emformer_transducer/test_emformer.py
deleted file mode 100644
index 971abca97..000000000
--- a/egs/librispeech/ASR/conv_emformer_transducer/test_emformer.py
+++ /dev/null
@@ -1,590 +0,0 @@
-import torch
-
-
-def test_emformer_attention_forward():
-    from emformer import EmformerAttention
-
-    B, D = 2, 256
-    U, R = 12, 2
-    chunk_length = 2
-    attention = EmformerAttention(embed_dim=D, nhead=8)
-
-    for use_memory in [True, False]:
-        if use_memory:
-            S = U // chunk_length
-            M = S - 1
-        else:
-            S, M = 0, 0
-
-        Q, KV = R + U + S, M + R + U
-        utterance = torch.randn(U, B, D)
-        lengths = torch.randint(1, U + 1, (B,))
-        lengths[0] = U
-        right_context = torch.randn(R, B, D)
-        summary = torch.randn(S, B, D)
-        memory = torch.randn(M, B, D)
-        attention_mask = torch.rand(Q, KV) >= 0.5
-
-        output_right_context_utterance, output_memory = attention(
-            utterance,
-            lengths,
-            right_context,
-            summary,
-            memory,
-            attention_mask,
-        )
-        assert output_right_context_utterance.shape == (R + U, B, D)
-        assert output_memory.shape == (M, B, D)
-
-
-def test_emformer_attention_infer():
-    from emformer import EmformerAttention
-
-    B, D = 2, 256
-    R, L = 4, 2
-    chunk_length = 2
-    U = chunk_length
-    attention = EmformerAttention(embed_dim=D, nhead=8)
-
-    for use_memory in [True, False]:
-        if use_memory:
-            S, M = 1, 3
-        else:
-            S, M = 0, 0
-
-        utterance = torch.randn(U, B, D)
-        lengths = torch.randint(1, U + 1, (B,))
-        lengths[0] = U
-        right_context = torch.randn(R, B, D)
-        summary = torch.randn(S, B, D)
-        memory = torch.randn(M, B, D)
-        left_context_key = torch.randn(L, B, D)
-        left_context_val = torch.randn(L, B, D)
-
-        (
-            output_right_context_utterance,
-            output_memory,
-            next_key,
-            next_val,
-        ) = attention.infer(
-            utterance,
-            lengths,
-            right_context,
-            summary,
-            memory,
-            left_context_key,
-            left_context_val,
-        )
-        assert output_right_context_utterance.shape == (R + U, B, D)
-        assert output_memory.shape == (S, B, D)
-        assert next_key.shape == (L + U, B, D)
-        assert next_val.shape == (L + U, B, D)
-
-
-def test_emformer_layer_forward():
-    from emformer import EmformerLayer
-
-    B, D = 2, 256
-    U, R, L = 12, 2, 5
-    chunk_length = 2
-
-    for use_memory in [True, False]:
-        if use_memory:
-            S = U // chunk_length
-            M = S - 1
-        else:
-            S, M = 0, 0
-
-        layer = EmformerLayer(
-            d_model=D,
-            nhead=8,
-            dim_feedforward=1024,
-            chunk_length=chunk_length,
-            cnn_module_kernel=3,
-            left_context_length=L,
-            max_memory_size=M,
-            causal=True,
-        )
-
-        Q, KV = R + U + S, M + R + U
-        utterance = torch.randn(U, B, D)
-        lengths = torch.randint(1, U + 1, (B,))
-        lengths[0] = U
-        right_context = torch.randn(R, B, D)
-        memory = torch.randn(M, B, D)
-        attention_mask = torch.rand(Q, KV) >= 0.5
-
-        output_utterance, output_right_context, output_memory = layer(
-            utterance,
-            lengths,
-            right_context,
-            memory,
-            attention_mask,
-        )
-        assert output_utterance.shape == (U, B, D)
-        assert output_right_context.shape == (R, B, D)
-        assert output_memory.shape == (M, B, D)
-
-
-def test_emformer_layer_infer():
-    from emformer import EmformerLayer
-
-    B, D = 2, 256
-    R, L = 2, 5
-    chunk_length = 2
-    U = chunk_length
-    K = 3
-
-    for use_memory in [True, False]:
-        if use_memory:
-            M = 3
-        else:
-            M = 0
-
-        layer = EmformerLayer(
-            d_model=D,
-            nhead=8,
-            dim_feedforward=1024,
-            chunk_length=chunk_length,
-            cnn_module_kernel=K,
-            left_context_length=L,
-            max_memory_size=M,
-            causal=True,
-        )
-
-        utterance = torch.randn(U, B, D)
-        lengths = torch.randint(1, U + 1, (B,))
-        lengths[0] = U
-        right_context = torch.randn(R, B, D)
-        memory = torch.randn(M, B, D)
-        state = None
-        conv_cache = None
-        (
-            output_utterance,
-            output_right_context,
-            output_memory,
-            output_state,
-            output_conv_cache,
-        ) = layer.infer(
-            utterance, lengths, right_context, memory, state, conv_cache
-        )
-        assert output_utterance.shape == (U, B, D)
-        assert output_right_context.shape == (R, B, D)
-        if use_memory:
-            assert output_memory.shape == (1, B, D)
-        else:
-            assert output_memory.shape == (0, B, D)
-        assert len(output_state) == 4
-        assert output_state[0].shape == (M, B, D)
-        assert output_state[1].shape == (L, B, D)
-        assert output_state[2].shape == (L, B, D)
-        assert output_state[3].shape == (1, B)
-        assert output_conv_cache.shape == (B, D, K - 1)
-
-
-def test_emformer_encoder_forward():
-    from emformer import EmformerEncoder
-
-    B, D = 2, 256
-    U, R, L = 12, 2, 5
-    chunk_length = 2
-
-    for use_memory in [True, False]:
-        if use_memory:
-            S = U // chunk_length
-            M = S - 1
-        else:
-            S, M = 0, 0
-
-        encoder = EmformerEncoder(
-            chunk_length=chunk_length,
-            d_model=D,
-            dim_feedforward=1024,
-            num_encoder_layers=2,
-            cnn_module_kernel=3,
-            left_context_length=L,
-            right_context_length=R,
-            max_memory_size=M,
-            causal=True,
-        )
-
-        x = torch.randn(U + R, B, D)
-        lengths = torch.randint(1, U + R + 1, (B,))
-        lengths[0] = U + R
-
-        output, output_lengths = encoder(x, lengths)
-        assert output.shape == (U, B, D)
-        assert torch.equal(output_lengths, torch.clamp(lengths - R, min=0))
-
-
-def test_emformer_encoder_infer():
-    from emformer import EmformerEncoder
-
-    B, D = 2, 256
-    R, L = 2, 5
-    chunk_length = 2
-    U = chunk_length
-    num_chunks = 3
-    num_encoder_layers = 2
-    K = 3
-
-    for use_memory in [True, False]:
-        if use_memory:
-            M = 3
-        else:
-            M = 0
-
-        encoder = EmformerEncoder(
-            chunk_length=chunk_length,
-            d_model=D,
-            dim_feedforward=1024,
-            num_encoder_layers=num_encoder_layers,
-            cnn_module_kernel=K,
-            left_context_length=L,
-            right_context_length=R,
-            max_memory_size=M,
-            causal=True,
-        )
-
-        states = None
-        conv_caches = None
-        for chunk_idx in range(num_chunks):
-            x = torch.randn(U + R, B, D)
-            lengths = torch.randint(1, U + R + 1, (B,))
-            lengths[0] = U + R
-            output, output_lengths, states, conv_caches = encoder.infer(
-                x, lengths, states, conv_caches
-            )
-            assert output.shape == (U, B, D)
-            assert torch.equal(output_lengths, torch.clamp(lengths - R, min=0))
-            assert len(states) == num_encoder_layers
-            for state in states:
-                assert len(state) == 4
-                assert state[0].shape == (M, B, D)
-                assert state[1].shape == (L, B, D)
-                assert state[2].shape == (L, B, D)
-                assert torch.equal(
-                    state[3], (chunk_idx + 1) * U * torch.ones_like(state[3])
-                )
-            for conv_cache in conv_caches:
-                assert conv_cache.shape == (B, D, K - 1)
-
-
-def test_emformer_forward():
-    from emformer import Emformer
-
-    num_features = 80
-    output_dim = 1000
-    chunk_length = 8
-    L, R = 128, 4
-    B, D, U = 2, 256, 80
-    for use_memory in [True, False]:
-        if use_memory:
-            M = 3
-        else:
-            M = 0
-        model = Emformer(
-            num_features=num_features,
-            output_dim=output_dim,
-            chunk_length=chunk_length,
-            subsampling_factor=4,
-            d_model=D,
-            cnn_module_kernel=3,
-            left_context_length=L,
-            right_context_length=R,
-            max_memory_size=M,
-            vgg_frontend=False,
-            causal=True,
-        )
-        x = torch.randn(B, U + R + 3, num_features)
-        x_lens = torch.randint(1, U + R + 3 + 1, (B,))
-        x_lens[0] = U + R + 3
-        logits, output_lengths = model(x, x_lens)
-        assert logits.shape == (B, U // 4, output_dim)
-        assert torch.equal(
-            output_lengths,
-            torch.clamp(((x_lens - 1) // 2 - 1) // 2 - R // 4, min=0),
-        )
-
-
-def test_emformer_infer():
-    from emformer import Emformer
-
-    num_features = 80
-    output_dim = 1000
-    chunk_length = 8
-    U = chunk_length
-    L, R = 128, 4
-    B, D = 2, 256
-    num_chunks = 3
-    num_encoder_layers = 2
-    K = 3
-    for use_memory in [True, False]:
-        if use_memory:
-            M = 3
-        else:
-            M = 0
-        model = Emformer(
-            num_features=num_features,
-            output_dim=output_dim,
-            chunk_length=chunk_length,
-            subsampling_factor=4,
-            d_model=D,
-            num_encoder_layers=num_encoder_layers,
-            cnn_module_kernel=K,
-            left_context_length=L,
-            right_context_length=R,
-            max_memory_size=M,
-            vgg_frontend=False,
-            causal=True,
-        )
-        states = None
-        conv_caches = None
-        for chunk_idx in range(num_chunks):
-            x = torch.randn(B, U + R + 3, num_features)
-            x_lens = torch.randint(1, U + R + 3 + 1, (B,))
-            x_lens[0] = U + R + 3
-            logits, output_lengths, states, conv_caches = model.infer(
-                x, x_lens, states, conv_caches
-            )
-            assert logits.shape == (B, U // 4, output_dim)
-            assert torch.equal(
-                output_lengths,
-                torch.clamp(((x_lens - 1) // 2 - 1) // 2 - R // 4, min=0),
-            )
-            assert len(states) == num_encoder_layers
-            for state in states:
-                assert len(state) == 4
-                assert state[0].shape == (M, B, D)
-                assert state[1].shape == (L // 4, B, D)
-                assert state[2].shape == (L // 4, B, D)
-                assert torch.equal(
-                    state[3],
-                    U // 4 * (chunk_idx + 1) * torch.ones_like(state[3]),
-                )
-            for conv_cache in conv_caches:
-                assert conv_cache.shape == (B, D, K - 1)
-
-
-def test_emformer_encoder_layer_forward_infer_consistency():
-    from emformer import EmformerEncoder
-
-    chunk_length = 4
-    num_chunks = 3
-    U = chunk_length * num_chunks
-    L, R = 1, 2
-    D = 256
-    num_encoder_layers = 1
-    memory_sizes = [0, 3]
-    K = 3
-
-    for M in memory_sizes:
-        encoder = EmformerEncoder(
-            chunk_length=chunk_length,
-            d_model=D,
-            dim_feedforward=1024,
-            num_encoder_layers=num_encoder_layers,
-            left_context_length=L,
-            right_context_length=R,
-            max_memory_size=M,
-            dropout=0.1,
-            cnn_module_kernel=K,
-            causal=True,
-        )
-        encoder.eval()
-        encoder_layer = encoder.emformer_layers[0]
-
-        x = torch.randn(U + R, 1, D)
-        lengths = torch.tensor([U])
-        right_context = encoder._gen_right_context(x)
-        utterance = x[: x.size(0) - R]
-        attention_mask = encoder._gen_attention_mask(utterance)
-        memory = (
-            encoder.init_memory_op(utterance.permute(1, 2, 0)).permute(2, 0, 1)[
-                :-1
-            ]
-            if encoder.use_memory
-            else torch.empty(0).to(dtype=x.dtype, device=x.device)
-        )
-        (
-            forward_output_utterance,
-            forward_output_right_context,
-            forward_output_memory,
-        ) = encoder_layer(
-            utterance,
-            lengths,
-            right_context,
-            memory,
-            attention_mask,
-        )
-
-        state = None
-        conv_cache = None
-        for chunk_idx in range(num_chunks):
-            start_idx = chunk_idx * chunk_length
-            end_idx = start_idx + chunk_length
-            chunk = x[start_idx:end_idx]
-            chunk_right_context = x[end_idx : end_idx + R]  # noqa
-            chunk_length = torch.tensor([chunk_length])
-            chunk_memory = (
-                encoder.init_memory_op(chunk.permute(1, 2, 0)).permute(2, 0, 1)
-                if encoder.use_memory
-                else torch.empty(0).to(dtype=x.dtype, device=x.device)
-            )
-            (
-                infer_output_chunk,
-                infer_right_context,
-                infer_output_memory,
-                state,
-                conv_cache,
-            ) = encoder_layer.infer(
-                chunk,
-                chunk_length,
-                chunk_right_context,
-                chunk_memory,
-                state,
-                conv_cache,
-            )
-            forward_output_chunk = forward_output_utterance[start_idx:end_idx]
-            assert torch.allclose(
-                infer_output_chunk,
-                forward_output_chunk,
-                atol=1e-5,
-                rtol=0.0,
-            )
-
-
-def test_emformer_encoder_forward_infer_consistency():
-    from emformer import EmformerEncoder
-
-    chunk_length = 4
-    num_chunks = 3
-    U = chunk_length * num_chunks
-    L, R = 1, 2
-    D = 256
-    num_encoder_layers = 3
-    K = 3
-    memory_sizes = [0, 3]
-
-    for M in memory_sizes:
-        encoder = EmformerEncoder(
-            chunk_length=chunk_length,
-            d_model=D,
-            dim_feedforward=1024,
-            num_encoder_layers=num_encoder_layers,
-            left_context_length=L,
-            right_context_length=R,
-            max_memory_size=M,
-            dropout=0.1,
-            cnn_module_kernel=K,
-            causal=True,
-        )
-        encoder.eval()
-
-        x = torch.randn(U + R, 1, D)
-        lengths = torch.tensor([U + R])
-
-        forward_output, forward_output_lengths = encoder(x, lengths)
-
-        states = None
-        conv_caches = None
-        for chunk_idx in range(num_chunks):
-            start_idx = chunk_idx * chunk_length
-            end_idx = start_idx + chunk_length
-            chunk = x[start_idx : end_idx + R]  # noqa
-            chunk_right_context = x[end_idx : end_idx + R]  # noqa
-            chunk_length = torch.tensor([chunk_length])
-            (
-                infer_output_chunk,
-                infer_output_lengths,
-                states,
-                conv_caches,
-            ) = encoder.infer(
-                chunk,
-                chunk_length,
-                states,
-                conv_caches,
-            )
-            forward_output_chunk = forward_output[start_idx:end_idx]
-            assert torch.allclose(
-                infer_output_chunk,
-                forward_output_chunk,
-                atol=1e-5,
-                rtol=0.0,
-            )
-
-
-def test_emformer_forward_infer_consistency():
-    from emformer import Emformer
-
-    num_features = 80
-    output_dim = 1000
-    chunk_length = 8
-    num_chunks = 3
-    U = chunk_length * num_chunks
-    L, R = 128, 4
-    D = 256
-    num_encoder_layers = 2
-    K = 3
-    memory_sizes = [0, 3]
-
-    for M in memory_sizes:
-        model = Emformer(
-            num_features=num_features,
-            output_dim=output_dim,
-            chunk_length=chunk_length,
-            subsampling_factor=4,
-            d_model=D,
-            num_encoder_layers=num_encoder_layers,
-            cnn_module_kernel=K,
-            left_context_length=L,
-            right_context_length=R,
-            max_memory_size=M,
-            dropout=0.1,
-            vgg_frontend=False,
-            causal=True,
-        )
-        model.eval()
-
-        x = torch.randn(1, U + R + 3, num_features)
-        x_lens = torch.tensor([x.size(1)])
-
-        # forward mode
-        forward_logits, _ = model(x, x_lens)
-
-        states = None
-        conv_caches = None
-        for chunk_idx in range(num_chunks):
-            start_idx = chunk_idx * chunk_length
-            end_idx = start_idx + chunk_length
-            chunk = x[:, start_idx : end_idx + R + 3]  # noqa
-            lengths = torch.tensor([chunk.size(1)])
-            (
-                infer_chunk_logits,
-                output_lengths,
-                states,
-                conv_caches,
-            ) = model.infer(chunk, lengths, states, conv_caches)
-            forward_chunk_logits = forward_logits[
-                :, start_idx // 4 : end_idx // 4  # noqa
-            ]
-            assert torch.allclose(
-                infer_chunk_logits,
-                forward_chunk_logits,
-                atol=1e-5,
-                rtol=0.0,
-            )
-
-
-if __name__ == "__main__":
-    test_emformer_attention_forward()
-    test_emformer_attention_infer()
-    test_emformer_layer_forward()
-    test_emformer_layer_infer()
-    test_emformer_encoder_forward()
-    test_emformer_encoder_infer()
-    test_emformer_forward()
-    test_emformer_infer()
-    test_emformer_encoder_layer_forward_infer_consistency()
-    test_emformer_encoder_forward_infer_consistency()
-    test_emformer_forward_infer_consistency()
diff --git a/egs/librispeech/ASR/conv_emformer_transducer/train.py b/egs/librispeech/ASR/conv_emformer_transducer/train.py
deleted file mode 100755
index 5152be1a1..000000000
--- a/egs/librispeech/ASR/conv_emformer_transducer/train.py
+++ /dev/null
@@ -1,1016 +0,0 @@
-#!/usr/bin/env python3
-# Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang,
-#                                                  Wei Kang
-#                                                  Mingshuang Luo)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Usage:
-
-export CUDA_VISIBLE_DEVICES="0,1,2,3"
-
-./transducer_emformer/train.py \
-  --world-size 4 \
-  --num-epochs 30 \
-  --start-epoch 0 \
-  --exp-dir transducer_emformer/exp \
-  --full-libri 1 \
-  --max-duration 300
-"""
-
-
-import argparse
-import logging
-import warnings
-from pathlib import Path
-from shutil import copyfile
-from typing import Any, Dict, Optional, Tuple
-
-import k2
-import sentencepiece as spm
-import torch
-import torch.multiprocessing as mp
-import torch.nn as nn
-from asr_datamodule import LibriSpeechAsrDataModule
-from decoder import Decoder
-from emformer import Emformer
-from joiner import Joiner
-from lhotse.cut import Cut
-from lhotse.dataset.sampling.base import CutSampler
-from lhotse.utils import fix_random_seed
-from model import Transducer
-from noam import Noam
-from torch import Tensor
-from torch.nn.parallel import DistributedDataParallel as DDP
-from torch.nn.utils import clip_grad_norm_
-from torch.utils.tensorboard import SummaryWriter
-
-from icefall.checkpoint import load_checkpoint, remove_checkpoints
-from icefall.checkpoint import save_checkpoint as save_checkpoint_impl
-from icefall.checkpoint import save_checkpoint_with_global_batch_idx
-from icefall.dist import cleanup_dist, setup_dist
-from icefall.env import get_env_info
-from icefall.utils import (
-    AttributeDict,
-    MetricsTracker,
-    measure_gradient_norms,
-    measure_weight_norms,
-    optim_step_and_measure_param_change,
-    setup_logger,
-    str2bool,
-)
-
-
-def add_model_arguments(parser: argparse.ArgumentParser):
-    parser.add_argument(
-        "--attention-dim",
-        type=int,
-        default=512,
-        help="Attention dim for the Emformer",
-    )
-
-    parser.add_argument(
-        "--nhead",
-        type=int,
-        default=8,
-        help="Number of attention heads for the Emformer",
-    )
-
-    parser.add_argument(
-        "--dim-feedforward",
-        type=int,
-        default=2048,
-        help="Feed-forward dimension for the Emformer",
-    )
-
-    parser.add_argument(
-        "--num-encoder-layers",
-        type=int,
-        default=12,
-        help="Number of encoder layers for the Emformer",
-    )
-
-    parser.add_argument(
-        "--cnn-module-kernel",
-        type=int,
-        default=3,
-        help="Kernel size for the convolution module.",
-    )
-
-    parser.add_argument(
-        "--left-context-length",
-        type=int,
-        default=120,
-        help="Number of frames for the left context in the Emformer",
-    )
-
-    parser.add_argument(
-        "--chunk-length",
-        type=int,
-        default=16,
-        help="Number of frames for each segment in the Emformer",
-    )
-
-    parser.add_argument(
-        "--right-context-length",
-        type=int,
-        default=4,
-        help="Number of frames for right context in the Emformer",
-    )
-
-    parser.add_argument(
-        "--memory-size",
-        type=int,
-        default=0,
-        help="Number of entries in the memory for the Emformer",
-    )
-
-    parser.add_argument(
-        "--causal-conv",
-        type=str2bool,
-        default=True,
-        help="Whether use causal convolution.",
-    )
-
-
-def get_parser():
-    parser = argparse.ArgumentParser(
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter
-    )
-
-    parser.add_argument(
-        "--world-size",
-        type=int,
-        default=1,
-        help="Number of GPUs for DDP training.",
-    )
-
-    parser.add_argument(
-        "--master-port",
-        type=int,
-        default=12354,
-        help="Master port to use for DDP training.",
-    )
-
-    parser.add_argument(
-        "--tensorboard",
-        type=str2bool,
-        default=True,
-        help="Should various information be logged in tensorboard.",
-    )
-
-    parser.add_argument(
-        "--num-epochs",
-        type=int,
-        default=30,
-        help="Number of epochs to train.",
-    )
-
-    parser.add_argument(
-        "--start-epoch",
-        type=int,
-        default=0,
-        help="""Resume training from from this epoch.
-        If it is positive, it will load checkpoint from
-        transducer_emformer/exp/epoch-{start_epoch-1}.pt
-        """,
-    )
-
-    parser.add_argument(
-        "--start-batch",
-        type=int,
-        default=0,
-        help="""If positive, --start-epoch is ignored and
-        it loads the checkpoint from exp-dir/checkpoint-{start_batch}.pt
-        """,
-    )
-
-    parser.add_argument(
-        "--exp-dir",
-        type=str,
-        default="transducer_emformer/exp",
-        help="""The experiment dir.
-        It specifies the directory where all training related
-        files, e.g., checkpoints, log, etc, are saved
-        """,
-    )
-
-    parser.add_argument(
-        "--bpe-model",
-        type=str,
-        default="data/lang_bpe_500/bpe.model",
-        help="Path to the BPE model",
-    )
-
-    parser.add_argument(
-        "--lr-factor",
-        type=float,
-        default=5.0,
-        help="The lr_factor for Noam optimizer",
-    )
-
-    parser.add_argument(
-        "--context-size",
-        type=int,
-        default=2,
-        help="The context size in the decoder. 1 means bigram; "
-        "2 means tri-gram",
-    )
-
-    parser.add_argument(
-        "--prune-range",
-        type=int,
-        default=5,
-        help="The prune range for rnnt loss, it means how many symbols(context)"
-        "we are using to compute the loss",
-    )
-
-    parser.add_argument(
-        "--lm-scale",
-        type=float,
-        default=0.25,
-        help="The scale to smooth the loss with lm "
-        "(output of prediction network) part.",
-    )
-
-    parser.add_argument(
-        "--am-scale",
-        type=float,
-        default=0.0,
-        help="The scale to smooth the loss with am (output of encoder network)"
-        "part.",
-    )
-
-    parser.add_argument(
-        "--simple-loss-scale",
-        type=float,
-        default=0.5,
-        help="To get pruning ranges, we will calculate a simple version"
-        "loss(joiner is just addition), this simple loss also uses for"
-        "training (as a regularization item). We will scale the simple loss"
-        "with this parameter before adding to the final loss.",
-    )
-
-    parser.add_argument(
-        "--seed",
-        type=int,
-        default=42,
-        help="The seed for random generators intended for reproducibility",
-    )
-
-    parser.add_argument(
-        "--save-every-n",
-        type=int,
-        default=8000,
-        help="""Save checkpoint after processing this number of batches"
-        periodically. We save checkpoint to exp-dir/ whenever
-        params.batch_idx_train % save_every_n == 0. The checkpoint filename
-        has the form: f'exp-dir/checkpoint-{params.batch_idx_train}.pt'
-        Note: It also saves checkpoint to `exp-dir/epoch-xxx.pt` at the
-        end of each epoch where `xxx` is the epoch number counting from 0.
-        """,
-    )
-
-    parser.add_argument(
-        "--keep-last-k",
-        type=int,
-        default=20,
-        help="""Only keep this number of checkpoints on disk.
-        For instance, if it is 3, there are only 3 checkpoints
-        in the exp-dir with filenames `checkpoint-xxx.pt`.
-        It does not affect checkpoints with name `epoch-xxx.pt`.
-        """,
-    )
-
-    add_model_arguments(parser)
-
-    return parser
-
-
-def get_params() -> AttributeDict:
-    """Return a dict containing training parameters.
-
-    All training related parameters that are not passed from the commandline
-    are saved in the variable `params`.
-
-    Commandline options are merged into `params` after they are parsed, so
-    you can also access them via `params`.
-
-    Explanation of options saved in `params`:
-
-        - best_train_loss: Best training loss so far. It is used to select
-                           the model that has the lowest training loss. It is
-                           updated during the training.
-
-        - best_valid_loss: Best validation loss so far. It is used to select
-                           the model that has the lowest validation loss. It is
-                           updated during the training.
-
-        - best_train_epoch: It is the epoch that has the best training loss.
-
-        - best_valid_epoch: It is the epoch that has the best validation loss.
-
-        - batch_idx_train: Used to writing statistics to tensorboard. It
-                           contains number of batches trained so far across
-                           epochs.
-
-        - log_interval:  Print training loss if batch_idx % log_interval` is 0
-
-        - reset_interval: Reset statistics if batch_idx % reset_interval is 0
-
-        - valid_interval:  Run validation if batch_idx % valid_interval is 0
-
-        - feature_dim: The model input dim. It has to match the one used
-                       in computing features.
-
-        - subsampling_factor:  The subsampling factor for the model.
-
-        - attention_dim: Hidden dim for multi-head attention model.
-
-        - num_decoder_layers: Number of decoder layer of transformer decoder.
-
-        - warm_step: The warm_step for Noam optimizer.
-    """
-    params = AttributeDict(
-        {
-            "best_train_loss": float("inf"),
-            "best_valid_loss": float("inf"),
-            "best_train_epoch": -1,
-            "best_valid_epoch": -1,
-            "batch_idx_train": 0,
-            "log_interval": 50,
-            "reset_interval": 200,
-            "valid_interval": 3000,  # For the 100h subset, use 800
-            "log_diagnostics": False,
-            # parameters for Emformer
-            "feature_dim": 80,
-            "subsampling_factor": 4,
-            "vgg_frontend": False,
-            # parameters for decoder
-            "embedding_dim": 512,
-            # parameters for Noam
-            "warm_step": 80000,  # For the 100h subset, use 20000
-            "env_info": get_env_info(),
-        }
-    )
-
-    return params
-
-
-def get_encoder_model(params: AttributeDict) -> nn.Module:
-    encoder = Emformer(
-        num_features=params.feature_dim,
-        output_dim=params.vocab_size,
-        subsampling_factor=params.subsampling_factor,
-        d_model=params.attention_dim,
-        nhead=params.nhead,
-        dim_feedforward=params.dim_feedforward,
-        num_encoder_layers=params.num_encoder_layers,
-        cnn_module_kernel=params.cnn_module_kernel,
-        vgg_frontend=params.vgg_frontend,
-        left_context_length=params.left_context_length,
-        chunk_length=params.chunk_length,
-        right_context_length=params.right_context_length,
-        max_memory_size=params.memory_size,
-        causal=params.causal_conv,
-    )
-    return encoder
-
-
-def get_decoder_model(params: AttributeDict) -> nn.Module:
-    decoder = Decoder(
-        vocab_size=params.vocab_size,
-        embedding_dim=params.embedding_dim,
-        blank_id=params.blank_id,
-        unk_id=params.unk_id,
-        context_size=params.context_size,
-    )
-    return decoder
-
-
-def get_joiner_model(params: AttributeDict) -> nn.Module:
-    joiner = Joiner(
-        input_dim=params.vocab_size,
-        inner_dim=params.embedding_dim,
-        output_dim=params.vocab_size,
-    )
-    return joiner
-
-
-def get_transducer_model(params: AttributeDict) -> nn.Module:
-    encoder = get_encoder_model(params)
-    decoder = get_decoder_model(params)
-    joiner = get_joiner_model(params)
-
-    model = Transducer(
-        encoder=encoder,
-        decoder=decoder,
-        joiner=joiner,
-    )
-    return model
-
-
-def load_checkpoint_if_available(
-    params: AttributeDict,
-    model: nn.Module,
-    optimizer: Optional[torch.optim.Optimizer] = None,
-) -> Optional[Dict[str, Any]]:
-    """Load checkpoint from file.
-
-    If params.start_batch is positive, it will load the checkpoint from
-    `params.exp_dir/checkpoint-{params.start_batch}.pt`. Otherwise, if
-    params.start_epoch is positive, it will load the checkpoint from
-    `params.start_epoch - 1`.
-
-    Apart from loading state dict for `model` and `optimizer` it also updates
-    `best_train_epoch`, `best_train_loss`, `best_valid_epoch`,
-    and `best_valid_loss` in `params`.
-
-    Args:
-      params:
-        The return value of :func:`get_params`.
-      model:
-        The training model.
-      optimizer:
-        The optimizer that we are using.
-    Returns:
-      Return a dict containing previously saved training info.
-    """
-    if params.start_batch > 0:
-        filename = params.exp_dir / f"checkpoint-{params.start_batch}.pt"
-    elif params.start_epoch > 0:
-        filename = params.exp_dir / f"epoch-{params.start_epoch-1}.pt"
-    else:
-        return None
-
-    assert filename.is_file(), f"{filename} does not exist!"
-
-    saved_params = load_checkpoint(
-        filename,
-        model=model,
-        optimizer=optimizer,
-    )
-
-    keys = [
-        "best_train_epoch",
-        "best_valid_epoch",
-        "batch_idx_train",
-        "best_train_loss",
-        "best_valid_loss",
-    ]
-    for k in keys:
-        params[k] = saved_params[k]
-
-    if params.start_batch > 0:
-        if "cur_epoch" in saved_params:
-            params["start_epoch"] = saved_params["cur_epoch"]
-
-        if "cur_batch_idx" in saved_params:
-            params["cur_batch_idx"] = saved_params["cur_batch_idx"]
-
-    return saved_params
-
-
-def save_checkpoint(
-    params: AttributeDict,
-    model: nn.Module,
-    optimizer: Optional[torch.optim.Optimizer] = None,
-    sampler: Optional[CutSampler] = None,
-    rank: int = 0,
-) -> None:
-    """Save model, optimizer, scheduler and training stats to file.
-
-    Args:
-      params:
-        It is returned by :func:`get_params`.
-      model:
-        The training model.
-      optimizer:
-        The optimizer used in the training.
-      sampler:
-       The sampler for the training dataset.
-    """
-    if rank != 0:
-        return
-    filename = params.exp_dir / f"epoch-{params.cur_epoch}.pt"
-    save_checkpoint_impl(
-        filename=filename,
-        model=model,
-        params=params,
-        optimizer=optimizer,
-        sampler=sampler,
-        rank=rank,
-    )
-
-    if params.best_train_epoch == params.cur_epoch:
-        best_train_filename = params.exp_dir / "best-train-loss.pt"
-        copyfile(src=filename, dst=best_train_filename)
-
-    if params.best_valid_epoch == params.cur_epoch:
-        best_valid_filename = params.exp_dir / "best-valid-loss.pt"
-        copyfile(src=filename, dst=best_valid_filename)
-
-
-def compute_loss(
-    params: AttributeDict,
-    model: nn.Module,
-    sp: spm.SentencePieceProcessor,
-    batch: dict,
-    is_training: bool,
-) -> Tuple[Tensor, MetricsTracker]:
-    """
-    Compute CTC loss given the model and its inputs.
-
-    Args:
-      params:
-        Parameters for training. See :func:`get_params`.
-      model:
-        The model for training. It is an instance of Emformer in our case.
-      batch:
-        A batch of data. See `lhotse.dataset.K2SpeechRecognitionDataset()`
-        for the content in it.
-      is_training:
-        True for training. False for validation. When it is True, this
-        function enables autograd during computation; when it is False, it
-        disables autograd.
-    """
-    device = model.device
-    feature = batch["inputs"]
-    # at entry, feature is (N, T, C)
-    assert feature.ndim == 3
-    feature = feature.to(device)
-
-    supervisions = batch["supervisions"]
-    feature_lens = supervisions["num_frames"].to(device)
-
-    texts = batch["supervisions"]["text"]
-    y = sp.encode(texts, out_type=int)
-    y = k2.RaggedTensor(y).to(device)
-
-    with torch.set_grad_enabled(is_training):
-        simple_loss, pruned_loss = model(
-            x=feature,
-            x_lens=feature_lens,
-            y=y,
-            prune_range=params.prune_range,
-            am_scale=params.am_scale,
-            lm_scale=params.lm_scale,
-        )
-        loss = params.simple_loss_scale * simple_loss + pruned_loss
-
-    assert loss.requires_grad == is_training
-
-    info = MetricsTracker()
-    with warnings.catch_warnings():
-        warnings.simplefilter("ignore")
-        info["frames"] = (
-            (feature_lens // params.subsampling_factor).sum().item()
-        )
-
-    # Note: We use reduction=sum while computing the loss.
-    info["loss"] = loss.detach().cpu().item()
-    info["simple_loss"] = simple_loss.detach().cpu().item()
-    info["pruned_loss"] = pruned_loss.detach().cpu().item()
-
-    return loss, info
-
-
-def compute_validation_loss(
-    params: AttributeDict,
-    model: nn.Module,
-    sp: spm.SentencePieceProcessor,
-    valid_dl: torch.utils.data.DataLoader,
-    world_size: int = 1,
-) -> MetricsTracker:
-    """Run the validation process."""
-    model.eval()
-
-    tot_loss = MetricsTracker()
-
-    for batch_idx, batch in enumerate(valid_dl):
-        loss, loss_info = compute_loss(
-            params=params,
-            model=model,
-            sp=sp,
-            batch=batch,
-            is_training=False,
-        )
-        assert loss.requires_grad is False
-        tot_loss = tot_loss + loss_info
-
-    if world_size > 1:
-        tot_loss.reduce(loss.device)
-
-    loss_value = tot_loss["loss"] / tot_loss["frames"]
-    if loss_value < params.best_valid_loss:
-        params.best_valid_epoch = params.cur_epoch
-        params.best_valid_loss = loss_value
-
-    return tot_loss
-
-
-def train_one_epoch(
-    params: AttributeDict,
-    model: nn.Module,
-    optimizer: torch.optim.Optimizer,
-    sp: spm.SentencePieceProcessor,
-    train_dl: torch.utils.data.DataLoader,
-    valid_dl: torch.utils.data.DataLoader,
-    tb_writer: Optional[SummaryWriter] = None,
-    world_size: int = 1,
-    rank: int = 0,
-) -> None:
-    """Train the model for one epoch.
-
-    The training loss from the mean of all frames is saved in
-    `params.train_loss`. It runs the validation process every
-    `params.valid_interval` batches.
-
-    Args:
-      params:
-        It is returned by :func:`get_params`.
-      model:
-        The model for training.
-      optimizer:
-        The optimizer we are using.
-      train_dl:
-        Dataloader for the training dataset.
-      valid_dl:
-        Dataloader for the validation dataset.
-      tb_writer:
-        Writer to write log messages to tensorboard.
-      world_size:
-        Number of nodes in DDP training. If it is 1, DDP is disabled.
-      rank:
-        The rank of the node in DDP training. If no DDP is used, it should
-        be set to 0.
-    """
-    model.train()
-
-    tot_loss = MetricsTracker()
-
-    def maybe_log_gradients(tag: str):
-        if (
-            params.log_diagnostics
-            and tb_writer is not None
-            and params.batch_idx_train % (params.log_interval * 5) == 0
-        ):
-            tb_writer.add_scalars(
-                tag,
-                measure_gradient_norms(model, norm="l2"),
-                global_step=params.batch_idx_train,
-            )
-
-    def maybe_log_weights(tag: str):
-        if (
-            params.log_diagnostics
-            and tb_writer is not None
-            and params.batch_idx_train % (params.log_interval * 5) == 0
-        ):
-            tb_writer.add_scalars(
-                tag,
-                measure_weight_norms(model, norm="l2"),
-                global_step=params.batch_idx_train,
-            )
-
-    def maybe_log_param_relative_changes():
-        if (
-            params.log_diagnostics
-            and tb_writer is not None
-            and params.batch_idx_train % (params.log_interval * 5) == 0
-        ):
-            deltas = optim_step_and_measure_param_change(model, optimizer)
-            tb_writer.add_scalars(
-                "train/relative_param_change_per_minibatch",
-                deltas,
-                global_step=params.batch_idx_train,
-            )
-        else:
-            optimizer.step()
-
-    cur_batch_idx = params.get("cur_batch_idx", 0)
-
-    for batch_idx, batch in enumerate(train_dl):
-        if batch_idx < cur_batch_idx:
-            continue
-        cur_batch_idx = batch_idx
-
-        params.batch_idx_train += 1
-        batch_size = len(batch["supervisions"]["text"])
-
-        loss, loss_info = compute_loss(
-            params=params,
-            model=model,
-            sp=sp,
-            batch=batch,
-            is_training=True,
-        )
-        # summary stats
-        tot_loss = (tot_loss * (1 - 1 / params.reset_interval)) + loss_info
-
-        # NOTE: We use reduction==sum and loss is computed over utterances
-        # in the batch and there is no normalization to it so far.
-
-        loss.backward()
-
-        maybe_log_weights("train/param_norms")
-        maybe_log_gradients("train/grad_norms")
-        maybe_log_param_relative_changes()
-
-        optimizer.zero_grad()
-
-        if (
-            params.batch_idx_train > 0
-            and params.batch_idx_train % params.save_every_n == 0
-        ):
-            params.cur_batch_idx = batch_idx
-            save_checkpoint_with_global_batch_idx(
-                out_dir=params.exp_dir,
-                global_batch_idx=params.batch_idx_train,
-                model=model,
-                params=params,
-                optimizer=optimizer,
-                sampler=train_dl.sampler,
-                rank=rank,
-            )
-            del params.cur_batch_idx
-            remove_checkpoints(
-                out_dir=params.exp_dir,
-                topk=params.keep_last_k,
-                rank=rank,
-            )
-
-        if batch_idx % params.log_interval == 0:
-            logging.info(
-                f"Epoch {params.cur_epoch}, "
-                f"batch {batch_idx}, loss[{loss_info}], "
-                f"tot_loss[{tot_loss}], batch size: {batch_size}"
-            )
-
-            if tb_writer is not None:
-                loss_info.write_summary(
-                    tb_writer, "train/current_", params.batch_idx_train
-                )
-                tot_loss.write_summary(
-                    tb_writer, "train/tot_", params.batch_idx_train
-                )
-
-        if batch_idx > 0 and batch_idx % params.valid_interval == 0:
-            logging.info("Computing validation loss")
-            valid_info = compute_validation_loss(
-                params=params,
-                model=model,
-                sp=sp,
-                valid_dl=valid_dl,
-                world_size=world_size,
-            )
-            model.train()
-            logging.info(f"Epoch {params.cur_epoch}, validation: {valid_info}")
-            if tb_writer is not None:
-                valid_info.write_summary(
-                    tb_writer, "train/valid_", params.batch_idx_train
-                )
-
-    loss_value = tot_loss["loss"] / tot_loss["frames"]
-    params.train_loss = loss_value
-    if params.train_loss < params.best_train_loss:
-        params.best_train_epoch = params.cur_epoch
-        params.best_train_loss = params.train_loss
-
-
-def run(rank, world_size, args):
-    """
-    Args:
-      rank:
-        It is a value between 0 and `world_size-1`, which is
-        passed automatically by `mp.spawn()` in :func:`main`.
-        The node with rank 0 is responsible for saving checkpoint.
-      world_size:
-        Number of GPUs for DDP training.
-      args:
-        The return value of get_parser().parse_args()
-    """
-    params = get_params()
-    params.update(vars(args))
-    if params.full_libri is False:
-        params.valid_interval = 800
-        params.warm_step = 20000
-
-    fix_random_seed(params.seed)
-    if world_size > 1:
-        setup_dist(rank, world_size, params.master_port)
-
-    setup_logger(f"{params.exp_dir}/log/log-train")
-    logging.info("Training started")
-
-    if args.tensorboard and rank == 0:
-        tb_writer = SummaryWriter(log_dir=f"{params.exp_dir}/tensorboard")
-    else:
-        tb_writer = None
-
-    device = torch.device("cpu")
-    if torch.cuda.is_available():
-        device = torch.device("cuda", rank)
-    logging.info(f"Device: {device}")
-
-    sp = spm.SentencePieceProcessor()
-    sp.load(params.bpe_model)
-
-    # <blk> is defined in local/train_bpe_model.py
-    params.blank_id = sp.piece_to_id("<blk>")
-    params.unk_id = sp.piece_to_id("<unk>")
-    params.vocab_size = sp.get_piece_size()
-
-    logging.info(params)
-
-    logging.info("About to create model")
-    model = get_transducer_model(params)
-
-    num_param = sum([p.numel() for p in model.parameters()])
-    logging.info(f"Number of model parameters: {num_param}")
-
-    checkpoints = load_checkpoint_if_available(params=params, model=model)
-
-    model.to(device)
-    if world_size > 1:
-        logging.info("Using DDP")
-        model = DDP(model, device_ids=[rank])
-    model.device = device
-
-    optimizer = Noam(
-        model.parameters(),
-        model_size=params.attention_dim,
-        factor=params.lr_factor,
-        warm_step=params.warm_step,
-    )
-
-    if checkpoints and "optimizer" in checkpoints:
-        logging.info("Loading optimizer state dict")
-        optimizer.load_state_dict(checkpoints["optimizer"])
-
-    librispeech = LibriSpeechAsrDataModule(args)
-
-    train_cuts = librispeech.train_clean_100_cuts()
-    if params.full_libri:
-        train_cuts += librispeech.train_clean_360_cuts()
-        train_cuts += librispeech.train_other_500_cuts()
-
-    def remove_short_and_long_utt(c: Cut):
-        # Keep only utterances with duration between 1 second and 20 seconds
-        #
-        # Caution: There is a reason to select 20.0 here. Please see
-        # ../local/display_manifest_statistics.py
-        #
-        # You should use ../local/display_manifest_statistics.py to get
-        # an utterance duration distribution for your dataset to select
-        # the threshold
-        return 1.0 <= c.duration <= 20.0
-
-    num_in_total = len(train_cuts)
-
-    train_cuts = train_cuts.filter(remove_short_and_long_utt)
-
-    num_left = len(train_cuts)
-    num_removed = num_in_total - num_left
-    removed_percent = num_removed / num_in_total * 100
-
-    logging.info(f"Before removing short and long utterances: {num_in_total}")
-    logging.info(f"After removing short and long utterances: {num_left}")
-    logging.info(f"Removed {num_removed} utterances ({removed_percent:.5f}%)")
-
-    if params.start_batch > 0 and checkpoints and "sampler" in checkpoints:
-        # We only load the sampler's state dict when it loads a checkpoint
-        # saved in the middle of an epoch
-        sampler_state_dict = checkpoints["sampler"]
-    else:
-        sampler_state_dict = None
-
-    train_dl = librispeech.train_dataloaders(
-        train_cuts, sampler_state_dict=sampler_state_dict
-    )
-
-    valid_cuts = librispeech.dev_clean_cuts()
-    valid_cuts += librispeech.dev_other_cuts()
-    valid_dl = librispeech.valid_dataloaders(valid_cuts)
-
-    scan_pessimistic_batches_for_oom(
-        model=model,
-        train_dl=train_dl,
-        optimizer=optimizer,
-        sp=sp,
-        params=params,
-    )
-
-    for epoch in range(params.start_epoch, params.num_epochs):
-        fix_random_seed(params.seed + epoch)
-        train_dl.sampler.set_epoch(epoch)
-
-        cur_lr = optimizer._rate
-        if tb_writer is not None:
-            tb_writer.add_scalar(
-                "train/learning_rate", cur_lr, params.batch_idx_train
-            )
-            tb_writer.add_scalar("train/epoch", epoch, params.batch_idx_train)
-
-        if rank == 0:
-            logging.info("epoch {}, learning rate {}".format(epoch, cur_lr))
-
-        params.cur_epoch = epoch
-
-        train_one_epoch(
-            params=params,
-            model=model,
-            optimizer=optimizer,
-            sp=sp,
-            train_dl=train_dl,
-            valid_dl=valid_dl,
-            tb_writer=tb_writer,
-            world_size=world_size,
-            rank=rank,
-        )
-
-        save_checkpoint(
-            params=params,
-            model=model,
-            optimizer=optimizer,
-            sampler=train_dl.sampler,
-            rank=rank,
-        )
-
-    logging.info("Done!")
-
-    if world_size > 1:
-        torch.distributed.barrier()
-        cleanup_dist()
-
-
-def scan_pessimistic_batches_for_oom(
-    model: nn.Module,
-    train_dl: torch.utils.data.DataLoader,
-    optimizer: torch.optim.Optimizer,
-    sp: spm.SentencePieceProcessor,
-    params: AttributeDict,
-):
-    from lhotse.dataset import find_pessimistic_batches
-
-    logging.info(
-        "Sanity check -- see if any of the batches in epoch 0 would cause OOM."
-    )
-    batches, crit_values = find_pessimistic_batches(train_dl.sampler)
-    for criterion, cuts in batches.items():
-        batch = train_dl.dataset[cuts]
-        try:
-            optimizer.zero_grad()
-            loss, _ = compute_loss(
-                params=params,
-                model=model,
-                sp=sp,
-                batch=batch,
-                is_training=True,
-            )
-            loss.backward()
-            clip_grad_norm_(model.parameters(), 5.0, 2.0)
-            optimizer.step()
-        except RuntimeError as e:
-            if "CUDA out of memory" in str(e):
-                logging.error(
-                    "Your GPU ran out of memory with the current "
-                    "max_duration setting. We recommend decreasing "
-                    "max_duration and trying again.\n"
-                    f"Failing criterion: {criterion} "
-                    f"(={crit_values[criterion]}) ..."
-                )
-            raise
-
-
-def main():
-    parser = get_parser()
-    LibriSpeechAsrDataModule.add_arguments(parser)
-    args = parser.parse_args()
-    args.exp_dir = Path(args.exp_dir)
-
-    world_size = args.world_size
-    assert world_size >= 1
-    if world_size > 1:
-        mp.spawn(run, args=(world_size, args), nprocs=world_size, join=True)
-    else:
-        run(rank=0, world_size=1, args=args)
-
-
-torch.set_num_threads(1)
-torch.set_num_interop_threads(1)
-
-if __name__ == "__main__":
-    main()
diff --git a/egs/librispeech/ASR/emformer_pruned_transducer_stateless/asr_datamodule.py b/egs/librispeech/ASR/emformer_pruned_transducer_stateless/asr_datamodule.py
deleted file mode 120000
index b4e5427e0..000000000
--- a/egs/librispeech/ASR/emformer_pruned_transducer_stateless/asr_datamodule.py
+++ /dev/null
@@ -1 +0,0 @@
-../pruned_transducer_stateless/asr_datamodule.py
\ No newline at end of file
diff --git a/egs/librispeech/ASR/emformer_pruned_transducer_stateless/beam_search.py b/egs/librispeech/ASR/emformer_pruned_transducer_stateless/beam_search.py
deleted file mode 120000
index 227d2247c..000000000
--- a/egs/librispeech/ASR/emformer_pruned_transducer_stateless/beam_search.py
+++ /dev/null
@@ -1 +0,0 @@
-../pruned_transducer_stateless/beam_search.py
\ No newline at end of file
diff --git a/egs/librispeech/ASR/emformer_pruned_transducer_stateless/decode.py b/egs/librispeech/ASR/emformer_pruned_transducer_stateless/decode.py
deleted file mode 100755
index 47b4f9fd0..000000000
--- a/egs/librispeech/ASR/emformer_pruned_transducer_stateless/decode.py
+++ /dev/null
@@ -1,550 +0,0 @@
-#!/usr/bin/env python3
-#
-# Copyright 2021 Xiaomi Corporation (Author: Fangjun Kuang)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Usage:
-(1) greedy search
-./transducer_emformer/decode.py \
-        --epoch 28 \
-        --avg 15 \
-        --exp-dir ./transducer_emformer/exp \
-        --max-duration 100 \
-        --decoding-method greedy_search
-
-(2) beam search
-./transducer_emformer/decode.py \
-        --epoch 28 \
-        --avg 15 \
-        --exp-dir ./transducer_emformer/exp \
-        --max-duration 100 \
-        --decoding-method beam_search \
-        --beam-size 4
-
-(3) modified beam search
-./transducer_emformer/decode.py \
-        --epoch 28 \
-        --avg 15 \
-        --exp-dir ./transducer_emformer/exp \
-        --max-duration 100 \
-        --decoding-method modified_beam_search \
-        --beam-size 4
-
-(4) fast beam search
-./transducer_emformer/decode.py \
-        --epoch 28 \
-        --avg 15 \
-        --exp-dir ./transducer_emformer/exp \
-        --max-duration 1500 \
-        --decoding-method fast_beam_search \
-        --beam 4 \
-        --max-contexts 4 \
-        --max-states 8
-"""
-
-
-import argparse
-import logging
-from collections import defaultdict
-from pathlib import Path
-from typing import Dict, List, Optional, Tuple
-
-import k2
-import sentencepiece as spm
-import torch
-import torch.nn as nn
-from asr_datamodule import LibriSpeechAsrDataModule
-from beam_search import (
-    beam_search,
-    fast_beam_search,
-    greedy_search,
-    greedy_search_batch,
-    modified_beam_search,
-)
-from train import add_model_arguments, get_params, get_transducer_model
-
-from icefall.checkpoint import (
-    average_checkpoints,
-    find_checkpoints,
-    load_checkpoint,
-)
-from icefall.utils import (
-    AttributeDict,
-    setup_logger,
-    store_transcripts,
-    write_error_stats,
-)
-
-
-def get_parser():
-    parser = argparse.ArgumentParser(
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter
-    )
-
-    parser.add_argument(
-        "--epoch",
-        type=int,
-        default=28,
-        help="It specifies the checkpoint to use for decoding."
-        "Note: Epoch counts from 0.",
-    )
-    parser.add_argument(
-        "--avg",
-        type=int,
-        default=15,
-        help="Number of checkpoints to average. Automatically select "
-        "consecutive checkpoints before the checkpoint specified by "
-        "'--epoch'. ",
-    )
-
-    parser.add_argument(
-        "--avg-last-n",
-        type=int,
-        default=0,
-        help="""If positive, --epoch and --avg are ignored and it
-        will use the last n checkpoints exp_dir/checkpoint-xxx.pt
-        where xxx is the number of processed batches while
-        saving that checkpoint.
-        """,
-    )
-
-    parser.add_argument(
-        "--exp-dir",
-        type=str,
-        default="transducer_emformer/exp",
-        help="The experiment dir",
-    )
-
-    parser.add_argument(
-        "--bpe-model",
-        type=str,
-        default="data/lang_bpe_500/bpe.model",
-        help="Path to the BPE model",
-    )
-
-    parser.add_argument(
-        "--decoding-method",
-        type=str,
-        default="greedy_search",
-        help="""Possible values are:
-          - greedy_search
-          - beam_search
-          - modified_beam_search
-          - fast_beam_search
-        """,
-    )
-
-    parser.add_argument(
-        "--beam-size",
-        type=int,
-        default=4,
-        help="""An interger indicating how many candidates we will keep for each
-        frame. Used only when --decoding-method is beam_search or
-        modified_beam_search.""",
-    )
-
-    parser.add_argument(
-        "--beam",
-        type=float,
-        default=4,
-        help="""A floating point value to calculate the cutoff score during beam
-        search (i.e., `cutoff = max-score - beam`), which is the same as the
-        `beam` in Kaldi.
-        Used only when --decoding-method is fast_beam_search""",
-    )
-
-    parser.add_argument(
-        "--max-contexts",
-        type=int,
-        default=4,
-        help="""Used only when --decoding-method is
-        fast_beam_search""",
-    )
-
-    parser.add_argument(
-        "--max-states",
-        type=int,
-        default=8,
-        help="""Used only when --decoding-method is
-        fast_beam_search""",
-    )
-
-    parser.add_argument(
-        "--context-size",
-        type=int,
-        default=2,
-        help="The context size in the decoder. 1 means bigram; "
-        "2 means tri-gram",
-    )
-    parser.add_argument(
-        "--max-sym-per-frame",
-        type=int,
-        default=1,
-        help="""Maximum number of symbols per frame.
-        Used only when --decoding_method is greedy_search""",
-    )
-
-    add_model_arguments(parser)
-
-    return parser
-
-
-def decode_one_batch(
-    params: AttributeDict,
-    model: nn.Module,
-    sp: spm.SentencePieceProcessor,
-    batch: dict,
-    decoding_graph: Optional[k2.Fsa] = None,
-) -> Dict[str, List[List[str]]]:
-    """Decode one batch and return the result in a dict. The dict has the
-    following format:
-
-        - key: It indicates the setting used for decoding. For example,
-               if greedy_search is used, it would be "greedy_search"
-               If beam search with a beam size of 7 is used, it would be
-               "beam_7"
-        - value: It contains the decoding result. `len(value)` equals to
-                 batch size. `value[i]` is the decoding result for the i-th
-                 utterance in the given batch.
-    Args:
-      params:
-        It's the return value of :func:`get_params`.
-      model:
-        The neural model.
-      sp:
-        The BPE model.
-      batch:
-        It is the return value from iterating
-        `lhotse.dataset.K2SpeechRecognitionDataset`. See its documentation
-        for the format of the `batch`.
-      decoding_graph:
-        The decoding graph. Can be either a `k2.trivial_graph` or HLG, Used
-        only when --decoding_method is fast_beam_search.
-    Returns:
-      Return the decoding result. See above description for the format of
-      the returned dict.
-    """
-    device = model.device
-    feature = batch["inputs"]
-    assert feature.ndim == 3
-
-    feature = feature.to(device)
-    # at entry, feature is (N, T, C)
-
-    supervisions = batch["supervisions"]
-    feature_lens = supervisions["num_frames"].to(device)
-
-    encoder_out, encoder_out_lens = model.encoder(
-        x=feature, x_lens=feature_lens
-    )
-    hyps = []
-
-    if params.decoding_method == "fast_beam_search":
-        hyp_tokens = fast_beam_search(
-            model=model,
-            decoding_graph=decoding_graph,
-            encoder_out=encoder_out,
-            encoder_out_lens=encoder_out_lens,
-            beam=params.beam,
-            max_contexts=params.max_contexts,
-            max_states=params.max_states,
-        )
-        for hyp in sp.decode(hyp_tokens):
-            hyps.append(hyp.split())
-    elif (
-        params.decoding_method == "greedy_search"
-        and params.max_sym_per_frame == 1
-    ):
-        hyp_tokens = greedy_search_batch(
-            model=model,
-            encoder_out=encoder_out,
-        )
-        for hyp in sp.decode(hyp_tokens):
-            hyps.append(hyp.split())
-    elif params.decoding_method == "modified_beam_search":
-        hyp_tokens = modified_beam_search(
-            model=model,
-            encoder_out=encoder_out,
-            beam=params.beam_size,
-        )
-        for hyp in sp.decode(hyp_tokens):
-            hyps.append(hyp.split())
-    else:
-        batch_size = encoder_out.size(0)
-
-        for i in range(batch_size):
-            # fmt: off
-            encoder_out_i = encoder_out[i:i+1, :encoder_out_lens[i]]
-            # fmt: on
-            if params.decoding_method == "greedy_search":
-                hyp = greedy_search(
-                    model=model,
-                    encoder_out=encoder_out_i,
-                    max_sym_per_frame=params.max_sym_per_frame,
-                )
-            elif params.decoding_method == "beam_search":
-                hyp = beam_search(
-                    model=model,
-                    encoder_out=encoder_out_i,
-                    beam=params.beam_size,
-                )
-            else:
-                raise ValueError(
-                    f"Unsupported decoding method: {params.decoding_method}"
-                )
-            hyps.append(sp.decode(hyp).split())
-
-    if params.decoding_method == "greedy_search":
-        return {"greedy_search": hyps}
-    elif params.decoding_method == "fast_beam_search":
-        return {
-            (
-                f"beam_{params.beam}_"
-                f"max_contexts_{params.max_contexts}_"
-                f"max_states_{params.max_states}"
-            ): hyps
-        }
-    else:
-        return {f"beam_size_{params.beam_size}": hyps}
-
-
-def decode_dataset(
-    dl: torch.utils.data.DataLoader,
-    params: AttributeDict,
-    model: nn.Module,
-    sp: spm.SentencePieceProcessor,
-    decoding_graph: Optional[k2.Fsa] = None,
-) -> Dict[str, List[Tuple[List[str], List[str]]]]:
-    """Decode dataset.
-
-    Args:
-      dl:
-        PyTorch's dataloader containing the dataset to decode.
-      params:
-        It is returned by :func:`get_params`.
-      model:
-        The neural model.
-      sp:
-        The BPE model.
-      decoding_graph:
-        The decoding graph. Can be either a `k2.trivial_graph` or HLG, Used
-        only when --decoding_method is fast_beam_search.
-    Returns:
-      Return a dict, whose key may be "greedy_search" if greedy search
-      is used, or it may be "beam_7" if beam size of 7 is used.
-      Its value is a list of tuples. Each tuple contains two elements:
-      The first is the reference transcript, and the second is the
-      predicted result.
-    """
-    num_cuts = 0
-
-    try:
-        num_batches = len(dl)
-    except TypeError:
-        num_batches = "?"
-
-    if params.decoding_method == "greedy_search":
-        log_interval = 100
-    else:
-        log_interval = 2
-
-    results = defaultdict(list)
-    for batch_idx, batch in enumerate(dl):
-        texts = batch["supervisions"]["text"]
-
-        hyps_dict = decode_one_batch(
-            params=params,
-            model=model,
-            sp=sp,
-            decoding_graph=decoding_graph,
-            batch=batch,
-        )
-
-        for name, hyps in hyps_dict.items():
-            this_batch = []
-            assert len(hyps) == len(texts)
-            for hyp_words, ref_text in zip(hyps, texts):
-                ref_words = ref_text.split()
-                this_batch.append((ref_words, hyp_words))
-
-            results[name].extend(this_batch)
-
-        num_cuts += len(texts)
-
-        if batch_idx % log_interval == 0:
-            batch_str = f"{batch_idx}/{num_batches}"
-
-            logging.info(
-                f"batch {batch_str}, cuts processed until now is {num_cuts}"
-            )
-    return results
-
-
-def save_results(
-    params: AttributeDict,
-    test_set_name: str,
-    results_dict: Dict[str, List[Tuple[List[int], List[int]]]],
-):
-    test_set_wers = dict()
-    for key, results in results_dict.items():
-        recog_path = (
-            params.res_dir / f"recogs-{test_set_name}-{key}-{params.suffix}.txt"
-        )
-        store_transcripts(filename=recog_path, texts=results)
-        logging.info(f"The transcripts are stored in {recog_path}")
-
-        # The following prints out WERs, per-word error statistics and aligned
-        # ref/hyp pairs.
-        errs_filename = (
-            params.res_dir / f"errs-{test_set_name}-{key}-{params.suffix}.txt"
-        )
-        with open(errs_filename, "w") as f:
-            wer = write_error_stats(
-                f, f"{test_set_name}-{key}", results, enable_log=True
-            )
-            test_set_wers[key] = wer
-
-        logging.info("Wrote detailed error stats to {}".format(errs_filename))
-
-    test_set_wers = sorted(test_set_wers.items(), key=lambda x: x[1])
-    errs_info = (
-        params.res_dir
-        / f"wer-summary-{test_set_name}-{key}-{params.suffix}.txt"
-    )
-    with open(errs_info, "w") as f:
-        print("settings\tWER", file=f)
-        for key, val in test_set_wers:
-            print("{}\t{}".format(key, val), file=f)
-
-    s = "\nFor {}, WER of different settings are:\n".format(test_set_name)
-    note = "\tbest for {}".format(test_set_name)
-    for key, val in test_set_wers:
-        s += "{}\t{}{}\n".format(key, val, note)
-        note = ""
-    logging.info(s)
-
-
-@torch.no_grad()
-def main():
-    parser = get_parser()
-    LibriSpeechAsrDataModule.add_arguments(parser)
-    args = parser.parse_args()
-    args.exp_dir = Path(args.exp_dir)
-
-    params = get_params()
-    params.update(vars(args))
-
-    assert params.decoding_method in (
-        "greedy_search",
-        "beam_search",
-        "fast_beam_search",
-        "modified_beam_search",
-    )
-    params.res_dir = params.exp_dir / params.decoding_method
-
-    params.suffix = f"epoch-{params.epoch}-avg-{params.avg}"
-    if "fast_beam_search" in params.decoding_method:
-        params.suffix += f"-beam-{params.beam}"
-        params.suffix += f"-max-contexts-{params.max_contexts}"
-        params.suffix += f"-max-states-{params.max_states}"
-    elif "beam_search" in params.decoding_method:
-        params.suffix += f"-beam-{params.beam_size}"
-    else:
-        params.suffix += f"-context-{params.context_size}"
-        params.suffix += f"-max-sym-per-frame-{params.max_sym_per_frame}"
-
-    setup_logger(f"{params.res_dir}/log-decode-{params.suffix}")
-    logging.info("Decoding started")
-
-    device = torch.device("cpu")
-    if torch.cuda.is_available():
-        device = torch.device("cuda", 0)
-
-    logging.info(f"Device: {device}")
-
-    sp = spm.SentencePieceProcessor()
-    sp.load(params.bpe_model)
-
-    # <blk> is defined in local/train_bpe_model.py
-    params.blank_id = sp.piece_to_id("<blk>")
-    params.unk_id = sp.piece_to_id("<unk>")
-    params.vocab_size = sp.get_piece_size()
-
-    logging.info(params)
-
-    logging.info("About to create model")
-    model = get_transducer_model(params)
-
-    if params.avg_last_n > 0:
-        filenames = find_checkpoints(params.exp_dir)[: params.avg_last_n]
-        logging.info(f"averaging {filenames}")
-        model.to(device)
-        model.load_state_dict(average_checkpoints(filenames, device=device))
-    elif params.avg == 1:
-        load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
-    else:
-        start = params.epoch - params.avg + 1
-        filenames = []
-        for i in range(start, params.epoch + 1):
-            if start >= 0:
-                filenames.append(f"{params.exp_dir}/epoch-{i}.pt")
-        logging.info(f"averaging {filenames}")
-        model.to(device)
-        model.load_state_dict(average_checkpoints(filenames, device=device))
-
-    model.to(device)
-    model.eval()
-    model.device = device
-
-    if params.decoding_method == "fast_beam_search":
-        decoding_graph = k2.trivial_graph(params.vocab_size - 1, device=device)
-    else:
-        decoding_graph = None
-
-    num_param = sum([p.numel() for p in model.parameters()])
-    logging.info(f"Number of model parameters: {num_param}")
-
-    librispeech = LibriSpeechAsrDataModule(args)
-
-    test_clean_cuts = librispeech.test_clean_cuts()
-    test_other_cuts = librispeech.test_other_cuts()
-
-    test_clean_dl = librispeech.test_dataloaders(test_clean_cuts)
-    test_other_dl = librispeech.test_dataloaders(test_other_cuts)
-
-    test_sets = ["test-clean", "test-other"]
-    test_dl = [test_clean_dl, test_other_dl]
-
-    for test_set, test_dl in zip(test_sets, test_dl):
-        results_dict = decode_dataset(
-            dl=test_dl,
-            params=params,
-            model=model,
-            sp=sp,
-            decoding_graph=decoding_graph,
-        )
-
-        save_results(
-            params=params,
-            test_set_name=test_set,
-            results_dict=results_dict,
-        )
-
-    logging.info("Done!")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/egs/librispeech/ASR/emformer_pruned_transducer_stateless/decoder.py b/egs/librispeech/ASR/emformer_pruned_transducer_stateless/decoder.py
deleted file mode 120000
index 0d5f10dc0..000000000
--- a/egs/librispeech/ASR/emformer_pruned_transducer_stateless/decoder.py
+++ /dev/null
@@ -1 +0,0 @@
-../pruned_transducer_stateless/decoder.py
\ No newline at end of file
diff --git a/egs/librispeech/ASR/emformer_pruned_transducer_stateless/emformer.py b/egs/librispeech/ASR/emformer_pruned_transducer_stateless/emformer.py
deleted file mode 100644
index 0f4aad163..000000000
--- a/egs/librispeech/ASR/emformer_pruned_transducer_stateless/emformer.py
+++ /dev/null
@@ -1,1632 +0,0 @@
-# Copyright      2022  Xiaomi Corporation     (Author: Zengwei Yao)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# It is modified based on
-# https://github.com/pytorch/audio/blob/main/torchaudio/models/emformer.py.
-
-import math
-import warnings
-from typing import List, Optional, Tuple
-
-import torch
-import torch.nn as nn
-from encoder_interface import EncoderInterface
-from subsampling import Conv2dSubsampling, VggSubsampling
-
-from icefall.utils import make_pad_mask
-
-
-def _get_activation_module(activation: str) -> nn.Module:
-    if activation == "relu":
-        return nn.ReLU()
-    elif activation == "gelu":
-        return nn.GELU()
-    elif activation == "silu":
-        return nn.SiLU()
-    else:
-        raise ValueError(f"Unsupported activation {activation}")
-
-
-def _gen_attention_mask_block(
-    col_widths: List[int],
-    col_mask: List[bool],
-    num_rows: int,
-    device: torch.device,
-) -> torch.Tensor:
-    assert len(col_widths) == len(
-        col_mask
-    ), "Length of col_widths must match that of col_mask"
-
-    mask_block = [
-        torch.ones(num_rows, col_width, device=device)
-        if is_ones_col
-        else torch.zeros(num_rows, col_width, device=device)
-        for col_width, is_ones_col in zip(col_widths, col_mask)
-    ]
-    return torch.cat(mask_block, dim=1)
-
-
-def unstack_states(
-    states: List[List[torch.Tensor]],
-) -> List[List[List[torch.Tensor]]]:
-    """Unstack the emformer state corresponding to a batch of utterances
-    into a list of states, were the i-th entry is the state from the i-th
-    utterance in the batch.
-
-    Args:
-      states:
-        A list-of-list of tensors. ``len(states)`` equals to number of
-        layers in the emformer. ``states[i]]`` contains the states for
-        the i-th layer. ``states[i][k]`` is either a 3-D tensor of shape
-        ``(T, N, C)`` or a 2-D tensor of shape ``(C, N)``
-    """
-    batch_size = states[0][0].size(1)
-    num_layers = len(states)
-
-    ans = [None] * batch_size
-    for i in range(batch_size):
-        ans[i] = [[] for _ in range(num_layers)]
-
-    for li, layer in enumerate(states):
-        for s in layer:
-            s_list = s.unbind(dim=1)
-            # We will use stack(dim=1) later in stack_states()
-            for bi, b in enumerate(ans):
-                b[li].append(s_list[bi])
-    return ans
-
-
-def stack_states(
-    state_list: List[List[List[torch.Tensor]]],
-) -> List[List[torch.Tensor]]:
-    """Stack list of emformer states that correspond to separate utterances
-    into a single emformer state so that it can be used as an input for
-    emformer when those utterances are formed into a batch.
-
-    Note:
-      It is the inverse of :func:`unstack_states`.
-
-    Args:
-      state_list:
-        Each element in state_list corresponding to the internal state
-        of the emformer model for a single utterance.
-    Returns:
-      Return a new state corresponding to a batch of utterances.
-      See the input argument of :func:`unstack_states` for the meaning
-      of the returned tensor.
-    """
-    batch_size = len(state_list)
-    ans = []
-    for layer in state_list[0]:
-        # layer is a list of tensors
-        if batch_size > 1:
-            ans.append([[s] for s in layer])
-            # Note: We will stack ans[layer][s][] later to get ans[layer][s]
-        else:
-            ans.append([s.unsqueeze(1) for s in layer])
-
-    for b, states in enumerate(state_list[1:], 1):
-        for li, layer in enumerate(states):
-            for si, s in enumerate(layer):
-                ans[li][si].append(s)
-                if b == batch_size - 1:
-                    ans[li][si] = torch.stack(ans[li][si], dim=1)
-                    # We will use unbind(dim=1) later in unstack_states()
-    return ans
-
-
-class EmformerAttention(nn.Module):
-    r"""Emformer layer attention module.
-
-    Args:
-      embed_dim (int):
-        Embedding dimension.
-      nhead (int):
-        Number of attention heads in each Emformer layer.
-      dropout (float):
-        A Dropout layer on attn_output_weights. (Default: 0.0)
-      tanh_on_mem (bool, optional):
-        If ``True``, applies tanh to memory elements. (Default: ``False``)
-      negative_inf (float, optional):
-        Value to use for negative infinity in attention weights. (Default: -1e8)
-    """
-
-    def __init__(
-        self,
-        embed_dim: int,
-        nhead: int,
-        dropout: float = 0.0,
-        tanh_on_mem: bool = False,
-        negative_inf: float = -1e8,
-    ):
-        super().__init__()
-
-        if embed_dim % nhead != 0:
-            raise ValueError(
-                f"embed_dim ({embed_dim}) is not a multiple of nhead ({nhead})."
-            )
-        self.embed_dim = embed_dim
-        self.nhead = nhead
-        self.tanh_on_mem = tanh_on_mem
-        self.negative_inf = negative_inf
-        self.head_dim = embed_dim // nhead
-
-        self.dropout = dropout
-
-        self.scaling = self.head_dim ** -0.5
-
-        self.emb_to_key_value = nn.Linear(embed_dim, 2 * embed_dim, bias=True)
-        self.emb_to_query = nn.Linear(embed_dim, embed_dim, bias=True)
-        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=True)
-
-        # linear transformation for positional encoding.
-        self.linear_pos = nn.Linear(embed_dim, embed_dim, bias=False)
-
-        # these two learnable bias are used in matrix c and matrix d
-        # as described in "Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context" Section 3.3  # noqa
-        self.pos_bias_u = nn.Parameter(torch.Tensor(nhead, self.head_dim))
-        self.pos_bias_v = nn.Parameter(torch.Tensor(nhead, self.head_dim))
-
-        self._reset_parameters()
-
-    def _reset_parameters(self) -> None:
-        nn.init.xavier_uniform_(self.emb_to_key_value.weight)
-        nn.init.constant_(self.emb_to_key_value.bias, 0.0)
-
-        nn.init.xavier_uniform_(self.emb_to_query.weight)
-        nn.init.constant_(self.emb_to_query.bias, 0.0)
-
-        nn.init.xavier_uniform_(self.out_proj.weight)
-        nn.init.constant_(self.out_proj.bias, 0.0)
-
-        nn.init.xavier_uniform_(self.linear_pos.weight)
-
-        nn.init.xavier_uniform_(self.pos_bias_u)
-        nn.init.xavier_uniform_(self.pos_bias_v)
-
-    def _gen_attention_probs(
-        self,
-        attention_weights: torch.Tensor,
-        attention_mask: torch.Tensor,
-        padding_mask: Optional[torch.Tensor],
-    ) -> torch.Tensor:
-        """Given the entire attention weights, mask out unecessary connections
-        and optionally with padding positions, to obtain underlying chunk-wise
-        attention probabilities.
-
-        B: batch size;
-        Q: length of query;
-        KV: length of key and value.
-
-        Args:
-          attention_weights (torch.Tensor):
-            Attention weights computed on the entire concatenated tensor
-            with shape (B * nhead, Q, KV).
-          attention_mask (torch.Tensor):
-            Mask tensor where chunk-wise connections are filled with `False`,
-            and other unnecessary connections are filled with `True`,
-            with shape (Q, KV).
-          padding_mask (torch.Tensor, optional):
-            Mask tensor where the padding positions are fill with `True`,
-            and other positions are filled with `False`, with shapa `(B, KV)`.
-
-        Returns:
-          A tensor of shape (B * nhead, Q, KV).
-        """
-        attention_weights_float = attention_weights.float()
-        attention_weights_float = attention_weights_float.masked_fill(
-            attention_mask.unsqueeze(0), self.negative_inf
-        )
-        if padding_mask is not None:
-            Q = attention_weights.size(1)
-            B = attention_weights.size(0) // self.nhead
-            attention_weights_float = attention_weights_float.view(
-                B, self.nhead, Q, -1
-            )
-            attention_weights_float = attention_weights_float.masked_fill(
-                padding_mask.unsqueeze(1).unsqueeze(2).to(torch.bool),
-                self.negative_inf,
-            )
-            attention_weights_float = attention_weights_float.view(
-                B * self.nhead, Q, -1
-            )
-
-        attention_probs = nn.functional.softmax(
-            attention_weights_float, dim=-1
-        ).type_as(attention_weights)
-
-        attention_probs = nn.functional.dropout(
-            attention_probs, p=self.dropout, training=self.training
-        )
-        return attention_probs
-
-    def _rel_shift(self, x: torch.Tensor) -> torch.Tensor:
-        """Compute relative positional encoding.
-
-        Args:
-          x: Input tensor, of shape (B, nhead, U, PE).
-             U is the length of query vector.
-             For non-infer mode, PE = 2 * U - 1;
-             for infer mode, PE = L + 2 * U - 1.
-
-        Returns:
-          A tensor of shape (B, nhead, U, out_len).
-          For non-infer mode, out_len = U;
-          for infer mode, out_len = L + U.
-        """
-        B, nhead, U, PE = x.size()
-        B_stride = x.stride(0)
-        nhead_stride = x.stride(1)
-        U_stride = x.stride(2)
-        PE_stride = x.stride(3)
-        out_len = PE - (U - 1)
-        return x.as_strided(
-            size=(B, nhead, U, out_len),
-            stride=(B_stride, nhead_stride, U_stride - PE_stride, PE_stride),
-            storage_offset=PE_stride * (U - 1),
-        )
-
-    def _forward_impl(
-        self,
-        utterance: torch.Tensor,
-        lengths: torch.Tensor,
-        right_context: torch.Tensor,
-        summary: torch.Tensor,
-        memory: torch.Tensor,
-        attention_mask: torch.Tensor,
-        pos_emb: torch.Tensor,
-        left_context_key: Optional[torch.Tensor] = None,
-        left_context_val: Optional[torch.Tensor] = None,
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
-        """Underlying chunk-wise attention implementation.
-
-        L: length of left_context;
-        S: length of summary;
-        M: length of memory;
-        Q: length of attention query;
-        KV: length of attention key and value.
-
-        1) Concat right_context, utterance, summary,
-        and compute query with length Q = R + U + S.
-        2) Concat memory, right_context, utterance,
-        and compute key, value with length KV = M + R + U;
-        also with left_context_key and left_context_val for infererence mode,
-        then KV = M + R + L + U.
-        3) Compute entire attention scores with above query, key, and value,
-        then apply attention_mask to get underlying chunk-wise attention scores.
-
-        Args:
-          utterance (torch.Tensor):
-            Utterance frames, with shape (U, B, D).
-          lengths (torch.Tensor):
-            With shape (B,) and i-th element representing
-            number of valid frames for i-th batch element in utterance.
-          right_context (torch.Tensor):
-            Right context frames, with shape (R, B, D).
-          summary (torch.Tensor):
-            Summary elements, with shape (S, B, D).
-          memory (torch.Tensor):
-            Memory elements, with shape (M, B, D).
-          attention_mask (torch.Tensor):
-            Attention mask for underlying attention, with shape (Q, KV).
-          pos_emb (torch.Tensor):
-            Position encoding embedding, with shape (PE, D).
-            For training mode, PE = 2 * U - 1;
-            For inference mode, PE = L + 2 * U - 1.
-          left_context_key (torch,Tensor, optional):
-            Cached attention key of left context from preceding computation,
-            with shape (L, B, D). It is used for inference mode.
-          left_context_val (torch.Tensor, optional):
-            Cached attention value of left context from preceding computation,
-            with shape (L, B, D). It is used for inference mode.
-
-        Returns:
-          A tuple containing 4 tensors:
-            - output of right context and utterance, with shape (R + U, B, D).
-            - memory output, with shape (S, B, D).
-            - attention key, with shape (KV, B, D).
-            - attention value, with shape (KV, B, D).
-        """
-        U, B, _ = utterance.size()
-        R = right_context.size(0)
-        M = memory.size(0)
-
-        # compute query with [right context, utterance, summary].
-        query = self.emb_to_query(
-            torch.cat([right_context, utterance, summary])
-        )
-        # compute key and value with [mems, right context, utterance].
-        key, value = self.emb_to_key_value(
-            torch.cat([memory, right_context, utterance])
-        ).chunk(chunks=2, dim=2)
-
-        if left_context_key is not None and left_context_val is not None:
-            # compute key and value with
-            # [mems, right context, left context, uttrance]
-            key = torch.cat([key[: M + R], left_context_key, key[M + R :]])
-            value = torch.cat(
-                [value[: M + R], left_context_val, value[M + R :]]
-            )
-        Q = query.size(0)
-        KV = key.size(0)
-
-        reshaped_key, reshaped_value = [
-            tensor.contiguous()
-            .view(KV, B * self.nhead, self.head_dim)
-            .transpose(0, 1)
-            for tensor in [key, value]
-        ]  # both of shape (B * nhead, KV, head_dim)
-        reshaped_query = query.contiguous().view(
-            Q, B, self.nhead, self.head_dim
-        )
-
-        # compute attention score
-        # first compute attention matrix a and matrix c
-        # as described in "Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context" Section 3.3  # noqa
-        query_with_bais_u = (
-            (reshaped_query + self.pos_bias_u)
-            .view(Q, B * self.nhead, self.head_dim)
-            .transpose(0, 1)
-        )
-        matrix_ac = torch.bmm(
-            query_with_bais_u, reshaped_key.transpose(1, 2)
-        )  # (B * nhead, Q, KV)
-
-        # second, compute attention matrix b and matrix d
-        # relative positional encoding is applied on the part of attention
-        # between query: [utterance] -> key, value: [left_context, utterance]
-        utterance_with_bais_v = (
-            reshaped_query[R : R + U] + self.pos_bias_v
-        ).permute(1, 2, 0, 3)
-        # (B, nhead, U, head_dim)
-        PE = pos_emb.size(0)
-        if left_context_key is not None and left_context_val is not None:
-            L = left_context_key.size(0)
-            assert PE == L + 2 * U - 1
-        else:
-            assert PE == 2 * U - 1
-        pos_emb = (
-            self.linear_pos(pos_emb)
-            .view(PE, self.nhead, self.head_dim)
-            .transpose(0, 1)
-            .unsqueeze(0)
-        )  # (1, nhead, PE, head_dim)
-        matrix_bd_utterance = torch.matmul(
-            utterance_with_bais_v, pos_emb.transpose(-2, -1)
-        )  # (B, nhead, U, PE)
-        # rel-shift operation
-        matrix_bd_utterance = self._rel_shift(matrix_bd_utterance)
-        # (B, nhead, U, U) for training mode;
-        # (B, nhead, U, L + U) for inference mode.
-        matrix_bd_utterance = matrix_bd_utterance.contiguous().view(
-            B * self.nhead, U, -1
-        )
-        matrix_bd = torch.zeros_like(matrix_ac)
-        matrix_bd[:, R : R + U, M + R :] = matrix_bd_utterance
-
-        attention_weights = (matrix_ac + matrix_bd) * self.scaling
-
-        # compute padding mask
-        if B == 1:
-            padding_mask = None
-        else:
-            padding_mask = make_pad_mask(KV - U + lengths)
-
-        # compute attention probabilities
-        attention_probs = self._gen_attention_probs(
-            attention_weights, attention_mask, padding_mask
-        )
-
-        # compute attention outputs
-        attention = torch.bmm(attention_probs, reshaped_value)
-        assert attention.shape == (B * self.nhead, Q, self.head_dim)
-        attention = (
-            attention.transpose(0, 1).contiguous().view(Q, B, self.embed_dim)
-        )
-
-        # apply output projection
-        outputs = self.out_proj(attention)
-
-        output_right_context_utterance = outputs[: R + U]
-        output_memory = outputs[R + U :]
-        if self.tanh_on_mem:
-            output_memory = torch.tanh(output_memory)
-        else:
-            output_memory = torch.clamp(output_memory, min=-10, max=10)
-
-        return output_right_context_utterance, output_memory, key, value
-
-    def forward(
-        self,
-        utterance: torch.Tensor,
-        lengths: torch.Tensor,
-        right_context: torch.Tensor,
-        summary: torch.Tensor,
-        memory: torch.Tensor,
-        attention_mask: torch.Tensor,
-        pos_emb: torch.Tensor,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        # TODO: Modify docs.
-        """Forward pass for training.
-
-        B: batch size;
-        D: embedding dimension;
-        R: length of right_context;
-        U: length of utterance;
-        S: length of summary;
-        M: length of memory.
-
-        Args:
-          utterance (torch.Tensor):
-            Utterance frames, with shape (U, B, D).
-          lengths (torch.Tensor):
-            With shape (B,) and i-th element representing
-            number of valid frames for i-th batch element in utterance.
-          right_context (torch.Tensor):
-            Right context frames, with shape (R, B, D).
-          summary (torch.Tensor):
-            Summary elements with shape (S, B, D) or an empty tensor.
-          memory (torch.Tensor):
-            Memory elements, with shape (M, B, D).
-          attention_mask (torch.Tensor):
-            Attention mask for underlying chunk-wise attention,
-            with shape (Q, KV), where Q = R + U + S, KV = M + R + U.
-          pos_emb (torch.Tensor):
-            Position encoding embedding, with shape (PE, D).
-            where PE = 2 * U - 1.
-
-        Returns:
-          A tuple containing 2 tensors:
-            - output of right context and utterance, with shape (R + U, B, D).
-            - memory output, with shape (M, B, D), where M = S - 1 or M = 0.
-        """
-        (
-            output_right_context_utterance,
-            output_memory,
-            _,
-            _,
-        ) = self._forward_impl(
-            utterance,
-            lengths,
-            right_context,
-            summary,
-            memory,
-            attention_mask,
-            pos_emb,
-        )
-        return output_right_context_utterance, output_memory[:-1]
-
-    @torch.jit.export
-    def infer(
-        self,
-        utterance: torch.Tensor,
-        lengths: torch.Tensor,
-        right_context: torch.Tensor,
-        summary: torch.Tensor,
-        memory: torch.Tensor,
-        left_context_key: torch.Tensor,
-        left_context_val: torch.Tensor,
-        pos_emb: torch.Tensor,
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
-        """Forward pass for inference.
-
-        B: batch size;
-        D: embedding dimension;
-        R: length of right_context;
-        U: length of utterance;
-        L: length of left_context;
-        S: length of summary;
-        M: length of memory;
-
-        Args:
-          utterance (torch.Tensor):
-            Utterance frames, with shape (U, B, D).
-          lengths (torch.Tensor):
-            With shape (B,) and i-th element representing
-            number of valid frames for i-th batch element in utterance.
-          right_context (torch.Tensor):
-            Right context frames, with shape (R, B, D).
-          summary (torch.Tensor):
-            Summary element with shape (1, B, D), or an empty tensor.
-          memory (torch.Tensor):
-            Memory elements, with shape (M, B, D).
-          left_context_key (torch,Tensor):
-            Cached attention key of left context from preceding computation,
-            with shape (L, B, D).
-          left_context_val (torch.Tensor):
-            Cached attention value of left context from preceding computation,
-            with shape (L, B, D).
-          pos_emb (torch.Tensor):
-            Position encoding embedding, with shape (PE, D),
-            where PE = L + 2 * U - 1.
-
-        Returns:
-          A tuple containing 4 tensors:
-            - output of right context and utterance, with shape (R + U, B, D).
-            - memory output, with shape (1, B, D) or (0, B, D).
-            - attention key of left context and utterance, which would be cached
-              for next computation, with shape (L + U, B, D).
-            - attention value of left context and utterance, which would be
-              cached for next computation, with shape (L + U, B, D).
-        """
-        U = utterance.size(0)
-        R = right_context.size(0)
-        L = left_context_key.size(0)
-        S = summary.size(0)
-        M = memory.size(0)
-
-        # query: [right context, utterance, summary]
-        Q = R + U + S
-        # key, value: [memory, right context, left context, uttrance]
-        KV = M + R + L + U
-        attention_mask = torch.zeros(Q, KV).to(
-            dtype=torch.bool, device=utterance.device
-        )
-        # disallow attention bettween the summary vector with the memory bank
-        attention_mask[-1, : memory.size(0)] = True
-        (
-            output_right_context_utterance,
-            output_memory,
-            key,
-            value,
-        ) = self._forward_impl(
-            utterance,
-            lengths,
-            right_context,
-            summary,
-            memory,
-            attention_mask,
-            pos_emb,
-            left_context_key=left_context_key,
-            left_context_val=left_context_val,
-        )
-        return (
-            output_right_context_utterance,
-            output_memory,
-            key[M + R :],
-            value[M + R :],
-        )
-
-
-class EmformerLayer(nn.Module):
-    """Emformer layer that constitutes Emformer.
-
-    Args:
-      d_model (int):
-        Input dimension.
-      nhead (int):
-        Number of attention heads.
-      dim_feedforward (int):
-        Hidden layer dimension of feedforward network.
-      chunk_length (int):
-        Length of each input segment.
-      dropout (float, optional):
-        Dropout probability. (Default: 0.0)
-      activation (str, optional):
-        Activation function to use in feedforward network.
-        Must be one of ("relu", "gelu", "silu"). (Default: "relu")
-      left_context_length (int, optional):
-        Length of left context. (Default: 0)
-      max_memory_size (int, optional):
-        Maximum number of memory elements to use. (Default: 0)
-        (Default: ``None``)
-      tanh_on_mem (bool, optional):
-        If ``True``, applies tanh to memory elements. (Default: ``False``)
-      negative_inf (float, optional):
-        Value to use for negative infinity in attention weights. (Default: -1e8)
-    """
-
-    def __init__(
-        self,
-        d_model: int,
-        nhead: int,
-        dim_feedforward: int,
-        chunk_length: int,
-        dropout: float = 0.0,
-        activation: str = "relu",
-        left_context_length: int = 0,
-        max_memory_size: int = 0,
-        tanh_on_mem: bool = False,
-        negative_inf: float = -1e8,
-    ):
-        super().__init__()
-
-        self.attention = EmformerAttention(
-            embed_dim=d_model,
-            nhead=nhead,
-            dropout=dropout,
-            tanh_on_mem=tanh_on_mem,
-            negative_inf=negative_inf,
-        )
-        self.dropout = nn.Dropout(dropout)
-        self.summary_op = nn.AvgPool1d(
-            kernel_size=chunk_length, stride=chunk_length, ceil_mode=True
-        )
-
-        activation_module = _get_activation_module(activation)
-        self.pos_ff = nn.Sequential(
-            nn.LayerNorm(d_model),
-            nn.Linear(d_model, dim_feedforward),
-            activation_module,
-            nn.Dropout(dropout),
-            nn.Linear(dim_feedforward, d_model),
-            nn.Dropout(dropout),
-        )
-        self.layer_norm_input = nn.LayerNorm(d_model)
-        self.layer_norm_output = nn.LayerNorm(d_model)
-
-        self.left_context_length = left_context_length
-        self.chunk_length = chunk_length
-        self.max_memory_size = max_memory_size
-        self.d_model = d_model
-
-        self.use_memory = max_memory_size > 0
-
-    def _init_state(
-        self, batch_size: int, device: Optional[torch.device]
-    ) -> List[torch.Tensor]:
-        """Initialize states with zeros."""
-        empty_memory = torch.zeros(
-            self.max_memory_size, batch_size, self.d_model, device=device
-        )
-        left_context_key = torch.zeros(
-            self.left_context_length, batch_size, self.d_model, device=device
-        )
-        left_context_val = torch.zeros(
-            self.left_context_length, batch_size, self.d_model, device=device
-        )
-        past_length = torch.zeros(
-            1, batch_size, dtype=torch.int32, device=device
-        )
-        return [empty_memory, left_context_key, left_context_val, past_length]
-
-    def _unpack_state(
-        self, state: List[torch.Tensor]
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        """Unpack cached states including:
-        1) output memory from previous chunks in the lower layer;
-        2) attention key and value of left context from proceeding chunk's
-        computation.
-        """
-        past_length = state[3][0][0].item()
-        past_left_context_length = min(self.left_context_length, past_length)
-        past_memory_length = min(
-            self.max_memory_size, math.ceil(past_length / self.chunk_length)
-        )
-        memory_start_idx = self.max_memory_size - past_memory_length
-        pre_memory = state[0][memory_start_idx:]
-        left_context_start_idx = (
-            self.left_context_length - past_left_context_length
-        )
-        left_context_key = state[1][left_context_start_idx:]
-        left_context_val = state[2][left_context_start_idx:]
-        return pre_memory, left_context_key, left_context_val
-
-    def _pack_state(
-        self,
-        next_key: torch.Tensor,
-        next_val: torch.Tensor,
-        update_length: int,
-        memory: torch.Tensor,
-        state: List[torch.Tensor],
-    ) -> List[torch.Tensor]:
-        """Pack updated states including:
-        1) output memory of current chunk in the lower layer;
-        2) attention key and value in current chunk's computation, which would
-        be resued in next chunk's computation.
-        3) length of current chunk.
-        """
-        new_memory = torch.cat([state[0], memory])
-        new_key = torch.cat([state[1], next_key])
-        new_val = torch.cat([state[2], next_val])
-        memory_start_idx = new_memory.size(0) - self.max_memory_size
-        state[0] = new_memory[memory_start_idx:]
-        key_start_idx = new_key.size(0) - self.left_context_length
-        state[1] = new_key[key_start_idx:]
-        val_start_idx = new_val.size(0) - self.left_context_length
-        state[2] = new_val[val_start_idx:]
-        state[3] = state[3] + update_length
-        return state
-
-    def _apply_pre_attention_layer_norm(
-        self, utterance: torch.Tensor, right_context: torch.Tensor
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        """Apply layer normalization before attention."""
-        layer_norm_input = self.layer_norm_input(
-            torch.cat([right_context, utterance])
-        )
-        R = right_context.size(0)
-        layer_norm_utterance = layer_norm_input[R:]
-        layer_norm_right_context = layer_norm_input[:R]
-        return layer_norm_utterance, layer_norm_right_context
-
-    def _apply_post_attention_ffn_layer_norm(
-        self,
-        output_right_context_utterance: torch.Tensor,
-        utterance: torch.Tensor,
-        right_context: torch.Tensor,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        """Apply feed forward and layer normalization after attention."""
-        # apply residual connection between input and attention output.
-        result = self.dropout(output_right_context_utterance) + torch.cat(
-            [right_context, utterance]
-        )
-        # apply feedforward module and residual connection.
-        result = self.pos_ff(result) + result
-        # apply layer normalization for output.
-        result = self.layer_norm_output(result)
-
-        R = right_context.size(0)
-        output_utterance = result[R:]
-        output_right_context = result[:R]
-        return output_utterance, output_right_context
-
-    def _apply_attention_forward(
-        self,
-        utterance: torch.Tensor,
-        lengths: torch.Tensor,
-        right_context: torch.Tensor,
-        memory: torch.Tensor,
-        attention_mask: Optional[torch.Tensor],
-        pos_emb: torch.Tensor,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        """Apply attention in non-infer mode."""
-        if attention_mask is None:
-            raise ValueError(
-                "attention_mask must be not None in non-infer mode. "
-            )
-        if self.use_memory:
-            summary = self.summary_op(utterance.permute(1, 2, 0)).permute(
-                2, 0, 1
-            )
-        else:
-            summary = torch.empty(0).to(
-                dtype=utterance.dtype, device=utterance.device
-            )
-        output_right_context_utterance, output_memory = self.attention(
-            utterance=utterance,
-            lengths=lengths,
-            right_context=right_context,
-            summary=summary,
-            memory=memory,
-            attention_mask=attention_mask,
-            pos_emb=pos_emb,
-        )
-        return output_right_context_utterance, output_memory
-
-    def _apply_attention_infer(
-        self,
-        utterance: torch.Tensor,
-        lengths: torch.Tensor,
-        right_context: torch.Tensor,
-        memory: torch.Tensor,
-        pos_emb: torch.Tensor,
-        state: Optional[List[torch.Tensor]] = None,
-    ) -> Tuple[torch.Tensor, torch.Tensor, List[torch.Tensor]]:
-        """Apply attention in infer mode.
-        1) Unpack cached states including:
-          - memory from previous chunks in the lower layer;
-          - attention key and value of left context from proceeding
-            chunk's compuation;
-        2) Apply attention computation;
-        3) Pack updated states including:
-          - output memory of current chunk in the lower layer;
-          - attention key and value in current chunk's computation, which would
-            be resued in next chunk's computation.
-          - length of current chunk.
-        """
-        if state is None:
-            state = self._init_state(utterance.size(1), device=utterance.device)
-        pre_memory, left_context_key, left_context_val = self._unpack_state(
-            state
-        )
-        if self.use_memory:
-            summary = self.summary_op(utterance.permute(1, 2, 0)).permute(
-                2, 0, 1
-            )
-            summary = summary[:1]
-        else:
-            summary = torch.empty(0).to(
-                dtype=utterance.dtype, device=utterance.device
-            )
-        # pos_emb is of shape [PE, D], where PE = L + 2 * U - 1,
-        # for query of [utterance] (i), key-value [left_context, utterance] (j),
-        # the max relative distance i - j is L + U - 1
-        # the min relative distance i - j is -(U - 1)
-        L = left_context_key.size(0)  # L <= left_context_length
-        U = utterance.size(0)
-        PE = L + 2 * U - 1
-        tot_PE = self.left_context_length + 2 * U - 1
-        assert pos_emb.size(0) == tot_PE
-        pos_emb = pos_emb[tot_PE - PE :]
-        (
-            output_right_context_utterance,
-            output_memory,
-            next_key,
-            next_val,
-        ) = self.attention.infer(
-            utterance=utterance,
-            lengths=lengths,
-            right_context=right_context,
-            summary=summary,
-            memory=pre_memory,
-            left_context_key=left_context_key,
-            left_context_val=left_context_val,
-            pos_emb=pos_emb,
-        )
-        state = self._pack_state(
-            next_key, next_val, utterance.size(0), memory, state
-        )
-        return output_right_context_utterance, output_memory, state
-
-    def forward(
-        self,
-        utterance: torch.Tensor,
-        lengths: torch.Tensor,
-        right_context: torch.Tensor,
-        memory: torch.Tensor,
-        attention_mask: torch.Tensor,
-        pos_emb: torch.Tensor,
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        r"""Forward pass for training.
-        1) Apply layer normalization on input utterance and right context
-        before attention;
-        2) Apply attention module, compute updated utterance, right context,
-        and memory;
-        3) Apply feed forward module and layer normalization on output utterance
-        and right context.
-
-        B: batch size;
-        D: embedding dimension;
-        R: length of right_context;
-        U: length of utterance;
-        M: length of memory.
-
-        Args:
-          utterance (torch.Tensor):
-            Utterance frames, with shape (U, B, D).
-          lengths (torch.Tensor):
-            With shape (B,) and i-th element representing
-            number of valid frames for i-th batch element in utterance.
-          right_context (torch.Tensor):
-            Right context frames, with shape (R, B, D).
-          memory (torch.Tensor):
-            Memory elements, with shape (M, B, D).
-          attention_mask (torch.Tensor):
-            Attention mask for underlying attention module,
-            with shape (Q, KV), where Q = R + U + S, KV = M + R + U.
-          pos_emb (torch.Tensor):
-            Position encoding embedding, with shape (PE, D),
-            where PE = 2 * U - 1.
-
-        Returns:
-          A tuple containing 3 tensors:
-            - output utterance, with shape (U, B, D).
-            - output right context, with shape (R, B, D).
-            - output memory, with shape (M, B, D).
-        """
-        (
-            layer_norm_utterance,
-            layer_norm_right_context,
-        ) = self._apply_pre_attention_layer_norm(utterance, right_context)
-        (
-            output_right_context_utterance,
-            output_memory,
-        ) = self._apply_attention_forward(
-            layer_norm_utterance,
-            lengths,
-            layer_norm_right_context,
-            memory,
-            attention_mask,
-            pos_emb,
-        )
-        (
-            output_utterance,
-            output_right_context,
-        ) = self._apply_post_attention_ffn_layer_norm(
-            output_right_context_utterance, utterance, right_context
-        )
-        return output_utterance, output_right_context, output_memory
-
-    @torch.jit.export
-    def infer(
-        self,
-        utterance: torch.Tensor,
-        lengths: torch.Tensor,
-        right_context: torch.Tensor,
-        memory: torch.Tensor,
-        pos_emb: torch.Tensor,
-        state: Optional[List[torch.Tensor]] = None,
-    ) -> Tuple[torch.Tensor, torch.Tensor, List[torch.Tensor], torch.Tensor]:
-        """Forward pass for inference.
-
-         1) Apply layer normalization on input utterance and right context
-         before attention;
-         2) Apply attention module with cached state, compute updated utterance,
-         right context, and memory, and update state;
-         3) Apply feed forward module and layer normalization on output
-         utterance and right context.
-
-         B: batch size;
-         D: embedding dimension;
-         R: length of right_context;
-         U: length of utterance;
-         M: length of memory.
-
-        Args:
-          utterance (torch.Tensor):
-            Utterance frames, with shape (U, B, D).
-          lengths (torch.Tensor):
-            With shape (B,) and i-th element representing
-            number of valid frames for i-th batch element in utterance.
-          right_context (torch.Tensor):
-            Right context frames, with shape (R, B, D).
-          memory (torch.Tensor):
-            Memory elements, with shape (M, B, D).
-          state (List[torch.Tensor], optional):
-            List of tensors representing layer internal state generated in
-            preceding computation. (default=None)
-          pos_emb (torch.Tensor):
-            Position encoding embedding, with shape (PE, D),
-            where PE = L + 2 * U - 1.
-
-         Returns:
-           (Tensor, Tensor, List[torch.Tensor], Tensor):
-             - output utterance, with shape (U, B, D);
-             - output right_context, with shape (R, B, D);
-             - output memory, with shape (1, B, D) or (0, B, D).
-             - output state.
-        """
-        (
-            layer_norm_utterance,
-            layer_norm_right_context,
-        ) = self._apply_pre_attention_layer_norm(utterance, right_context)
-        (
-            output_right_context_utterance,
-            output_memory,
-            output_state,
-        ) = self._apply_attention_infer(
-            layer_norm_utterance,
-            lengths,
-            layer_norm_right_context,
-            memory,
-            pos_emb,
-            state,
-        )
-        (
-            output_utterance,
-            output_right_context,
-        ) = self._apply_post_attention_ffn_layer_norm(
-            output_right_context_utterance, utterance, right_context
-        )
-        return (
-            output_utterance,
-            output_right_context,
-            output_memory,
-            output_state,
-        )
-
-
-class EmformerEncoder(nn.Module):
-    """Implements the Emformer architecture introduced in
-    *Emformer: Efficient Memory Transformer Based Acoustic Model for Low Latency
-    Streaming Speech Recognition*
-    [:footcite:`shi2021emformer`].
-
-    Args:
-      d_model (int):
-        Input dimension.
-      nhead (int):
-        Number of attention heads in each emformer layer.
-      dim_feedforward (int):
-        Hidden layer dimension of each emformer layer's feedforward network.
-      num_encoder_layers (int):
-        Number of emformer layers to instantiate.
-      chunk_length (int):
-        Length of each input segment.
-      dropout (float, optional):
-        Dropout probability. (default: 0.0)
-      activation (str, optional):
-        Activation function to use in each emformer layer's feedforward network.
-        Must be one of ("relu", "gelu", "silu"). (default: "relu")
-      left_context_length (int, optional):
-        Length of left context. (default: 0)
-      right_context_length (int, optional):
-        Length of right context. (default: 0)
-      max_memory_size (int, optional):
-        Maximum number of memory elements to use. (default: 0)
-      tanh_on_mem (bool, optional):
-        If ``true``, applies tanh to memory elements. (default: ``false``)
-      negative_inf (float, optional):
-        Value to use for negative infinity in attention weights. (default: -1e8)
-    """
-
-    def __init__(
-        self,
-        chunk_length: int,
-        d_model: int = 256,
-        nhead: int = 4,
-        dim_feedforward: int = 2048,
-        num_encoder_layers: int = 12,
-        dropout: float = 0.1,
-        activation: str = "relu",
-        left_context_length: int = 0,
-        right_context_length: int = 0,
-        max_memory_size: int = 0,
-        tanh_on_mem: bool = False,
-        negative_inf: float = -1e8,
-    ):
-        super().__init__()
-
-        self.use_memory = max_memory_size > 0
-        self.init_memory_op = nn.AvgPool1d(
-            kernel_size=chunk_length,
-            stride=chunk_length,
-            ceil_mode=True,
-        )
-
-        self.emformer_layers = nn.ModuleList(
-            [
-                EmformerLayer(
-                    d_model,
-                    nhead,
-                    dim_feedforward,
-                    chunk_length,
-                    dropout=dropout,
-                    activation=activation,
-                    left_context_length=left_context_length,
-                    max_memory_size=max_memory_size,
-                    tanh_on_mem=tanh_on_mem,
-                    negative_inf=negative_inf,
-                )
-                for layer_idx in range(num_encoder_layers)
-            ]
-        )
-
-        self.encoder_pos = RelPositionalEncoding(d_model, dropout)
-
-        self.left_context_length = left_context_length
-        self.right_context_length = right_context_length
-        self.chunk_length = chunk_length
-        self.max_memory_size = max_memory_size
-
-    def _gen_right_context(self, x: torch.Tensor) -> torch.Tensor:
-        """Hard copy each chunk's right context and concat them."""
-        T = x.shape[0]
-        num_chunks = math.ceil(
-            (T - self.right_context_length) / self.chunk_length
-        )
-        right_context_blocks = []
-        for seg_idx in range(num_chunks - 1):
-            start = (seg_idx + 1) * self.chunk_length
-            end = start + self.right_context_length
-            right_context_blocks.append(x[start:end])
-        right_context_blocks.append(x[T - self.right_context_length :])  # noqa
-        return torch.cat(right_context_blocks)
-
-    def _gen_attention_mask_col_widths(
-        self, chunk_idx: int, U: int
-    ) -> List[int]:
-        """Calculate column widths (key, value) in attention mask for the
-        chunk_idx chunk."""
-        num_chunks = math.ceil(U / self.chunk_length)
-        rc = self.right_context_length
-        lc = self.left_context_length
-        rc_start = chunk_idx * rc
-        rc_end = rc_start + rc
-        chunk_start = max(chunk_idx * self.chunk_length - lc, 0)
-        chunk_end = min((chunk_idx + 1) * self.chunk_length, U)
-        R = rc * num_chunks
-
-        if self.use_memory:
-            m_start = max(chunk_idx - self.max_memory_size, 0)
-            M = num_chunks - 1
-            col_widths = [
-                m_start,  # before memory
-                chunk_idx - m_start,  # memory
-                M - chunk_idx,  # after memory
-                rc_start,  # before right context
-                rc,  # right context
-                R - rc_end,  # after right context
-                chunk_start,  # before chunk
-                chunk_end - chunk_start,  # chunk
-                U - chunk_end,  # after chunk
-            ]
-        else:
-            col_widths = [
-                rc_start,  # before right context
-                rc,  # right context
-                R - rc_end,  # after right context
-                chunk_start,  # before chunk
-                chunk_end - chunk_start,  # chunk
-                U - chunk_end,  # after chunk
-            ]
-
-        return col_widths
-
-    def _gen_attention_mask(self, utterance: torch.Tensor) -> torch.Tensor:
-        """Generate attention mask for underlying chunk-wise attention
-        computation, where chunk-wise connections are filled with `False`,
-        and other unnecessary connections beyond chunk are filled with `True`.
-
-        R: length of right_context;
-        U: length of utterance;
-        S: length of summary;
-        M: length of memory;
-        Q: length of attention query;
-        KV: length of attention key and value.
-
-        The shape of attention mask is (Q, KV).
-        If self.use_memory is `True`:
-          query = [right_context, utterance, summary];
-          key, value = [memory, right_context, utterance];
-          Q = R + U + S, KV = M + R + U.
-        Otherwise:
-          query = [right_context, utterance]
-          key, value = [right_context, utterance]
-          Q = R + U, KV = R + U.
-        """
-        U = utterance.size(0)
-        num_chunks = math.ceil(U / self.chunk_length)
-
-        right_context_mask = []
-        utterance_mask = []
-        summary_mask = []
-
-        if self.use_memory:
-            num_cols = 9
-            # right context and utterance both attend to memory, right context,
-            # utterance
-            right_context_utterance_cols_mask = [
-                idx in [1, 4, 7] for idx in range(num_cols)
-            ]
-            # summary attends to right context, utterance
-            summary_cols_mask = [idx in [4, 7] for idx in range(num_cols)]
-            masks_to_concat = [right_context_mask, utterance_mask, summary_mask]
-        else:
-            num_cols = 6
-            # right context and utterance both attend to right context and
-            # utterance
-            right_context_utterance_cols_mask = [
-                idx in [1, 4] for idx in range(num_cols)
-            ]
-            summary_cols_mask = None
-            masks_to_concat = [right_context_mask, utterance_mask]
-
-        for chunk_idx in range(num_chunks):
-            col_widths = self._gen_attention_mask_col_widths(chunk_idx, U)
-
-            right_context_mask_block = _gen_attention_mask_block(
-                col_widths,
-                right_context_utterance_cols_mask,
-                self.right_context_length,
-                utterance.device,
-            )
-            right_context_mask.append(right_context_mask_block)
-
-            utterance_mask_block = _gen_attention_mask_block(
-                col_widths,
-                right_context_utterance_cols_mask,
-                min(
-                    self.chunk_length,
-                    U - chunk_idx * self.chunk_length,
-                ),
-                utterance.device,
-            )
-            utterance_mask.append(utterance_mask_block)
-
-            if summary_cols_mask is not None:
-                summary_mask_block = _gen_attention_mask_block(
-                    col_widths, summary_cols_mask, 1, utterance.device
-                )
-                summary_mask.append(summary_mask_block)
-
-        attention_mask = (
-            1 - torch.cat([torch.cat(mask) for mask in masks_to_concat])
-        ).to(torch.bool)
-        return attention_mask
-
-    def forward(
-        self, x: torch.Tensor, lengths: torch.Tensor
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        """Forward pass for training and non-streaming inference.
-
-        B: batch size;
-        D: input dimension;
-        U: length of utterance.
-
-        Args:
-          x (torch.Tensor):
-            Utterance frames right-padded with right context frames,
-            with shape (U + right_context_length, B, D).
-          lengths (torch.Tensor):
-            With shape (B,) and i-th element representing number of valid
-            utterance frames for i-th batch element in x, which contains the
-            right_context at the end.
-
-        Returns:
-          A tuple of 2 tensors:
-            - output utterance frames, with shape (U, B, D).
-            - output_lengths, with shape (B,), without containing the
-              right_context at the end.
-        """
-        U = x.size(0) - self.right_context_length
-
-        # for query of [utterance] (i), key-value [utterance] (j),
-        # the max relative distance i - j is U - 1
-        # the min relative distance i - j is -(U - 1)
-        x, pos_emb = self.encoder_pos(x, pos_len=U, neg_len=U)
-
-        right_context = self._gen_right_context(x)
-        utterance = x[:U]
-        output_lengths = torch.clamp(lengths - self.right_context_length, min=0)
-        attention_mask = self._gen_attention_mask(utterance)
-        memory = (
-            self.init_memory_op(utterance.permute(1, 2, 0)).permute(2, 0, 1)[
-                :-1
-            ]
-            if self.use_memory
-            else torch.empty(0).to(dtype=x.dtype, device=x.device)
-        )
-
-        output = utterance
-        for layer in self.emformer_layers:
-            output, right_context, memory = layer(
-                output,
-                output_lengths,
-                right_context,
-                memory,
-                attention_mask,
-                pos_emb,
-            )
-
-        return output, output_lengths
-
-    @torch.jit.export
-    def infer(
-        self,
-        x: torch.Tensor,
-        lengths: torch.Tensor,
-        states: Optional[List[List[torch.Tensor]]] = None,
-    ) -> Tuple[torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]:
-        """Forward pass for streaming inference.
-
-        B: batch size;
-        D: input dimension;
-        U: length of utterance.
-
-        Args:
-          x (torch.Tensor):
-            Utterance frames right-padded with right context frames,
-            with shape (U + right_context_length, B, D).
-          lengths (torch.Tensor):
-            With shape (B,) and i-th element representing number of valid
-            utterance frames for i-th batch element in x, which contains the
-            right_context at the end.
-          states (List[List[torch.Tensor]], optional):
-            Cached states from proceeding chunk's computation, where each
-            element (List[torch.Tensor]) corresponding to each emformer layer.
-            (default: None)
-
-        Returns:
-          (Tensor, Tensor, List[List[torch.Tensor]]):
-            - output utterance frames, with shape (U, B, D).
-            - output lengths, with shape (B,), without containing the
-              right_context at the end.
-            - updated states from current chunk's computation.
-        """
-        assert x.size(0) == self.chunk_length + self.right_context_length, (
-            "Per configured chunk_length and right_context_length, "
-            f"expected size of {self.chunk_length + self.right_context_length} "
-            f"for dimension 1 of x, but got {x.size(1)}."
-        )
-
-        pos_len = self.chunk_length + self.left_context_length
-        neg_len = self.chunk_length
-        # for query of [utterance] (i), key-value [left_context, utterance] (j),
-        # the max relative distance i - j is L + U - 1
-        # the min relative distance i - j is -(U - 1)
-        x, pos_emb = self.encoder_pos(x, pos_len=pos_len, neg_len=neg_len)
-
-        right_context = x[self.chunk_length :]
-        utterance = x[: self.chunk_length]
-        output_lengths = torch.clamp(lengths - self.right_context_length, min=0)
-        memory = (
-            self.init_memory_op(utterance.permute(1, 2, 0)).permute(2, 0, 1)
-            if self.use_memory
-            else torch.empty(0).to(dtype=x.dtype, device=x.device)
-        )
-        output = utterance
-        output_states: List[List[torch.Tensor]] = []
-        for layer_idx, layer in enumerate(self.emformer_layers):
-            output, right_context, memory, output_state = layer.infer(
-                output,
-                output_lengths,
-                right_context,
-                memory,
-                pos_emb,
-                None if states is None else states[layer_idx],
-            )
-            output_states.append(output_state)
-
-        return output, output_lengths, output_states
-
-
-class Emformer(EncoderInterface):
-    def __init__(
-        self,
-        num_features: int,
-        output_dim: int,
-        chunk_length: int,
-        subsampling_factor: int = 4,
-        d_model: int = 256,
-        nhead: int = 4,
-        dim_feedforward: int = 2048,
-        num_encoder_layers: int = 12,
-        dropout: float = 0.1,
-        vgg_frontend: bool = False,
-        activation: str = "relu",
-        left_context_length: int = 0,
-        right_context_length: int = 0,
-        max_memory_size: int = 0,
-        tanh_on_mem: bool = False,
-        negative_inf: float = -1e8,
-    ):
-        super().__init__()
-
-        self.subsampling_factor = subsampling_factor
-        self.right_context_length = right_context_length
-        self.chunk_length = chunk_length
-        self.left_context_length = left_context_length
-        if subsampling_factor != 4:
-            raise NotImplementedError("Support only 'subsampling_factor=4'.")
-        if chunk_length % 4 != 0:
-            raise NotImplementedError("chunk_length must be a mutiple of 4.")
-        if left_context_length != 0 and left_context_length % 4 != 0:
-            raise NotImplementedError(
-                "left_context_length must be 0 or a mutiple of 4."
-            )
-        if right_context_length != 0 and right_context_length % 4 != 0:
-            raise NotImplementedError(
-                "right_context_length must be 0 or a mutiple of 4."
-            )
-
-        # self.encoder_embed converts the input of shape (N, T, num_features)
-        # to the shape (N, T//subsampling_factor, d_model).
-        # That is, it does two things simultaneously:
-        #   (1) subsampling: T -> T//subsampling_factor
-        #   (2) embedding: num_features -> d_model
-        if vgg_frontend:
-            self.encoder_embed = VggSubsampling(num_features, d_model)
-        else:
-            self.encoder_embed = Conv2dSubsampling(num_features, d_model)
-
-        self.encoder = EmformerEncoder(
-            chunk_length // 4,
-            d_model,
-            nhead,
-            dim_feedforward,
-            num_encoder_layers,
-            dropout,
-            activation,
-            left_context_length=left_context_length // 4,
-            right_context_length=right_context_length // 4,
-            max_memory_size=max_memory_size,
-            tanh_on_mem=tanh_on_mem,
-            negative_inf=negative_inf,
-        )
-
-        # TODO(fangjun): remove dropout
-        self.encoder_output_layer = nn.Sequential(
-            nn.Dropout(p=dropout), nn.Linear(d_model, output_dim)
-        )
-
-    def forward(
-        self, x: torch.Tensor, x_lens: torch.Tensor
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        """Forward pass for training and non-streaming inference.
-
-        B: batch size;
-        D: feature dimension;
-        T: length of utterance.
-
-        Args:
-          x (torch.Tensor):
-            Utterance frames right-padded with right context frames,
-            with shape (B, T, D).
-          x_lens (torch.Tensor):
-            With shape (B,) and i-th element representing number of valid
-            utterance frames for i-th batch element in x, containing the
-            right_context at the end.
-
-        Returns:
-          (Tensor, Tensor):
-            - output logits, with shape (B, T', D), where
-              T' = ((T - 1) // 2 - 1) // 2 - self.right_context_length // 4.
-            - logits lengths, with shape (B,), without containing the
-              right_context at the end.
-        """
-        x = self.encoder_embed(x)
-        x = x.permute(1, 0, 2)  # (N, T, C) -> (T, N, C)
-
-        # Caution: We assume the subsampling factor is 4!
-        with warnings.catch_warnings():
-            warnings.simplefilter("ignore")
-            x_lens = ((x_lens - 1) // 2 - 1) // 2
-        assert x.size(0) == x_lens.max().item()
-
-        output, output_lengths = self.encoder(x, x_lens)  # (T, N, C)
-
-        logits = self.encoder_output_layer(output)
-        logits = logits.permute(1, 0, 2)  # (T, N, C) ->(N, T, C)
-
-        return logits, output_lengths
-
-    @torch.jit.export
-    def infer(
-        self,
-        x: torch.Tensor,
-        x_lens: torch.Tensor,
-        states: Optional[List[List[torch.Tensor]]] = None,
-    ) -> Tuple[torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]:
-        """Forward pass for streaming inference.
-
-        B: batch size;
-        D: feature dimension;
-        T: length of utterance.
-
-        Args:
-          x (torch.Tensor):
-            Utterance frames right-padded with right context frames,
-            with shape (B, T, D).
-          lengths (torch.Tensor):
-            With shape (B,) and i-th element representing number of valid
-            utterance frames for i-th batch element in x, containing the
-            right_context at the end.
-          states (List[List[torch.Tensor]], optional):
-            Cached states from proceeding chunk's computation, where each
-            element (List[torch.Tensor]) corresponding to each emformer layer.
-            (default: None)
-        Returns:
-          (Tensor, Tensor):
-            - output logits, with shape (B, T', D), where
-              T' = ((T - 1) // 2 - 1) // 2 - self.right_context_length // 4.
-            - logits lengths, with shape (B,), without containing the
-              right_context at the end.
-            - updated states from current chunk's computation.
-        """
-        x = self.encoder_embed(x)
-        x = x.permute(1, 0, 2)  # (N, T, C) -> (T, N, C)
-
-        # Caution: We assume the subsampling factor is 4!
-        with warnings.catch_warnings():
-            warnings.simplefilter("ignore")
-            x_lens = ((x_lens - 1) // 2 - 1) // 2
-        assert x.size(0) == x_lens.max().item()
-
-        output, output_lengths, output_states = self.encoder.infer(
-            x, x_lens, states
-        )  # (T, N, C)
-
-        logits = self.encoder_output_layer(output)
-        logits = logits.permute(1, 0, 2)  # (T, N, C) ->(N, T, C)
-
-        return logits, output_lengths, output_states
-
-
-class RelPositionalEncoding(torch.nn.Module):
-    """Relative positional encoding module.
-
-    See : Appendix B in "Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context"  # noqa
-    Modified from https://github.com/espnet/espnet/blob/master/espnet/nets/pytorch_backend/transformer/embedding.py  # noqa
-
-    Args:
-        d_model: Embedding dimension.
-        dropout_rate: Dropout rate.
-        max_len: Maximum input length.
-
-    """
-
-    def __init__(
-        self, d_model: int, dropout_rate: float, max_len: int = 5000
-    ) -> None:
-        """Construct an PositionalEncoding object."""
-        super(RelPositionalEncoding, self).__init__()
-        self.d_model = d_model
-        self.xscale = math.sqrt(self.d_model)
-        self.dropout = torch.nn.Dropout(p=dropout_rate)
-        self.pe = None
-        self.pos_len = max_len
-        self.neg_len = max_len
-        self.gen_pe()
-
-    def gen_pe(self) -> None:
-        """Generate the positional encodings."""
-        # Suppose `i` means to the position of query vecotr and `j` means the
-        # position of key vector. We use position relative positions when keys
-        # are to the left (i>j) and negative relative positions otherwise (i<j).
-        pe_positive = torch.zeros(self.pos_len, self.d_model)
-        pe_negative = torch.zeros(self.neg_len, self.d_model)
-        position_positive = torch.arange(
-            0, self.pos_len, dtype=torch.float32
-        ).unsqueeze(1)
-        position_negative = torch.arange(
-            0, self.neg_len, dtype=torch.float32
-        ).unsqueeze(1)
-        div_term = torch.exp(
-            torch.arange(0, self.d_model, 2, dtype=torch.float32)
-            * -(math.log(10000.0) / self.d_model)
-        )
-        pe_positive[:, 0::2] = torch.sin(position_positive * div_term)
-        pe_positive[:, 1::2] = torch.cos(position_positive * div_term)
-        pe_negative[:, 0::2] = torch.sin(-1 * position_negative * div_term)
-        pe_negative[:, 1::2] = torch.cos(-1 * position_negative * div_term)
-
-        # Reserve the order of positive indices and concat both positive and
-        # negative indices. This is used to support the shifting trick
-        # as in "Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context"  # noqa
-        self.pe_positive = torch.flip(pe_positive, [0])
-        self.pe_negative = pe_negative
-        # self.pe = torch.cat([pe_positive, pe_negative], dim=1)
-
-    def get_pe(
-        self,
-        pos_len: int,
-        neg_len: int,
-        device: torch.device,
-        dtype: torch.dtype,
-    ) -> torch.Tensor:
-        """Get positional encoding given positive length and negative length."""
-        if self.pe_positive.dtype != dtype or str(
-            self.pe_positive.device
-        ) != str(device):
-            self.pe_positive = self.pe_positive.to(dtype=dtype, device=device)
-        if self.pe_negative.dtype != dtype or str(
-            self.pe_negative.device
-        ) != str(device):
-            self.pe_negative = self.pe_negative.to(dtype=dtype, device=device)
-        pe = torch.cat(
-            [
-                self.pe_positive[self.pos_len - pos_len :],
-                self.pe_negative[1:neg_len],
-            ],
-            dim=0,
-        )
-        return pe
-
-    def forward(
-        self,
-        x: torch.Tensor,
-        pos_len: int,
-        neg_len: int,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        """Add positional encoding.
-
-        Args:
-            x (torch.Tensor): Input tensor (batch, time, `*`).
-
-        Returns:
-            torch.Tensor: Encoded tensor (batch, time, `*`).
-            torch.Tensor: Encoded tensor (batch, 2*time-1, `*`).
-
-        """
-        x = x * self.xscale
-        if pos_len > self.pos_len or neg_len > self.neg_len:
-            self.pos_len = pos_len
-            self.neg_len = neg_len
-            self.gen_pe()
-        pos_emb = self.get_pe(pos_len, neg_len, x.device, x.dtype)
-        return self.dropout(x), self.dropout(pos_emb)
diff --git a/egs/librispeech/ASR/emformer_pruned_transducer_stateless/encoder_interface.py b/egs/librispeech/ASR/emformer_pruned_transducer_stateless/encoder_interface.py
deleted file mode 120000
index aa5d0217a..000000000
--- a/egs/librispeech/ASR/emformer_pruned_transducer_stateless/encoder_interface.py
+++ /dev/null
@@ -1 +0,0 @@
-../transducer_stateless/encoder_interface.py
\ No newline at end of file
diff --git a/egs/librispeech/ASR/emformer_pruned_transducer_stateless/joiner.py b/egs/librispeech/ASR/emformer_pruned_transducer_stateless/joiner.py
deleted file mode 120000
index 81ad47c55..000000000
--- a/egs/librispeech/ASR/emformer_pruned_transducer_stateless/joiner.py
+++ /dev/null
@@ -1 +0,0 @@
-../pruned_transducer_stateless/joiner.py
\ No newline at end of file
diff --git a/egs/librispeech/ASR/emformer_pruned_transducer_stateless/model.py b/egs/librispeech/ASR/emformer_pruned_transducer_stateless/model.py
deleted file mode 120000
index a61a0a23f..000000000
--- a/egs/librispeech/ASR/emformer_pruned_transducer_stateless/model.py
+++ /dev/null
@@ -1 +0,0 @@
-../pruned_transducer_stateless/model.py
\ No newline at end of file
diff --git a/egs/librispeech/ASR/emformer_pruned_transducer_stateless/noam.py b/egs/librispeech/ASR/emformer_pruned_transducer_stateless/noam.py
deleted file mode 100644
index e46bf35fb..000000000
--- a/egs/librispeech/ASR/emformer_pruned_transducer_stateless/noam.py
+++ /dev/null
@@ -1,104 +0,0 @@
-# Copyright    2021 University of Chinese Academy of Sciences (author: Han Zhu)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-
-
-class Noam(object):
-    """
-    Implements Noam optimizer.
-
-    Proposed in
-    "Attention Is All You Need", https://arxiv.org/pdf/1706.03762.pdf
-
-    Modified from
-    https://github.com/espnet/espnet/blob/master/espnet/nets/pytorch_backend/transformer/optimizer.py  # noqa
-
-    Args:
-      params:
-        iterable of parameters to optimize or dicts defining parameter groups
-      model_size:
-        attention dimension of the transformer model
-      factor:
-        learning rate factor
-      warm_step:
-        warmup steps
-    """
-
-    def __init__(
-        self,
-        params,
-        model_size: int = 256,
-        factor: float = 10.0,
-        warm_step: int = 25000,
-        weight_decay=0,
-    ) -> None:
-        """Construct an Noam object."""
-        self.optimizer = torch.optim.Adam(
-            params, lr=0, betas=(0.9, 0.98), eps=1e-9, weight_decay=weight_decay
-        )
-        self._step = 0
-        self.warmup = warm_step
-        self.factor = factor
-        self.model_size = model_size
-        self._rate = 0
-
-    @property
-    def param_groups(self):
-        """Return param_groups."""
-        return self.optimizer.param_groups
-
-    def step(self):
-        """Update parameters and rate."""
-        self._step += 1
-        rate = self.rate()
-        for p in self.optimizer.param_groups:
-            p["lr"] = rate
-        self._rate = rate
-        self.optimizer.step()
-
-    def rate(self, step=None):
-        """Implement `lrate` above."""
-        if step is None:
-            step = self._step
-        return (
-            self.factor
-            * self.model_size ** (-0.5)
-            * min(step ** (-0.5), step * self.warmup ** (-1.5))
-        )
-
-    def zero_grad(self):
-        """Reset gradient."""
-        self.optimizer.zero_grad()
-
-    def state_dict(self):
-        """Return state_dict."""
-        return {
-            "_step": self._step,
-            "warmup": self.warmup,
-            "factor": self.factor,
-            "model_size": self.model_size,
-            "_rate": self._rate,
-            "optimizer": self.optimizer.state_dict(),
-        }
-
-    def load_state_dict(self, state_dict):
-        """Load state_dict."""
-        for key, value in state_dict.items():
-            if key == "optimizer":
-                self.optimizer.load_state_dict(state_dict["optimizer"])
-            else:
-                setattr(self, key, value)
diff --git a/egs/librispeech/ASR/emformer_pruned_transducer_stateless/subsampling.py b/egs/librispeech/ASR/emformer_pruned_transducer_stateless/subsampling.py
deleted file mode 120000
index 6fee09e58..000000000
--- a/egs/librispeech/ASR/emformer_pruned_transducer_stateless/subsampling.py
+++ /dev/null
@@ -1 +0,0 @@
-../conformer_ctc/subsampling.py
\ No newline at end of file
diff --git a/egs/librispeech/ASR/emformer_pruned_transducer_stateless/test_emformer.py b/egs/librispeech/ASR/emformer_pruned_transducer_stateless/test_emformer.py
deleted file mode 100644
index b2b1000cc..000000000
--- a/egs/librispeech/ASR/emformer_pruned_transducer_stateless/test_emformer.py
+++ /dev/null
@@ -1,804 +0,0 @@
-import torch
-
-
-def test_emformer_attention_forward():
-    from emformer import EmformerAttention
-
-    B, D = 2, 256
-    chunk_length = 4
-    right_context_length = 2
-    num_chunks = 3
-    U = num_chunks * chunk_length
-    R = num_chunks * right_context_length
-    attention = EmformerAttention(embed_dim=D, nhead=8)
-
-    for use_memory in [True, False]:
-        if use_memory:
-            S = num_chunks
-            M = S - 1
-        else:
-            S, M = 0, 0
-
-        Q, KV = R + U + S, M + R + U
-        utterance = torch.randn(U, B, D)
-        lengths = torch.randint(1, U + 1, (B,))
-        lengths[0] = U
-        right_context = torch.randn(R, B, D)
-        summary = torch.randn(S, B, D)
-        memory = torch.randn(M, B, D)
-        attention_mask = torch.rand(Q, KV) >= 0.5
-        PE = 2 * U - 1
-        pos_emb = torch.randn(PE, D)
-
-        output_right_context_utterance, output_memory = attention(
-            utterance,
-            lengths,
-            right_context,
-            summary,
-            memory,
-            attention_mask,
-            pos_emb,
-        )
-        assert output_right_context_utterance.shape == (R + U, B, D)
-        assert output_memory.shape == (M, B, D)
-
-
-def test_emformer_attention_infer():
-    from emformer import EmformerAttention
-
-    B, D = 2, 256
-    U = 4
-    R = 2
-    L = 3
-    attention = EmformerAttention(embed_dim=D, nhead=8)
-
-    for use_memory in [True, False]:
-        if use_memory:
-            S, M = 1, 3
-        else:
-            S, M = 0, 0
-
-        utterance = torch.randn(U, B, D)
-        lengths = torch.randint(1, U + 1, (B,))
-        lengths[0] = U
-        right_context = torch.randn(R, B, D)
-        summary = torch.randn(S, B, D)
-        memory = torch.randn(M, B, D)
-        left_context_key = torch.randn(L, B, D)
-        left_context_val = torch.randn(L, B, D)
-        PE = L + 2 * U - 1
-        pos_emb = torch.randn(PE, D)
-
-        (
-            output_right_context_utterance,
-            output_memory,
-            next_key,
-            next_val,
-        ) = attention.infer(
-            utterance,
-            lengths,
-            right_context,
-            summary,
-            memory,
-            left_context_key,
-            left_context_val,
-            pos_emb,
-        )
-        assert output_right_context_utterance.shape == (R + U, B, D)
-        assert output_memory.shape == (S, B, D)
-        assert next_key.shape == (L + U, B, D)
-        assert next_val.shape == (L + U, B, D)
-
-
-def test_emformer_layer_forward():
-    from emformer import EmformerLayer
-
-    B, D = 2, 256
-    chunk_length = 4
-    right_context_length = 2
-    left_context_length = 2
-    num_chunks = 3
-    U = num_chunks * chunk_length
-    R = num_chunks * right_context_length
-
-    for use_memory in [True, False]:
-        if use_memory:
-            S = num_chunks
-            M = S - 1
-        else:
-            S, M = 0, 0
-
-        layer = EmformerLayer(
-            d_model=D,
-            nhead=8,
-            dim_feedforward=1024,
-            chunk_length=chunk_length,
-            left_context_length=left_context_length,
-            max_memory_size=M,
-        )
-
-        Q, KV = R + U + S, M + R + U
-        utterance = torch.randn(U, B, D)
-        lengths = torch.randint(1, U + 1, (B,))
-        lengths[0] = U
-        right_context = torch.randn(R, B, D)
-        memory = torch.randn(M, B, D)
-        attention_mask = torch.rand(Q, KV) >= 0.5
-        PE = 2 * U - 1
-        pos_emb = torch.randn(PE, D)
-
-        output_utterance, output_right_context, output_memory = layer(
-            utterance, lengths, right_context, memory, attention_mask, pos_emb
-        )
-        assert output_utterance.shape == (U, B, D)
-        assert output_right_context.shape == (R, B, D)
-        assert output_memory.shape == (M, B, D)
-
-
-def test_emformer_layer_infer():
-    from emformer import EmformerLayer
-
-    B, D = 2, 256
-    U = 4
-    R = 2
-    L = 3
-
-    for use_memory in [True, False]:
-        if use_memory:
-            M = 3
-        else:
-            M = 0
-
-        layer = EmformerLayer(
-            d_model=D,
-            nhead=8,
-            dim_feedforward=1024,
-            chunk_length=U,
-            left_context_length=L,
-            max_memory_size=M,
-        )
-
-        utterance = torch.randn(U, B, D)
-        lengths = torch.randint(1, U + 1, (B,))
-        lengths[0] = U
-        right_context = torch.randn(R, B, D)
-        memory = torch.randn(M, B, D)
-        state = None
-        PE = L + 2 * U - 1
-        pos_emb = torch.randn(PE, D)
-        (
-            output_utterance,
-            output_right_context,
-            output_memory,
-            output_state,
-        ) = layer.infer(
-            utterance,
-            lengths,
-            right_context,
-            memory,
-            pos_emb,
-            state,
-        )
-        assert output_utterance.shape == (U, B, D)
-        assert output_right_context.shape == (R, B, D)
-        if use_memory:
-            assert output_memory.shape == (1, B, D)
-        else:
-            assert output_memory.shape == (0, B, D)
-        assert len(output_state) == 4
-        assert output_state[0].shape == (M, B, D)
-        assert output_state[1].shape == (L, B, D)
-        assert output_state[2].shape == (L, B, D)
-        assert output_state[3].shape == (1, B)
-
-
-def test_emformer_encoder_forward():
-    from emformer import EmformerEncoder
-
-    B, D = 2, 256
-    chunk_length = 4
-    right_context_length = 2
-    left_context_length = 2
-    num_chunks = 3
-    U = num_chunks * chunk_length
-
-    for use_memory in [True, False]:
-        if use_memory:
-            S = num_chunks
-            M = S - 1
-        else:
-            S, M = 0, 0
-
-        encoder = EmformerEncoder(
-            chunk_length=chunk_length,
-            d_model=D,
-            dim_feedforward=1024,
-            num_encoder_layers=2,
-            left_context_length=left_context_length,
-            right_context_length=right_context_length,
-            max_memory_size=M,
-        )
-
-        x = torch.randn(U + right_context_length, B, D)
-        lengths = torch.randint(1, U + right_context_length + 1, (B,))
-        lengths[0] = U + right_context_length
-
-        output, output_lengths = encoder(x, lengths)
-        assert output.shape == (U, B, D)
-        assert torch.equal(
-            output_lengths, torch.clamp(lengths - right_context_length, min=0)
-        )
-
-
-def test_emformer_encoder_infer():
-    from emformer import EmformerEncoder
-
-    B, D = 2, 256
-    num_encoder_layers = 2
-    chunk_length = 4
-    right_context_length = 2
-    left_context_length = 2
-    num_chunks = 3
-
-    for use_memory in [True, False]:
-        if use_memory:
-            M = 3
-        else:
-            M = 0
-
-        encoder = EmformerEncoder(
-            chunk_length=chunk_length,
-            d_model=D,
-            dim_feedforward=1024,
-            num_encoder_layers=num_encoder_layers,
-            left_context_length=left_context_length,
-            right_context_length=right_context_length,
-            max_memory_size=M,
-        )
-
-        states = None
-        for chunk_idx in range(num_chunks):
-            x = torch.randn(chunk_length + right_context_length, B, D)
-            lengths = torch.randint(
-                1, chunk_length + right_context_length + 1, (B,)
-            )
-            lengths[0] = chunk_length + right_context_length
-            output, output_lengths, states = encoder.infer(x, lengths, states)
-            assert output.shape == (chunk_length, B, D)
-            assert torch.equal(
-                output_lengths,
-                torch.clamp(lengths - right_context_length, min=0),
-            )
-            assert len(states) == num_encoder_layers
-            for state in states:
-                assert len(state) == 4
-                assert state[0].shape == (M, B, D)
-                assert state[1].shape == (left_context_length, B, D)
-                assert state[2].shape == (left_context_length, B, D)
-                assert torch.equal(
-                    state[3],
-                    (chunk_idx + 1) * chunk_length * torch.ones_like(state[3]),
-                )
-
-
-def test_emformer_forward():
-    from emformer import Emformer
-
-    num_features = 80
-    chunk_length = 16
-    right_context_length = 8
-    left_context_length = 8
-    num_chunks = 3
-    U = num_chunks * chunk_length
-    output_dim = 1000
-    B, D = 2, 256
-    for use_memory in [True, False]:
-        if use_memory:
-            M = 3
-        else:
-            M = 0
-        model = Emformer(
-            num_features=num_features,
-            output_dim=output_dim,
-            chunk_length=chunk_length,
-            subsampling_factor=4,
-            d_model=D,
-            left_context_length=left_context_length,
-            right_context_length=right_context_length,
-            max_memory_size=M,
-            vgg_frontend=False,
-        )
-        x = torch.randn(B, U + right_context_length + 3, num_features)
-        x_lens = torch.randint(1, U + right_context_length + 3 + 1, (B,))
-        x_lens[0] = U + right_context_length + 3
-        logits, output_lengths = model(x, x_lens)
-        assert logits.shape == (B, U // 4, output_dim)
-        assert torch.equal(
-            output_lengths,
-            torch.clamp(
-                ((x_lens - 1) // 2 - 1) // 2 - right_context_length // 4, min=0
-            ),
-        )
-
-
-def test_emformer_infer():
-    from emformer import Emformer
-
-    num_features = 80
-    output_dim = 1000
-    chunk_length = 8
-    U = chunk_length
-    left_context_length, right_context_length = 128, 4
-    B, D = 2, 256
-    num_chunks = 3
-    num_encoder_layers = 2
-    for use_memory in [True, False]:
-        if use_memory:
-            M = 3
-        else:
-            M = 0
-        model = Emformer(
-            num_features=num_features,
-            output_dim=output_dim,
-            chunk_length=chunk_length,
-            subsampling_factor=4,
-            d_model=D,
-            num_encoder_layers=num_encoder_layers,
-            left_context_length=left_context_length,
-            right_context_length=right_context_length,
-            max_memory_size=M,
-            vgg_frontend=False,
-        )
-        states = None
-        for chunk_idx in range(num_chunks):
-            x = torch.randn(B, U + right_context_length + 3, num_features)
-            x_lens = torch.randint(1, U + right_context_length + 3 + 1, (B,))
-            x_lens[0] = U + right_context_length + 3
-            logits, output_lengths, states = model.infer(x, x_lens, states)
-            assert logits.shape == (B, U // 4, output_dim)
-            assert torch.equal(
-                output_lengths,
-                torch.clamp(
-                    ((x_lens - 1) // 2 - 1) // 2 - right_context_length // 4,
-                    min=0,
-                ),
-            )
-            assert len(states) == num_encoder_layers
-            for state in states:
-                assert len(state) == 4
-                assert state[0].shape == (M, B, D)
-                assert state[1].shape == (left_context_length // 4, B, D)
-                assert state[2].shape == (left_context_length // 4, B, D)
-                assert torch.equal(
-                    state[3],
-                    U // 4 * (chunk_idx + 1) * torch.ones_like(state[3]),
-                )
-
-
-def test_emformer_attention_forward_infer_consistency():
-    # TODO: delete
-    from emformer import EmformerEncoder
-
-    chunk_length = 4
-    num_chunks = 3
-    U = chunk_length * num_chunks
-    L, R = 1, 2
-    D = 256
-    num_encoder_layers = 1
-    memory_sizes = [0, 3]
-
-    for M in memory_sizes:
-        encoder = EmformerEncoder(
-            chunk_length=chunk_length,
-            d_model=D,
-            dim_feedforward=1024,
-            num_encoder_layers=num_encoder_layers,
-            left_context_length=L,
-            right_context_length=R,
-            max_memory_size=M,
-            dropout=0.1,
-        )
-        encoder.eval()
-        encoder_layer = encoder.emformer_layers[0]
-
-        x = torch.randn(U + R, 1, D)
-        lengths = torch.tensor([U])
-        right_context = encoder._gen_right_context(x)
-        utterance = x[: x.size(0) - R]
-        attention_mask = encoder._gen_attention_mask(utterance)
-        memory = (
-            encoder.init_memory_op(utterance.permute(1, 2, 0)).permute(2, 0, 1)[
-                :-1
-            ]
-            if encoder.use_memory
-            else torch.empty(0).to(dtype=x.dtype, device=x.device)
-        )
-        (
-            forward_output_right_context_utterance,
-            forward_output_memory,
-        ) = encoder_layer._apply_attention_forward(
-            utterance,
-            lengths,
-            right_context,
-            memory,
-            attention_mask,
-        )
-        forward_output_utterance = forward_output_right_context_utterance[
-            right_context.size(0) :  # noqa
-        ]
-
-        state = None
-        for chunk_idx in range(num_chunks):
-            start_idx = chunk_idx * chunk_length
-            end_idx = start_idx + chunk_length
-            chunk = x[start_idx:end_idx]
-            chunk_right_context = x[end_idx : end_idx + R]  # noqa
-            chunk_length = torch.tensor([chunk_length])
-            chunk_memory = (
-                encoder.init_memory_op(chunk.permute(1, 2, 0)).permute(2, 0, 1)
-                if encoder.use_memory
-                else torch.empty(0).to(dtype=x.dtype, device=x.device)
-            )
-            (
-                infer_output_right_context_utterance,
-                infer_output_memory,
-                state,
-            ) = encoder_layer._apply_attention_infer(
-                chunk,
-                chunk_length,
-                chunk_right_context,
-                chunk_memory,
-                state,
-            )
-            infer_output_chunk = infer_output_right_context_utterance[
-                chunk_right_context.size(0) :  # noqa
-            ]
-            forward_output_chunk = forward_output_utterance[start_idx:end_idx]
-            assert torch.allclose(
-                infer_output_chunk,
-                forward_output_chunk,
-                atol=1e-6,
-                rtol=0.0,
-            )
-
-
-def test_emformer_layer_forward_infer_consistency():
-    from emformer import EmformerEncoder
-
-    chunk_length = 4
-    num_chunks = 3
-    U = chunk_length * num_chunks
-    left_context_length, right_context_length = 1, 2
-    D = 256
-    num_encoder_layers = 1
-    memory_sizes = [0, 3]
-
-    for M in memory_sizes:
-        encoder = EmformerEncoder(
-            chunk_length=chunk_length,
-            d_model=D,
-            dim_feedforward=1024,
-            num_encoder_layers=num_encoder_layers,
-            left_context_length=left_context_length,
-            right_context_length=right_context_length,
-            max_memory_size=M,
-            dropout=0.1,
-        )
-        encoder.eval()
-        encoder_layer = encoder.emformer_layers[0]
-        encoder_pos = encoder.encoder_pos
-
-        x = torch.randn(U + right_context_length, 1, D)
-
-        # training mode with full utterance
-        x_forward, pos_emb = encoder_pos(x, U, U)
-        lengths = torch.tensor([U])
-        right_context = encoder._gen_right_context(x_forward)
-        utterance = x_forward[:U]
-        attention_mask = encoder._gen_attention_mask(utterance)
-        memory = (
-            encoder.init_memory_op(utterance.permute(1, 2, 0)).permute(2, 0, 1)[
-                :-1
-            ]
-            if encoder.use_memory
-            else torch.empty(0).to(dtype=x.dtype, device=x.device)
-        )
-        (
-            forward_output_utterance,
-            forward_output_right_context,
-            forward_output_memory,
-        ) = encoder_layer(
-            utterance,
-            lengths,
-            right_context,
-            memory,
-            attention_mask,
-            pos_emb,
-        )
-
-        state = None
-        for chunk_idx in range(num_chunks):
-            start_idx = chunk_idx * chunk_length
-            end_idx = start_idx + chunk_length
-            cur_x, pos_emb = encoder_pos(
-                x[start_idx : end_idx + right_context_length],
-                pos_len=chunk_length + left_context_length,
-                neg_len=chunk_length,
-            )
-            chunk = cur_x[:chunk_length]
-            chunk_right_context = cur_x[chunk_length:]
-            chunk_memory = (
-                encoder.init_memory_op(chunk.permute(1, 2, 0)).permute(2, 0, 1)
-                if encoder.use_memory
-                else torch.empty(0).to(dtype=x.dtype, device=x.device)
-            )
-            (
-                infer_output_chunk,
-                infer_right_context,
-                infer_output_memory,
-                state,
-            ) = encoder_layer.infer(
-                chunk,
-                torch.tensor([chunk_length]),
-                chunk_right_context,
-                chunk_memory,
-                pos_emb,
-                state,
-            )
-            forward_output_chunk = forward_output_utterance[start_idx:end_idx]
-            assert torch.allclose(
-                infer_output_chunk,
-                forward_output_chunk,
-                atol=1e-5,
-                rtol=0.0,
-            )
-
-
-def test_emformer_encoder_forward_infer_consistency():
-    from emformer import EmformerEncoder
-
-    chunk_length = 4
-    num_chunks = 3
-    U = chunk_length * num_chunks
-    left_context_length, right_context_length = 1, 2
-    D = 256
-    num_encoder_layers = 3
-    memory_sizes = [0, 3]
-
-    for M in memory_sizes:
-        encoder = EmformerEncoder(
-            chunk_length=chunk_length,
-            d_model=D,
-            dim_feedforward=1024,
-            num_encoder_layers=num_encoder_layers,
-            left_context_length=left_context_length,
-            right_context_length=right_context_length,
-            max_memory_size=M,
-            dropout=0.1,
-        )
-        encoder.eval()
-
-        x = torch.randn(U + right_context_length, 1, D)
-        lengths = torch.tensor([U + right_context_length])
-
-        # training mode with full utterance
-        forward_output, forward_output_lengths = encoder(x, lengths)
-
-        # streaming inference mode with individual chunks
-        states = None
-        for chunk_idx in range(num_chunks):
-            start_idx = chunk_idx * chunk_length
-            end_idx = start_idx + chunk_length
-            chunk = x[start_idx : end_idx + right_context_length]  # noqa
-            chunk_length = torch.tensor([chunk_length])
-            infer_output_chunk, infer_output_lengths, states = encoder.infer(
-                chunk, chunk_length, states
-            )
-            forward_output_chunk = forward_output[start_idx:end_idx]
-            assert torch.allclose(
-                infer_output_chunk,
-                forward_output_chunk,
-                atol=1e-5,
-                rtol=0.0,
-            )
-
-
-def test_emformer_infer_batch_single_consistency():
-    """Test consistency of cached states and output logits between single
-    utterance inference and batch inference."""
-    from emformer import Emformer
-
-    num_features = 80
-    output_dim = 1000
-    chunk_length = 8
-    num_chunks = 3
-    U = num_chunks * chunk_length
-    left_context_length, right_context_length = 128, 4
-    B, D = 2, 256
-    num_encoder_layers = 2
-    for use_memory in [True, False]:
-        if use_memory:
-            M = 3
-        else:
-            M = 0
-        model = Emformer(
-            num_features=num_features,
-            output_dim=output_dim,
-            chunk_length=chunk_length,
-            subsampling_factor=4,
-            d_model=D,
-            num_encoder_layers=num_encoder_layers,
-            left_context_length=left_context_length,
-            right_context_length=right_context_length,
-            max_memory_size=M,
-            vgg_frontend=False,
-        )
-        model.eval()
-
-        def save_states(states):
-            saved_states = []
-            for layer_idx in range(len(states)):
-                layer_state = []
-                layer_state.append(states[layer_idx][0].clone())  # memory
-                layer_state.append(
-                    states[layer_idx][1].clone()
-                )  # left_context_key
-                layer_state.append(
-                    states[layer_idx][2].clone()
-                )  # left_context_val
-                layer_state.append(states[layer_idx][3].clone())  # past_length
-                saved_states.append(layer_state)
-            return saved_states
-
-        def assert_states_equal(saved_states, states, sample_idx):
-            for layer_idx in range(len(saved_states)):
-                # assert eqaul memory
-                assert torch.allclose(
-                    states[layer_idx][0],
-                    saved_states[layer_idx][0][
-                        :, sample_idx : sample_idx + 1  # noqa
-                    ],
-                    atol=1e-5,
-                    rtol=0.0,
-                )
-                # assert equal left_context_key
-                assert torch.allclose(
-                    states[layer_idx][1],
-                    saved_states[layer_idx][1][
-                        :, sample_idx : sample_idx + 1  # noqa
-                    ],
-                    atol=1e-5,
-                    rtol=0.0,
-                )
-                # assert equal left_context_val
-                assert torch.allclose(
-                    states[layer_idx][2],
-                    saved_states[layer_idx][2][
-                        :, sample_idx : sample_idx + 1  # noqa
-                    ],
-                    atol=1e-5,
-                    rtol=0.0,
-                )
-                # assert eqaul past_length
-                assert torch.equal(
-                    states[layer_idx][3],
-                    saved_states[layer_idx][3][
-                        :, sample_idx : sample_idx + 1  # noqa
-                    ],
-                )
-
-        x = torch.randn(B, U + right_context_length + 3, num_features)
-
-        # batch-wise inference
-        batch_logits = []
-        batch_states = []
-        states = None
-        for chunk_idx in range(num_chunks):
-            start_idx = chunk_idx * chunk_length
-            end_idx = start_idx + chunk_length
-            chunk = x[:, start_idx : end_idx + right_context_length + 3]  # noqa
-            lengths = torch.tensor(
-                [chunk_length + right_context_length + 3]
-            ).expand(B)
-            logits, output_lengths, states = model.infer(chunk, lengths, states)
-            batch_logits.append(logits)
-            batch_states.append(save_states(states))
-        batch_logits = torch.cat(batch_logits, dim=1)
-
-        # single-wise inference
-        single_logits = []
-        for sample_idx in range(B):
-            sample = x[sample_idx : sample_idx + 1]  # noqa
-            chunk_logits = []
-            states = None
-            for chunk_idx in range(num_chunks):
-                start_idx = chunk_idx * chunk_length
-                end_idx = start_idx + chunk_length
-                chunk = sample[
-                    :, start_idx : end_idx + right_context_length + 3
-                ]
-                lengths = torch.tensor(
-                    [chunk_length + right_context_length + 3]
-                )
-                logits, output_lengths, states = model.infer(
-                    chunk, lengths, states
-                )
-                chunk_logits.append(logits)
-                assert_states_equal(batch_states[chunk_idx], states, sample_idx)
-
-            chunk_logits = torch.cat(chunk_logits, dim=1)
-            single_logits.append(chunk_logits)
-
-        single_logits = torch.cat(single_logits, dim=0)
-
-        assert torch.allclose(batch_logits, single_logits, atol=1e-5, rtol=0.0)
-
-
-def test_emformer_infer_states_stack():
-    from emformer import Emformer, unstack_states, stack_states
-
-    num_features = 80
-    output_dim = 1000
-    chunk_length = 8
-    U = chunk_length
-    left_context_length, right_context_length = 128, 4
-    B, D = 2, 256
-    num_encoder_layers = 2
-    for use_memory in [True, False]:
-        if use_memory:
-            M = 3
-        else:
-            M = 0
-        model = Emformer(
-            num_features=num_features,
-            output_dim=output_dim,
-            chunk_length=chunk_length,
-            subsampling_factor=4,
-            d_model=D,
-            num_encoder_layers=num_encoder_layers,
-            left_context_length=left_context_length,
-            right_context_length=right_context_length,
-            max_memory_size=M,
-            vgg_frontend=False,
-        )
-
-        x = torch.randn(B, U + right_context_length + 3, num_features)
-        x_lens = torch.full((B,), U + right_context_length + 3)
-        logits, output_lengths, states = model.infer(
-            x,
-            x_lens,
-        )
-        states2 = stack_states(unstack_states(states))
-
-        for ss, ss2 in zip(states, states2):
-            for s, s2 in zip(ss, ss2):
-                assert torch.allclose(s, s2), f"{s.sum()}, {s2.sum()}"
-
-
-def test_rel_positional_encoding():
-    from emformer import RelPositionalEncoding
-
-    D = 256
-    pos_enc = RelPositionalEncoding(D, dropout_rate=0.1)
-    pos_len = 100
-    neg_len = 100
-    x = torch.randn(2, D)
-    x, pos_emb = pos_enc(x, pos_len, neg_len)
-    assert pos_emb.shape == (pos_len + neg_len - 1, D)
-
-
-if __name__ == "__main__":
-    test_emformer_attention_forward()
-    test_emformer_attention_infer()
-    test_emformer_layer_forward()
-    test_emformer_layer_infer()
-    test_emformer_encoder_forward()
-    test_emformer_encoder_infer()
-    test_emformer_forward()
-    test_emformer_infer()
-    # test_emformer_attention_forward_infer_consistency()
-    test_emformer_layer_forward_infer_consistency()
-    test_emformer_encoder_forward_infer_consistency()
-    test_emformer_infer_batch_single_consistency()
-    test_emformer_infer_states_stack()
-    test_rel_positional_encoding()
diff --git a/egs/librispeech/ASR/emformer_pruned_transducer_stateless/train.py b/egs/librispeech/ASR/emformer_pruned_transducer_stateless/train.py
deleted file mode 100755
index 18a845a93..000000000
--- a/egs/librispeech/ASR/emformer_pruned_transducer_stateless/train.py
+++ /dev/null
@@ -1,1008 +0,0 @@
-#!/usr/bin/env python3
-# Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang,
-#                                                  Wei Kang
-#                                                  Mingshuang Luo)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Usage:
-
-export CUDA_VISIBLE_DEVICES="0,1,2,3"
-
-./transducer_emformer/train.py \
-  --world-size 4 \
-  --num-epochs 30 \
-  --start-epoch 0 \
-  --exp-dir transducer_emformer/exp \
-  --full-libri 1 \
-  --max-duration 300
-"""
-
-
-import argparse
-import logging
-import warnings
-from pathlib import Path
-from shutil import copyfile
-from typing import Any, Dict, Optional, Tuple
-
-import k2
-import sentencepiece as spm
-import torch
-import torch.multiprocessing as mp
-import torch.nn as nn
-from asr_datamodule import LibriSpeechAsrDataModule
-from decoder import Decoder
-from emformer import Emformer
-from joiner import Joiner
-from lhotse.cut import Cut
-from lhotse.dataset.sampling.base import CutSampler
-from lhotse.utils import fix_random_seed
-from model import Transducer
-from noam import Noam
-from torch import Tensor
-from torch.nn.parallel import DistributedDataParallel as DDP
-from torch.nn.utils import clip_grad_norm_
-from torch.utils.tensorboard import SummaryWriter
-
-from icefall.checkpoint import load_checkpoint, remove_checkpoints
-from icefall.checkpoint import save_checkpoint as save_checkpoint_impl
-from icefall.checkpoint import save_checkpoint_with_global_batch_idx
-from icefall.dist import cleanup_dist, setup_dist
-from icefall.env import get_env_info
-from icefall.utils import (
-    AttributeDict,
-    MetricsTracker,
-    measure_gradient_norms,
-    measure_weight_norms,
-    optim_step_and_measure_param_change,
-    setup_logger,
-    str2bool,
-)
-
-
-def add_model_arguments(parser: argparse.ArgumentParser):
-    parser.add_argument(
-        "--attention-dim",
-        type=int,
-        default=512,
-        help="Attention dim for the Emformer",
-    )
-
-    parser.add_argument(
-        "--nhead",
-        type=int,
-        default=8,
-        help="Number of attention heads for the Emformer",
-    )
-
-    parser.add_argument(
-        "--dim-feedforward",
-        type=int,
-        default=2048,
-        help="Feed-forward dimension for the Emformer",
-    )
-
-    parser.add_argument(
-        "--num-encoder-layers",
-        type=int,
-        default=12,
-        help="Number of encoder layers for the Emformer",
-    )
-
-    parser.add_argument(
-        "--left-context-length",
-        type=int,
-        default=120,
-        help="Number of frames for the left context in the Emformer",
-    )
-
-    parser.add_argument(
-        "--chunk-length",
-        type=int,
-        default=16,
-        help="Number of frames for each segment in the Emformer",
-    )
-
-    parser.add_argument(
-        "--right-context-length",
-        type=int,
-        default=4,
-        help="Number of frames for right context in the Emformer",
-    )
-
-    parser.add_argument(
-        "--memory-size",
-        type=int,
-        default=0,
-        help="Number of entries in the memory for the Emformer",
-    )
-
-    parser.add_argument(
-        "--tanh-on-mem",
-        type=str2bool,
-        default=False,
-        help="Whether to apply tanh on memory",
-    )
-
-
-def get_parser():
-    parser = argparse.ArgumentParser(
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter
-    )
-
-    parser.add_argument(
-        "--world-size",
-        type=int,
-        default=1,
-        help="Number of GPUs for DDP training.",
-    )
-
-    parser.add_argument(
-        "--master-port",
-        type=int,
-        default=12354,
-        help="Master port to use for DDP training.",
-    )
-
-    parser.add_argument(
-        "--tensorboard",
-        type=str2bool,
-        default=True,
-        help="Should various information be logged in tensorboard.",
-    )
-
-    parser.add_argument(
-        "--num-epochs",
-        type=int,
-        default=30,
-        help="Number of epochs to train.",
-    )
-
-    parser.add_argument(
-        "--start-epoch",
-        type=int,
-        default=0,
-        help="""Resume training from from this epoch.
-        If it is positive, it will load checkpoint from
-        transducer_emformer/exp/epoch-{start_epoch-1}.pt
-        """,
-    )
-
-    parser.add_argument(
-        "--start-batch",
-        type=int,
-        default=0,
-        help="""If positive, --start-epoch is ignored and
-        it loads the checkpoint from exp-dir/checkpoint-{start_batch}.pt
-        """,
-    )
-
-    parser.add_argument(
-        "--exp-dir",
-        type=str,
-        default="transducer_emformer/exp",
-        help="""The experiment dir.
-        It specifies the directory where all training related
-        files, e.g., checkpoints, log, etc, are saved
-        """,
-    )
-
-    parser.add_argument(
-        "--bpe-model",
-        type=str,
-        default="data/lang_bpe_500/bpe.model",
-        help="Path to the BPE model",
-    )
-
-    parser.add_argument(
-        "--lr-factor",
-        type=float,
-        default=5.0,
-        help="The lr_factor for Noam optimizer",
-    )
-
-    parser.add_argument(
-        "--context-size",
-        type=int,
-        default=2,
-        help="The context size in the decoder. 1 means bigram; "
-        "2 means tri-gram",
-    )
-
-    parser.add_argument(
-        "--prune-range",
-        type=int,
-        default=5,
-        help="The prune range for rnnt loss, it means how many symbols(context)"
-        "we are using to compute the loss",
-    )
-
-    parser.add_argument(
-        "--lm-scale",
-        type=float,
-        default=0.25,
-        help="The scale to smooth the loss with lm "
-        "(output of prediction network) part.",
-    )
-
-    parser.add_argument(
-        "--am-scale",
-        type=float,
-        default=0.0,
-        help="The scale to smooth the loss with am (output of encoder network)"
-        "part.",
-    )
-
-    parser.add_argument(
-        "--simple-loss-scale",
-        type=float,
-        default=0.5,
-        help="To get pruning ranges, we will calculate a simple version"
-        "loss(joiner is just addition), this simple loss also uses for"
-        "training (as a regularization item). We will scale the simple loss"
-        "with this parameter before adding to the final loss.",
-    )
-
-    parser.add_argument(
-        "--seed",
-        type=int,
-        default=42,
-        help="The seed for random generators intended for reproducibility",
-    )
-
-    parser.add_argument(
-        "--save-every-n",
-        type=int,
-        default=8000,
-        help="""Save checkpoint after processing this number of batches"
-        periodically. We save checkpoint to exp-dir/ whenever
-        params.batch_idx_train % save_every_n == 0. The checkpoint filename
-        has the form: f'exp-dir/checkpoint-{params.batch_idx_train}.pt'
-        Note: It also saves checkpoint to `exp-dir/epoch-xxx.pt` at the
-        end of each epoch where `xxx` is the epoch number counting from 0.
-        """,
-    )
-
-    parser.add_argument(
-        "--keep-last-k",
-        type=int,
-        default=20,
-        help="""Only keep this number of checkpoints on disk.
-        For instance, if it is 3, there are only 3 checkpoints
-        in the exp-dir with filenames `checkpoint-xxx.pt`.
-        It does not affect checkpoints with name `epoch-xxx.pt`.
-        """,
-    )
-
-    add_model_arguments(parser)
-
-    return parser
-
-
-def get_params() -> AttributeDict:
-    """Return a dict containing training parameters.
-
-    All training related parameters that are not passed from the commandline
-    are saved in the variable `params`.
-
-    Commandline options are merged into `params` after they are parsed, so
-    you can also access them via `params`.
-
-    Explanation of options saved in `params`:
-
-        - best_train_loss: Best training loss so far. It is used to select
-                           the model that has the lowest training loss. It is
-                           updated during the training.
-
-        - best_valid_loss: Best validation loss so far. It is used to select
-                           the model that has the lowest validation loss. It is
-                           updated during the training.
-
-        - best_train_epoch: It is the epoch that has the best training loss.
-
-        - best_valid_epoch: It is the epoch that has the best validation loss.
-
-        - batch_idx_train: Used to writing statistics to tensorboard. It
-                           contains number of batches trained so far across
-                           epochs.
-
-        - log_interval:  Print training loss if batch_idx % log_interval` is 0
-
-        - reset_interval: Reset statistics if batch_idx % reset_interval is 0
-
-        - valid_interval:  Run validation if batch_idx % valid_interval is 0
-
-        - feature_dim: The model input dim. It has to match the one used
-                       in computing features.
-
-        - subsampling_factor:  The subsampling factor for the model.
-
-        - attention_dim: Hidden dim for multi-head attention model.
-
-        - num_decoder_layers: Number of decoder layer of transformer decoder.
-
-        - warm_step: The warm_step for Noam optimizer.
-    """
-    params = AttributeDict(
-        {
-            "best_train_loss": float("inf"),
-            "best_valid_loss": float("inf"),
-            "best_train_epoch": -1,
-            "best_valid_epoch": -1,
-            "batch_idx_train": 0,
-            "log_interval": 50,
-            "reset_interval": 200,
-            "valid_interval": 3000,  # For the 100h subset, use 800
-            "log_diagnostics": False,
-            # parameters for Emformer
-            "feature_dim": 80,
-            "subsampling_factor": 4,
-            "vgg_frontend": False,
-            # parameters for decoder
-            "embedding_dim": 512,
-            # parameters for Noam
-            "warm_step": 80000,  # For the 100h subset, use 20000
-            "env_info": get_env_info(),
-        }
-    )
-
-    return params
-
-
-def get_encoder_model(params: AttributeDict) -> nn.Module:
-    encoder = Emformer(
-        num_features=params.feature_dim,
-        output_dim=params.vocab_size,
-        subsampling_factor=params.subsampling_factor,
-        d_model=params.attention_dim,
-        nhead=params.nhead,
-        dim_feedforward=params.dim_feedforward,
-        num_encoder_layers=params.num_encoder_layers,
-        vgg_frontend=params.vgg_frontend,
-        left_context_length=params.left_context_length,
-        chunk_length=params.chunk_length,
-        right_context_length=params.right_context_length,
-        max_memory_size=params.memory_size,
-        tanh_on_mem=params.tanh_on_mem,
-    )
-    return encoder
-
-
-def get_decoder_model(params: AttributeDict) -> nn.Module:
-    decoder = Decoder(
-        vocab_size=params.vocab_size,
-        embedding_dim=params.embedding_dim,
-        blank_id=params.blank_id,
-        unk_id=params.unk_id,
-        context_size=params.context_size,
-    )
-    return decoder
-
-
-def get_joiner_model(params: AttributeDict) -> nn.Module:
-    joiner = Joiner(
-        input_dim=params.vocab_size,
-        inner_dim=params.embedding_dim,
-        output_dim=params.vocab_size,
-    )
-    return joiner
-
-
-def get_transducer_model(params: AttributeDict) -> nn.Module:
-    encoder = get_encoder_model(params)
-    decoder = get_decoder_model(params)
-    joiner = get_joiner_model(params)
-
-    model = Transducer(
-        encoder=encoder,
-        decoder=decoder,
-        joiner=joiner,
-    )
-    return model
-
-
-def load_checkpoint_if_available(
-    params: AttributeDict,
-    model: nn.Module,
-    optimizer: Optional[torch.optim.Optimizer] = None,
-) -> Optional[Dict[str, Any]]:
-    """Load checkpoint from file.
-
-    If params.start_batch is positive, it will load the checkpoint from
-    `params.exp_dir/checkpoint-{params.start_batch}.pt`. Otherwise, if
-    params.start_epoch is positive, it will load the checkpoint from
-    `params.start_epoch - 1`.
-
-    Apart from loading state dict for `model` and `optimizer` it also updates
-    `best_train_epoch`, `best_train_loss`, `best_valid_epoch`,
-    and `best_valid_loss` in `params`.
-
-    Args:
-      params:
-        The return value of :func:`get_params`.
-      model:
-        The training model.
-      optimizer:
-        The optimizer that we are using.
-    Returns:
-      Return a dict containing previously saved training info.
-    """
-    if params.start_batch > 0:
-        filename = params.exp_dir / f"checkpoint-{params.start_batch}.pt"
-    elif params.start_epoch > 0:
-        filename = params.exp_dir / f"epoch-{params.start_epoch-1}.pt"
-    else:
-        return None
-
-    assert filename.is_file(), f"{filename} does not exist!"
-
-    saved_params = load_checkpoint(
-        filename,
-        model=model,
-        optimizer=optimizer,
-    )
-
-    keys = [
-        "best_train_epoch",
-        "best_valid_epoch",
-        "batch_idx_train",
-        "best_train_loss",
-        "best_valid_loss",
-    ]
-    for k in keys:
-        params[k] = saved_params[k]
-
-    if params.start_batch > 0:
-        if "cur_epoch" in saved_params:
-            params["start_epoch"] = saved_params["cur_epoch"]
-
-        if "cur_batch_idx" in saved_params:
-            params["cur_batch_idx"] = saved_params["cur_batch_idx"]
-
-    return saved_params
-
-
-def save_checkpoint(
-    params: AttributeDict,
-    model: nn.Module,
-    optimizer: Optional[torch.optim.Optimizer] = None,
-    sampler: Optional[CutSampler] = None,
-    rank: int = 0,
-) -> None:
-    """Save model, optimizer, scheduler and training stats to file.
-
-    Args:
-      params:
-        It is returned by :func:`get_params`.
-      model:
-        The training model.
-      optimizer:
-        The optimizer used in the training.
-      sampler:
-       The sampler for the training dataset.
-    """
-    if rank != 0:
-        return
-    filename = params.exp_dir / f"epoch-{params.cur_epoch}.pt"
-    save_checkpoint_impl(
-        filename=filename,
-        model=model,
-        params=params,
-        optimizer=optimizer,
-        sampler=sampler,
-        rank=rank,
-    )
-
-    if params.best_train_epoch == params.cur_epoch:
-        best_train_filename = params.exp_dir / "best-train-loss.pt"
-        copyfile(src=filename, dst=best_train_filename)
-
-    if params.best_valid_epoch == params.cur_epoch:
-        best_valid_filename = params.exp_dir / "best-valid-loss.pt"
-        copyfile(src=filename, dst=best_valid_filename)
-
-
-def compute_loss(
-    params: AttributeDict,
-    model: nn.Module,
-    sp: spm.SentencePieceProcessor,
-    batch: dict,
-    is_training: bool,
-) -> Tuple[Tensor, MetricsTracker]:
-    """
-    Compute CTC loss given the model and its inputs.
-
-    Args:
-      params:
-        Parameters for training. See :func:`get_params`.
-      model:
-        The model for training. It is an instance of Emformer in our case.
-      batch:
-        A batch of data. See `lhotse.dataset.K2SpeechRecognitionDataset()`
-        for the content in it.
-      is_training:
-        True for training. False for validation. When it is True, this
-        function enables autograd during computation; when it is False, it
-        disables autograd.
-    """
-    device = model.device
-    feature = batch["inputs"]
-    # at entry, feature is (N, T, C)
-    assert feature.ndim == 3
-    feature = feature.to(device)
-
-    supervisions = batch["supervisions"]
-    feature_lens = supervisions["num_frames"].to(device)
-
-    texts = batch["supervisions"]["text"]
-    y = sp.encode(texts, out_type=int)
-    y = k2.RaggedTensor(y).to(device)
-
-    with torch.set_grad_enabled(is_training):
-        simple_loss, pruned_loss = model(
-            x=feature,
-            x_lens=feature_lens,
-            y=y,
-            prune_range=params.prune_range,
-            am_scale=params.am_scale,
-            lm_scale=params.lm_scale,
-        )
-        loss = params.simple_loss_scale * simple_loss + pruned_loss
-
-    assert loss.requires_grad == is_training
-
-    info = MetricsTracker()
-    with warnings.catch_warnings():
-        warnings.simplefilter("ignore")
-        info["frames"] = (
-            (feature_lens // params.subsampling_factor).sum().item()
-        )
-
-    # Note: We use reduction=sum while computing the loss.
-    info["loss"] = loss.detach().cpu().item()
-    info["simple_loss"] = simple_loss.detach().cpu().item()
-    info["pruned_loss"] = pruned_loss.detach().cpu().item()
-
-    return loss, info
-
-
-def compute_validation_loss(
-    params: AttributeDict,
-    model: nn.Module,
-    sp: spm.SentencePieceProcessor,
-    valid_dl: torch.utils.data.DataLoader,
-    world_size: int = 1,
-) -> MetricsTracker:
-    """Run the validation process."""
-    model.eval()
-
-    tot_loss = MetricsTracker()
-
-    for batch_idx, batch in enumerate(valid_dl):
-        loss, loss_info = compute_loss(
-            params=params,
-            model=model,
-            sp=sp,
-            batch=batch,
-            is_training=False,
-        )
-        assert loss.requires_grad is False
-        tot_loss = tot_loss + loss_info
-
-    if world_size > 1:
-        tot_loss.reduce(loss.device)
-
-    loss_value = tot_loss["loss"] / tot_loss["frames"]
-    if loss_value < params.best_valid_loss:
-        params.best_valid_epoch = params.cur_epoch
-        params.best_valid_loss = loss_value
-
-    return tot_loss
-
-
-def train_one_epoch(
-    params: AttributeDict,
-    model: nn.Module,
-    optimizer: torch.optim.Optimizer,
-    sp: spm.SentencePieceProcessor,
-    train_dl: torch.utils.data.DataLoader,
-    valid_dl: torch.utils.data.DataLoader,
-    tb_writer: Optional[SummaryWriter] = None,
-    world_size: int = 1,
-    rank: int = 0,
-) -> None:
-    """Train the model for one epoch.
-
-    The training loss from the mean of all frames is saved in
-    `params.train_loss`. It runs the validation process every
-    `params.valid_interval` batches.
-
-    Args:
-      params:
-        It is returned by :func:`get_params`.
-      model:
-        The model for training.
-      optimizer:
-        The optimizer we are using.
-      train_dl:
-        Dataloader for the training dataset.
-      valid_dl:
-        Dataloader for the validation dataset.
-      tb_writer:
-        Writer to write log messages to tensorboard.
-      world_size:
-        Number of nodes in DDP training. If it is 1, DDP is disabled.
-      rank:
-        The rank of the node in DDP training. If no DDP is used, it should
-        be set to 0.
-    """
-    model.train()
-
-    tot_loss = MetricsTracker()
-
-    def maybe_log_gradients(tag: str):
-        if (
-            params.log_diagnostics
-            and tb_writer is not None
-            and params.batch_idx_train % (params.log_interval * 5) == 0
-        ):
-            tb_writer.add_scalars(
-                tag,
-                measure_gradient_norms(model, norm="l2"),
-                global_step=params.batch_idx_train,
-            )
-
-    def maybe_log_weights(tag: str):
-        if (
-            params.log_diagnostics
-            and tb_writer is not None
-            and params.batch_idx_train % (params.log_interval * 5) == 0
-        ):
-            tb_writer.add_scalars(
-                tag,
-                measure_weight_norms(model, norm="l2"),
-                global_step=params.batch_idx_train,
-            )
-
-    def maybe_log_param_relative_changes():
-        if (
-            params.log_diagnostics
-            and tb_writer is not None
-            and params.batch_idx_train % (params.log_interval * 5) == 0
-        ):
-            deltas = optim_step_and_measure_param_change(model, optimizer)
-            tb_writer.add_scalars(
-                "train/relative_param_change_per_minibatch",
-                deltas,
-                global_step=params.batch_idx_train,
-            )
-        else:
-            optimizer.step()
-
-    cur_batch_idx = params.get("cur_batch_idx", 0)
-
-    for batch_idx, batch in enumerate(train_dl):
-        if batch_idx < cur_batch_idx:
-            continue
-        cur_batch_idx = batch_idx
-
-        params.batch_idx_train += 1
-        batch_size = len(batch["supervisions"]["text"])
-
-        loss, loss_info = compute_loss(
-            params=params,
-            model=model,
-            sp=sp,
-            batch=batch,
-            is_training=True,
-        )
-        # summary stats
-        tot_loss = (tot_loss * (1 - 1 / params.reset_interval)) + loss_info
-
-        # NOTE: We use reduction==sum and loss is computed over utterances
-        # in the batch and there is no normalization to it so far.
-
-        loss.backward()
-
-        maybe_log_weights("train/param_norms")
-        maybe_log_gradients("train/grad_norms")
-        maybe_log_param_relative_changes()
-
-        optimizer.zero_grad()
-
-        if (
-            params.batch_idx_train > 0
-            and params.batch_idx_train % params.save_every_n == 0
-        ):
-            params.cur_batch_idx = batch_idx
-            save_checkpoint_with_global_batch_idx(
-                out_dir=params.exp_dir,
-                global_batch_idx=params.batch_idx_train,
-                model=model,
-                params=params,
-                optimizer=optimizer,
-                sampler=train_dl.sampler,
-                rank=rank,
-            )
-            del params.cur_batch_idx
-            remove_checkpoints(
-                out_dir=params.exp_dir,
-                topk=params.keep_last_k,
-                rank=rank,
-            )
-
-        if batch_idx % params.log_interval == 0:
-            logging.info(
-                f"Epoch {params.cur_epoch}, "
-                f"batch {batch_idx}, loss[{loss_info}], "
-                f"tot_loss[{tot_loss}], batch size: {batch_size}"
-            )
-
-            if tb_writer is not None:
-                loss_info.write_summary(
-                    tb_writer, "train/current_", params.batch_idx_train
-                )
-                tot_loss.write_summary(
-                    tb_writer, "train/tot_", params.batch_idx_train
-                )
-
-        if batch_idx > 0 and batch_idx % params.valid_interval == 0:
-            logging.info("Computing validation loss")
-            valid_info = compute_validation_loss(
-                params=params,
-                model=model,
-                sp=sp,
-                valid_dl=valid_dl,
-                world_size=world_size,
-            )
-            model.train()
-            logging.info(f"Epoch {params.cur_epoch}, validation: {valid_info}")
-            if tb_writer is not None:
-                valid_info.write_summary(
-                    tb_writer, "train/valid_", params.batch_idx_train
-                )
-
-    loss_value = tot_loss["loss"] / tot_loss["frames"]
-    params.train_loss = loss_value
-    if params.train_loss < params.best_train_loss:
-        params.best_train_epoch = params.cur_epoch
-        params.best_train_loss = params.train_loss
-
-
-def run(rank, world_size, args):
-    """
-    Args:
-      rank:
-        It is a value between 0 and `world_size-1`, which is
-        passed automatically by `mp.spawn()` in :func:`main`.
-        The node with rank 0 is responsible for saving checkpoint.
-      world_size:
-        Number of GPUs for DDP training.
-      args:
-        The return value of get_parser().parse_args()
-    """
-    params = get_params()
-    params.update(vars(args))
-    if params.full_libri is False:
-        params.valid_interval = 800
-        params.warm_step = 20000
-
-    fix_random_seed(params.seed)
-    if world_size > 1:
-        setup_dist(rank, world_size, params.master_port)
-
-    setup_logger(f"{params.exp_dir}/log/log-train")
-    logging.info("Training started")
-
-    if args.tensorboard and rank == 0:
-        tb_writer = SummaryWriter(log_dir=f"{params.exp_dir}/tensorboard")
-    else:
-        tb_writer = None
-
-    device = torch.device("cpu")
-    if torch.cuda.is_available():
-        device = torch.device("cuda", rank)
-    logging.info(f"Device: {device}")
-
-    sp = spm.SentencePieceProcessor()
-    sp.load(params.bpe_model)
-
-    # <blk> is defined in local/train_bpe_model.py
-    params.blank_id = sp.piece_to_id("<blk>")
-    params.unk_id = sp.piece_to_id("<unk>")
-    params.vocab_size = sp.get_piece_size()
-
-    logging.info(params)
-
-    logging.info("About to create model")
-    model = get_transducer_model(params)
-
-    num_param = sum([p.numel() for p in model.parameters()])
-    logging.info(f"Number of model parameters: {num_param}")
-
-    checkpoints = load_checkpoint_if_available(params=params, model=model)
-
-    model.to(device)
-    if world_size > 1:
-        logging.info("Using DDP")
-        model = DDP(model, device_ids=[rank])
-    model.device = device
-
-    optimizer = Noam(
-        model.parameters(),
-        model_size=params.attention_dim,
-        factor=params.lr_factor,
-        warm_step=params.warm_step,
-    )
-
-    if checkpoints and "optimizer" in checkpoints:
-        logging.info("Loading optimizer state dict")
-        optimizer.load_state_dict(checkpoints["optimizer"])
-
-    librispeech = LibriSpeechAsrDataModule(args)
-
-    train_cuts = librispeech.train_clean_100_cuts()
-    if params.full_libri:
-        train_cuts += librispeech.train_clean_360_cuts()
-        train_cuts += librispeech.train_other_500_cuts()
-
-    def remove_short_and_long_utt(c: Cut):
-        # Keep only utterances with duration between 1 second and 20 seconds
-        #
-        # Caution: There is a reason to select 20.0 here. Please see
-        # ../local/display_manifest_statistics.py
-        #
-        # You should use ../local/display_manifest_statistics.py to get
-        # an utterance duration distribution for your dataset to select
-        # the threshold
-        return 1.0 <= c.duration <= 20.0
-
-    num_in_total = len(train_cuts)
-
-    train_cuts = train_cuts.filter(remove_short_and_long_utt)
-
-    num_left = len(train_cuts)
-    num_removed = num_in_total - num_left
-    removed_percent = num_removed / num_in_total * 100
-
-    logging.info(f"Before removing short and long utterances: {num_in_total}")
-    logging.info(f"After removing short and long utterances: {num_left}")
-    logging.info(f"Removed {num_removed} utterances ({removed_percent:.5f}%)")
-
-    if params.start_batch > 0 and checkpoints and "sampler" in checkpoints:
-        # We only load the sampler's state dict when it loads a checkpoint
-        # saved in the middle of an epoch
-        sampler_state_dict = checkpoints["sampler"]
-    else:
-        sampler_state_dict = None
-
-    train_dl = librispeech.train_dataloaders(
-        train_cuts, sampler_state_dict=sampler_state_dict
-    )
-
-    valid_cuts = librispeech.dev_clean_cuts()
-    valid_cuts += librispeech.dev_other_cuts()
-    valid_dl = librispeech.valid_dataloaders(valid_cuts)
-
-    scan_pessimistic_batches_for_oom(
-        model=model,
-        train_dl=train_dl,
-        optimizer=optimizer,
-        sp=sp,
-        params=params,
-    )
-
-    for epoch in range(params.start_epoch, params.num_epochs):
-        fix_random_seed(params.seed + epoch)
-        train_dl.sampler.set_epoch(epoch)
-
-        cur_lr = optimizer._rate
-        if tb_writer is not None:
-            tb_writer.add_scalar(
-                "train/learning_rate", cur_lr, params.batch_idx_train
-            )
-            tb_writer.add_scalar("train/epoch", epoch, params.batch_idx_train)
-
-        if rank == 0:
-            logging.info("epoch {}, learning rate {}".format(epoch, cur_lr))
-
-        params.cur_epoch = epoch
-
-        train_one_epoch(
-            params=params,
-            model=model,
-            optimizer=optimizer,
-            sp=sp,
-            train_dl=train_dl,
-            valid_dl=valid_dl,
-            tb_writer=tb_writer,
-            world_size=world_size,
-            rank=rank,
-        )
-
-        save_checkpoint(
-            params=params,
-            model=model,
-            optimizer=optimizer,
-            sampler=train_dl.sampler,
-            rank=rank,
-        )
-
-    logging.info("Done!")
-
-    if world_size > 1:
-        torch.distributed.barrier()
-        cleanup_dist()
-
-
-def scan_pessimistic_batches_for_oom(
-    model: nn.Module,
-    train_dl: torch.utils.data.DataLoader,
-    optimizer: torch.optim.Optimizer,
-    sp: spm.SentencePieceProcessor,
-    params: AttributeDict,
-):
-    from lhotse.dataset import find_pessimistic_batches
-
-    logging.info(
-        "Sanity check -- see if any of the batches in epoch 0 would cause OOM."
-    )
-    batches, crit_values = find_pessimistic_batches(train_dl.sampler)
-    for criterion, cuts in batches.items():
-        batch = train_dl.dataset[cuts]
-        try:
-            optimizer.zero_grad()
-            loss, _ = compute_loss(
-                params=params,
-                model=model,
-                sp=sp,
-                batch=batch,
-                is_training=True,
-            )
-            loss.backward()
-            clip_grad_norm_(model.parameters(), 5.0, 2.0)
-            optimizer.step()
-        except RuntimeError as e:
-            if "CUDA out of memory" in str(e):
-                logging.error(
-                    "Your GPU ran out of memory with the current "
-                    "max_duration setting. We recommend decreasing "
-                    "max_duration and trying again.\n"
-                    f"Failing criterion: {criterion} "
-                    f"(={crit_values[criterion]}) ..."
-                )
-            raise
-
-
-def main():
-    parser = get_parser()
-    LibriSpeechAsrDataModule.add_arguments(parser)
-    args = parser.parse_args()
-    args.exp_dir = Path(args.exp_dir)
-
-    world_size = args.world_size
-    assert world_size >= 1
-    if world_size > 1:
-        mp.spawn(run, args=(world_size, args), nprocs=world_size, join=True)
-    else:
-        run(rank=0, world_size=1, args=args)
-
-
-torch.set_num_threads(1)
-torch.set_num_interop_threads(1)
-
-if __name__ == "__main__":
-    main()