diff --git a/egs/librispeech/ASR/conv_emformer_transducer_stateless/asr_datamodule.py b/egs/librispeech/ASR/conv_emformer_transducer_stateless/asr_datamodule.py deleted file mode 100644 index 8dd1459ca..000000000 --- a/egs/librispeech/ASR/conv_emformer_transducer_stateless/asr_datamodule.py +++ /dev/null @@ -1,428 +0,0 @@ -# Copyright 2021 Piotr Żelasko -# Copyright 2022 Xiaomi Corporation (Author: Mingshuang Luo) -# -# See ../../../../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import argparse -import inspect -import logging -from functools import lru_cache -from pathlib import Path -from typing import Any, Dict, Optional - -import torch -from lhotse import CutSet, Fbank, FbankConfig, load_manifest -from lhotse.dataset import ( - BucketingSampler, - CutConcatenate, - CutMix, - K2SpeechRecognitionDataset, - PrecomputedFeatures, - SingleCutSampler, - SpecAugment, -) -from lhotse.dataset.input_strategies import OnTheFlyFeatures -from lhotse.utils import fix_random_seed -from torch.utils.data import DataLoader - -from icefall.utils import str2bool - - -class _SeedWorkers: - def __init__(self, seed: int): - self.seed = seed - - def __call__(self, worker_id: int): - fix_random_seed(self.seed + worker_id) - - -class LibriSpeechAsrDataModule: - """ - DataModule for k2 ASR experiments. - It assumes there is always one train and valid dataloader, - but there can be multiple test dataloaders (e.g. LibriSpeech test-clean - and test-other). - - It contains all the common data pipeline modules used in ASR - experiments, e.g.: - - dynamic batch size, - - bucketing samplers, - - cut concatenation, - - augmentation, - - on-the-fly feature extraction - - This class should be derived for specific corpora used in ASR tasks. - """ - - def __init__(self, args: argparse.Namespace): - self.args = args - - @classmethod - def add_arguments(cls, parser: argparse.ArgumentParser): - group = parser.add_argument_group( - title="ASR data related options", - description="These options are used for the preparation of " - "PyTorch DataLoaders from Lhotse CutSet's -- they control the " - "effective batch sizes, sampling strategies, applied data " - "augmentations, etc.", - ) - group.add_argument( - "--full-libri", - type=str2bool, - default=True, - help="When enabled, use 960h LibriSpeech. " - "Otherwise, use 100h subset.", - ) - group.add_argument( - "--manifest-dir", - type=Path, - default=Path("data/fbank"), - help="Path to directory with train/valid/test cuts.", - ) - group.add_argument( - "--max-duration", - type=int, - default=200.0, - help="Maximum pooled recordings duration (seconds) in a " - "single batch. 
You can reduce it if it causes CUDA OOM.", - ) - group.add_argument( - "--bucketing-sampler", - type=str2bool, - default=True, - help="When enabled, the batches will come from buckets of " - "similar duration (saves padding frames).", - ) - group.add_argument( - "--num-buckets", - type=int, - default=30, - help="The number of buckets for the BucketingSampler" - "(you might want to increase it for larger datasets).", - ) - group.add_argument( - "--concatenate-cuts", - type=str2bool, - default=False, - help="When enabled, utterances (cuts) will be concatenated " - "to minimize the amount of padding.", - ) - group.add_argument( - "--duration-factor", - type=float, - default=1.0, - help="Determines the maximum duration of a concatenated cut " - "relative to the duration of the longest cut in a batch.", - ) - group.add_argument( - "--gap", - type=float, - default=1.0, - help="The amount of padding (in seconds) inserted between " - "concatenated cuts. This padding is filled with noise when " - "noise augmentation is used.", - ) - group.add_argument( - "--on-the-fly-feats", - type=str2bool, - default=False, - help="When enabled, use on-the-fly cut mixing and feature " - "extraction. Will drop existing precomputed feature manifests " - "if available.", - ) - group.add_argument( - "--shuffle", - type=str2bool, - default=True, - help="When enabled (=default), the examples will be " - "shuffled for each epoch.", - ) - group.add_argument( - "--return-cuts", - type=str2bool, - default=True, - help="When enabled, each batch will have the " - "field: batch['supervisions']['cut'] with the cuts that " - "were used to construct it.", - ) - - group.add_argument( - "--num-workers", - type=int, - default=2, - help="The number of training dataloader workers that " - "collect the batches.", - ) - - group.add_argument( - "--enable-spec-aug", - type=str2bool, - default=True, - help="When enabled, use SpecAugment for training dataset.", - ) - - group.add_argument( - "--spec-aug-time-warp-factor", - type=int, - default=80, - help="Used only when --enable-spec-aug is True. " - "It specifies the factor for time warping in SpecAugment. " - "Larger values mean more warping. " - "A value less than 1 means to disable time warp.", - ) - - group.add_argument( - "--enable-musan", - type=str2bool, - default=True, - help="When enabled, select noise from MUSAN and mix it" - "with training dataset. ", - ) - - def train_dataloaders( - self, - cuts_train: CutSet, - sampler_state_dict: Optional[Dict[str, Any]] = None, - ) -> DataLoader: - """ - Args: - cuts_train: - CutSet for training. - sampler_state_dict: - The state dict for the training sampler. - """ - transforms = [] - if self.args.enable_musan: - logging.info("Enable MUSAN") - logging.info("About to get Musan cuts") - cuts_musan = load_manifest( - self.args.manifest_dir / "cuts_musan.json.gz" - ) - transforms.append( - CutMix( - cuts=cuts_musan, prob=0.5, snr=(10, 20), preserve_id=True - ) - ) - else: - logging.info("Disable MUSAN") - - if self.args.concatenate_cuts: - logging.info( - f"Using cut concatenation with duration factor " - f"{self.args.duration_factor} and gap {self.args.gap}." - ) - # Cut concatenation should be the first transform in the list, - # so that if we e.g. mix noise in, it will fill the gaps between - # different utterances. 
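# Illustrative note (added by the editor, not part of the original file):
# with both --enable-musan and --concatenate-cuts enabled, cut_transforms
# ends up as [CutConcatenate(...), CutMix(cuts=cuts_musan, ...)], so the
# MUSAN noise mixed in by CutMix also covers the `gap` seconds of padding
# that CutConcatenate inserts between the joined utterances.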
- transforms = [ - CutConcatenate( - duration_factor=self.args.duration_factor, gap=self.args.gap - ) - ] + transforms - - input_transforms = [] - if self.args.enable_spec_aug: - logging.info("Enable SpecAugment") - logging.info( - f"Time warp factor: {self.args.spec_aug_time_warp_factor}" - ) - # Set the value of num_frame_masks according to Lhotse's version. - # In different Lhotse's versions, the default of num_frame_masks is - # different. - num_frame_masks = 10 - num_frame_masks_parameter = inspect.signature( - SpecAugment.__init__ - ).parameters["num_frame_masks"] - if num_frame_masks_parameter.default == 1: - num_frame_masks = 2 - logging.info(f"Num frame mask: {num_frame_masks}") - input_transforms.append( - SpecAugment( - time_warp_factor=self.args.spec_aug_time_warp_factor, - num_frame_masks=num_frame_masks, - features_mask_size=27, - num_feature_masks=2, - frames_mask_size=100, - ) - ) - else: - logging.info("Disable SpecAugment") - - logging.info("About to create train dataset") - train = K2SpeechRecognitionDataset( - cut_transforms=transforms, - input_transforms=input_transforms, - return_cuts=self.args.return_cuts, - ) - - if self.args.on_the_fly_feats: - # NOTE: the PerturbSpeed transform should be added only if we - # remove it from data prep stage. - # Add on-the-fly speed perturbation; since originally it would - # have increased epoch size by 3, we will apply prob 2/3 and use - # 3x more epochs. - # Speed perturbation probably should come first before - # concatenation, but in principle the transforms order doesn't have - # to be strict (e.g. could be randomized) - # transforms = [PerturbSpeed(factors=[0.9, 1.1], p=2/3)] + transforms # noqa - # Drop feats to be on the safe side. - train = K2SpeechRecognitionDataset( - cut_transforms=transforms, - input_strategy=OnTheFlyFeatures( - Fbank(FbankConfig(num_mel_bins=80)) - ), - input_transforms=input_transforms, - return_cuts=self.args.return_cuts, - ) - - if self.args.bucketing_sampler: - logging.info("Using BucketingSampler.") - train_sampler = BucketingSampler( - cuts_train, - max_duration=self.args.max_duration, - shuffle=self.args.shuffle, - num_buckets=self.args.num_buckets, - bucket_method="equal_duration", - drop_last=True, - ) - else: - logging.info("Using SingleCutSampler.") - train_sampler = SingleCutSampler( - cuts_train, - max_duration=self.args.max_duration, - shuffle=self.args.shuffle, - ) - logging.info("About to create train dataloader") - - if sampler_state_dict is not None: - logging.info("Loading sampler state dict") - train_sampler.load_state_dict(sampler_state_dict) - - # 'seed' is derived from the current random state, which will have - # previously been set in the main process. 
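# Illustrative note (added by the editor, not part of the original file):
# the seed drawn below is handed to _SeedWorkers (defined near the top of
# this file), whose __call__ runs inside every dataloader worker and invokes
# fix_random_seed(seed + worker_id), giving each worker a distinct but
# reproducible random state for on-the-fly augmentation.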
- seed = torch.randint(0, 100000, ()).item() - worker_init_fn = _SeedWorkers(seed) - - train_dl = DataLoader( - train, - sampler=train_sampler, - batch_size=None, - num_workers=self.args.num_workers, - persistent_workers=False, - worker_init_fn=worker_init_fn, - ) - - return train_dl - - def valid_dataloaders(self, cuts_valid: CutSet) -> DataLoader: - transforms = [] - if self.args.concatenate_cuts: - transforms = [ - CutConcatenate( - duration_factor=self.args.duration_factor, gap=self.args.gap - ) - ] + transforms - - logging.info("About to create dev dataset") - if self.args.on_the_fly_feats: - validate = K2SpeechRecognitionDataset( - cut_transforms=transforms, - input_strategy=OnTheFlyFeatures( - Fbank(FbankConfig(num_mel_bins=80)) - ), - return_cuts=self.args.return_cuts, - ) - else: - validate = K2SpeechRecognitionDataset( - cut_transforms=transforms, - return_cuts=self.args.return_cuts, - ) - valid_sampler = BucketingSampler( - cuts_valid, - max_duration=self.args.max_duration, - shuffle=False, - ) - logging.info("About to create dev dataloader") - valid_dl = DataLoader( - validate, - sampler=valid_sampler, - batch_size=None, - num_workers=2, - persistent_workers=False, - ) - - return valid_dl - - def test_dataloaders(self, cuts: CutSet) -> DataLoader: - logging.debug("About to create test dataset") - test = K2SpeechRecognitionDataset( - input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80))) - if self.args.on_the_fly_feats - else PrecomputedFeatures(), - return_cuts=self.args.return_cuts, - ) - sampler = BucketingSampler( - cuts, max_duration=self.args.max_duration, shuffle=False - ) - logging.debug("About to create test dataloader") - test_dl = DataLoader( - test, - batch_size=None, - sampler=sampler, - num_workers=self.args.num_workers, - ) - return test_dl - - @lru_cache() - def train_clean_100_cuts(self) -> CutSet: - logging.info("About to get train-clean-100 cuts") - return load_manifest( - self.args.manifest_dir / "cuts_train-clean-100.json.gz" - ) - - @lru_cache() - def train_clean_360_cuts(self) -> CutSet: - logging.info("About to get train-clean-360 cuts") - return load_manifest( - self.args.manifest_dir / "cuts_train-clean-360.json.gz" - ) - - @lru_cache() - def train_other_500_cuts(self) -> CutSet: - logging.info("About to get train-other-500 cuts") - return load_manifest( - self.args.manifest_dir / "cuts_train-other-500.json.gz" - ) - - @lru_cache() - def dev_clean_cuts(self) -> CutSet: - logging.info("About to get dev-clean cuts") - return load_manifest(self.args.manifest_dir / "cuts_dev-clean.json.gz") - - @lru_cache() - def dev_other_cuts(self) -> CutSet: - logging.info("About to get dev-other cuts") - return load_manifest(self.args.manifest_dir / "cuts_dev-other.json.gz") - - @lru_cache() - def test_clean_cuts(self) -> CutSet: - logging.info("About to get test-clean cuts") - return load_manifest(self.args.manifest_dir / "cuts_test-clean.json.gz") - - @lru_cache() - def test_other_cuts(self) -> CutSet: - logging.info("About to get test-other cuts") - return load_manifest(self.args.manifest_dir / "cuts_test-other.json.gz") diff --git a/egs/librispeech/ASR/conv_emformer_transducer_stateless/asr_datamodule.py b/egs/librispeech/ASR/conv_emformer_transducer_stateless/asr_datamodule.py new file mode 120000 index 000000000..a074d6085 --- /dev/null +++ b/egs/librispeech/ASR/conv_emformer_transducer_stateless/asr_datamodule.py @@ -0,0 +1 @@ +../pruned_transducer_stateless2/asr_datamodule.py \ No newline at end of file diff --git 
a/egs/librispeech/ASR/conv_emformer_transducer_stateless/beam_search.py b/egs/librispeech/ASR/conv_emformer_transducer_stateless/beam_search.py deleted file mode 100644 index 7c936b257..000000000 --- a/egs/librispeech/ASR/conv_emformer_transducer_stateless/beam_search.py +++ /dev/null @@ -1,977 +0,0 @@ -# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang) -# -# See ../../../../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import warnings -from dataclasses import dataclass -from typing import Dict, List, Optional - -import k2 -import torch -from model import Transducer - -from icefall.decode import Nbest, one_best_decoding -from icefall.utils import get_texts - - -def fast_beam_search_one_best( - model: Transducer, - decoding_graph: k2.Fsa, - encoder_out: torch.Tensor, - encoder_out_lens: torch.Tensor, - beam: float, - max_states: int, - max_contexts: int, -) -> List[List[int]]: - """It limits the maximum number of symbols per frame to 1. - - A lattice is first obtained using modified beam search, and then - the shortest path within the lattice is used as the final output. - - Args: - model: - An instance of `Transducer`. - decoding_graph: - Decoding graph used for decoding, may be a TrivialGraph or a HLG. - encoder_out: - A tensor of shape (N, T, C) from the encoder. - encoder_out_lens: - A tensor of shape (N,) containing the number of frames in `encoder_out` - before padding. - beam: - Beam value, similar to the beam used in Kaldi.. - max_states: - Max states per stream per frame. - max_contexts: - Max contexts pre stream per frame. - Returns: - Return the decoded result. - """ - lattice = fast_beam_search( - model=model, - decoding_graph=decoding_graph, - encoder_out=encoder_out, - encoder_out_lens=encoder_out_lens, - beam=beam, - max_states=max_states, - max_contexts=max_contexts, - ) - - best_path = one_best_decoding(lattice) - hyps = get_texts(best_path) - return hyps - - -def fast_beam_search_nbest_oracle( - model: Transducer, - decoding_graph: k2.Fsa, - encoder_out: torch.Tensor, - encoder_out_lens: torch.Tensor, - beam: float, - max_states: int, - max_contexts: int, - num_paths: int, - ref_texts: List[List[int]], - use_double_scores: bool = True, - nbest_scale: float = 0.5, -) -> List[List[int]]: - """It limits the maximum number of symbols per frame to 1. - - A lattice is first obtained using modified beam search, and then - we select `num_paths` linear paths from the lattice. The path - that has the minimum edit distance with the given reference transcript - is used as the output. - - This is the best result we can achieve for any nbest based rescoring - methods. - - Args: - model: - An instance of `Transducer`. - decoding_graph: - Decoding graph used for decoding, may be a TrivialGraph or a HLG. - encoder_out: - A tensor of shape (N, T, C) from the encoder. - encoder_out_lens: - A tensor of shape (N,) containing the number of frames in `encoder_out` - before padding. - beam: - Beam value, similar to the beam used in Kaldi.. 
- max_states: - Max states per stream per frame. - max_contexts: - Max contexts pre stream per frame. - num_paths: - Number of paths to extract from the decoded lattice. - ref_texts: - A list-of-list of integers containing the reference transcripts. - If the decoding_graph is a trivial_graph, the integer ID is the - BPE token ID. - use_double_scores: - True to use double precision for computation. False to use - single precision. - nbest_scale: - It's the scale applied to the lattice.scores. A smaller value - yields more unique paths. - - Returns: - Return the decoded result. - """ - lattice = fast_beam_search( - model=model, - decoding_graph=decoding_graph, - encoder_out=encoder_out, - encoder_out_lens=encoder_out_lens, - beam=beam, - max_states=max_states, - max_contexts=max_contexts, - ) - - nbest = Nbest.from_lattice( - lattice=lattice, - num_paths=num_paths, - use_double_scores=use_double_scores, - nbest_scale=nbest_scale, - ) - - hyps = nbest.build_levenshtein_graphs() - refs = k2.levenshtein_graph(ref_texts, device=hyps.device) - - levenshtein_alignment = k2.levenshtein_alignment( - refs=refs, - hyps=hyps, - hyp_to_ref_map=nbest.shape.row_ids(1), - sorted_match_ref=True, - ) - - tot_scores = levenshtein_alignment.get_tot_scores( - use_double_scores=False, log_semiring=False - ) - ragged_tot_scores = k2.RaggedTensor(nbest.shape, tot_scores) - - max_indexes = ragged_tot_scores.argmax() - - best_path = k2.index_fsa(nbest.fsa, max_indexes) - - hyps = get_texts(best_path) - return hyps - - -def fast_beam_search( - model: Transducer, - decoding_graph: k2.Fsa, - encoder_out: torch.Tensor, - encoder_out_lens: torch.Tensor, - beam: float, - max_states: int, - max_contexts: int, -) -> k2.Fsa: - """It limits the maximum number of symbols per frame to 1. - - Args: - model: - An instance of `Transducer`. - decoding_graph: - Decoding graph used for decoding, may be a TrivialGraph or a HLG. - encoder_out: - A tensor of shape (N, T, C) from the encoder. - encoder_out_lens: - A tensor of shape (N,) containing the number of frames in `encoder_out` - before padding. - beam: - Beam value, similar to the beam used in Kaldi.. - max_states: - Max states per stream per frame. - max_contexts: - Max contexts pre stream per frame. - Returns: - Return an FsaVec with axes [utt][state][arc] containing the decoded - lattice. Note: When the input graph is a TrivialGraph, the returned - lattice is actually an acceptor. 
- """ - assert encoder_out.ndim == 3 - - context_size = model.decoder.context_size - vocab_size = model.decoder.vocab_size - - B, T, C = encoder_out.shape - - config = k2.RnntDecodingConfig( - vocab_size=vocab_size, - decoder_history_len=context_size, - beam=beam, - max_contexts=max_contexts, - max_states=max_states, - ) - individual_streams = [] - for i in range(B): - individual_streams.append(k2.RnntDecodingStream(decoding_graph)) - decoding_streams = k2.RnntDecodingStreams(individual_streams, config) - - encoder_out = model.joiner.encoder_proj(encoder_out) - - for t in range(T): - # shape is a RaggedShape of shape (B, context) - # contexts is a Tensor of shape (shape.NumElements(), context_size) - shape, contexts = decoding_streams.get_contexts() - # `nn.Embedding()` in torch below v1.7.1 supports only torch.int64 - contexts = contexts.to(torch.int64) - # decoder_out is of shape (shape.NumElements(), 1, decoder_out_dim) - decoder_out = model.decoder(contexts, need_pad=False) - decoder_out = model.joiner.decoder_proj(decoder_out) - # current_encoder_out is of shape - # (shape.NumElements(), 1, joiner_dim) - # fmt: off - current_encoder_out = torch.index_select( - encoder_out[:, t:t + 1, :], 0, shape.row_ids(1).to(torch.int64) - ) - # fmt: on - logits = model.joiner( - current_encoder_out.unsqueeze(2), - decoder_out.unsqueeze(1), - project_input=False, - ) - logits = logits.squeeze(1).squeeze(1) - log_probs = logits.log_softmax(dim=-1) - decoding_streams.advance(log_probs) - decoding_streams.terminate_and_flush_to_streams() - lattice = decoding_streams.format_output(encoder_out_lens.tolist()) - - return lattice - - -def greedy_search( - model: Transducer, encoder_out: torch.Tensor, max_sym_per_frame: int -) -> List[int]: - """Greedy search for a single utterance. - Args: - model: - An instance of `Transducer`. - encoder_out: - A tensor of shape (N, T, C) from the encoder. Support only N==1 for now. - max_sym_per_frame: - Maximum number of symbols per frame. If it is set to 0, the WER - would be 100%. - Returns: - Return the decoded result. - """ - assert encoder_out.ndim == 3 - - # support only batch_size == 1 for now - assert encoder_out.size(0) == 1, encoder_out.size(0) - - blank_id = model.decoder.blank_id - context_size = model.decoder.context_size - unk_id = getattr(model, "unk_id", blank_id) - - device = next(model.parameters()).device - - decoder_input = torch.tensor( - [blank_id] * context_size, device=device, dtype=torch.int64 - ).reshape(1, context_size) - - decoder_out = model.decoder(decoder_input, need_pad=False) - decoder_out = model.joiner.decoder_proj(decoder_out) - - encoder_out = model.joiner.encoder_proj(encoder_out) - - T = encoder_out.size(1) - t = 0 - hyp = [blank_id] * context_size - - # Maximum symbols per utterance. 
- max_sym_per_utt = 1000 - - # symbols per frame - sym_per_frame = 0 - - # symbols per utterance decoded so far - sym_per_utt = 0 - - while t < T and sym_per_utt < max_sym_per_utt: - if sym_per_frame >= max_sym_per_frame: - sym_per_frame = 0 - t += 1 - continue - - # fmt: off - current_encoder_out = encoder_out[:, t:t+1, :].unsqueeze(2) - # fmt: on - logits = model.joiner( - current_encoder_out, decoder_out.unsqueeze(1), project_input=False - ) - # logits is (1, 1, 1, vocab_size) - - y = logits.argmax().item() - if y not in (blank_id, unk_id): - hyp.append(y) - decoder_input = torch.tensor( - [hyp[-context_size:]], device=device - ).reshape(1, context_size) - - decoder_out = model.decoder(decoder_input, need_pad=False) - decoder_out = model.joiner.decoder_proj(decoder_out) - - sym_per_utt += 1 - sym_per_frame += 1 - else: - sym_per_frame = 0 - t += 1 - hyp = hyp[context_size:] # remove blanks - - return hyp - - -def greedy_search_batch( - model: Transducer, - encoder_out: torch.Tensor, - encoder_out_lens: torch.Tensor, -) -> List[List[int]]: - """Greedy search in batch mode. It hardcodes --max-sym-per-frame=1. - Args: - model: - The transducer model. - encoder_out: - Output from the encoder. Its shape is (N, T, C), where N >= 1. - encoder_out_lens: - A 1-D tensor of shape (N,), containing number of valid frames in - encoder_out before padding. - Returns: - Return a list-of-list of token IDs containing the decoded results. - len(ans) equals to encoder_out.size(0). - """ - assert encoder_out.ndim == 3 - assert encoder_out.size(0) >= 1, encoder_out.size(0) - - packed_encoder_out = torch.nn.utils.rnn.pack_padded_sequence( - input=encoder_out, - lengths=encoder_out_lens.cpu(), - batch_first=True, - enforce_sorted=False, - ) - - device = next(model.parameters()).device - - blank_id = model.decoder.blank_id - unk_id = getattr(model, "unk_id", blank_id) - context_size = model.decoder.context_size - - batch_size_list = packed_encoder_out.batch_sizes.tolist() - N = encoder_out.size(0) - assert torch.all(encoder_out_lens > 0), encoder_out_lens - assert N == batch_size_list[0], (N, batch_size_list) - - hyps = [[blank_id] * context_size for _ in range(N)] - - decoder_input = torch.tensor( - hyps, - device=device, - dtype=torch.int64, - ) # (N, context_size) - - decoder_out = model.decoder(decoder_input, need_pad=False) - decoder_out = model.joiner.decoder_proj(decoder_out) - # decoder_out: (N, 1, decoder_out_dim) - - encoder_out = model.joiner.encoder_proj(packed_encoder_out.data) - - offset = 0 - for batch_size in batch_size_list: - start = offset - end = offset + batch_size - current_encoder_out = encoder_out.data[start:end] - current_encoder_out = current_encoder_out.unsqueeze(1).unsqueeze(1) - # current_encoder_out's shape: (batch_size, 1, 1, encoder_out_dim) - offset = end - - decoder_out = decoder_out[:batch_size] - - logits = model.joiner( - current_encoder_out, decoder_out.unsqueeze(1), project_input=False - ) - # logits'shape (batch_size, 1, 1, vocab_size) - - logits = logits.squeeze(1).squeeze(1) # (batch_size, vocab_size) - assert logits.ndim == 2, logits.shape - y = logits.argmax(dim=1).tolist() - emitted = False - for i, v in enumerate(y): - if v not in (blank_id, unk_id): - hyps[i].append(v) - emitted = True - if emitted: - # update decoder output - decoder_input = [h[-context_size:] for h in hyps[:batch_size]] - decoder_input = torch.tensor( - decoder_input, - device=device, - dtype=torch.int64, - ) - decoder_out = model.decoder(decoder_input, need_pad=False) - decoder_out = 
model.joiner.decoder_proj(decoder_out) - - sorted_ans = [h[context_size:] for h in hyps] - ans = [] - unsorted_indices = packed_encoder_out.unsorted_indices.tolist() - for i in range(N): - ans.append(sorted_ans[unsorted_indices[i]]) - - return ans - - -@dataclass -class Hypothesis: - # The predicted tokens so far. - # Newly predicted tokens are appended to `ys`. - ys: List[int] - - # The log prob of ys. - # It contains only one entry. - log_prob: torch.Tensor - - @property - def key(self) -> str: - """Return a string representation of self.ys""" - return "_".join(map(str, self.ys)) - - -class HypothesisList(object): - def __init__(self, data: Optional[Dict[str, Hypothesis]] = None) -> None: - """ - Args: - data: - A dict of Hypotheses. Its key is its `value.key`. - """ - if data is None: - self._data = {} - else: - self._data = data - - @property - def data(self) -> Dict[str, Hypothesis]: - return self._data - - def add(self, hyp: Hypothesis) -> None: - """Add a Hypothesis to `self`. - - If `hyp` already exists in `self`, its probability is updated using - `log-sum-exp` with the existed one. - - Args: - hyp: - The hypothesis to be added. - """ - key = hyp.key - if key in self: - old_hyp = self._data[key] # shallow copy - torch.logaddexp( - old_hyp.log_prob, hyp.log_prob, out=old_hyp.log_prob - ) - else: - self._data[key] = hyp - - def get_most_probable(self, length_norm: bool = False) -> Hypothesis: - """Get the most probable hypothesis, i.e., the one with - the largest `log_prob`. - - Args: - length_norm: - If True, the `log_prob` of a hypothesis is normalized by the - number of tokens in it. - Returns: - Return the hypothesis that has the largest `log_prob`. - """ - if length_norm: - return max( - self._data.values(), key=lambda hyp: hyp.log_prob / len(hyp.ys) - ) - else: - return max(self._data.values(), key=lambda hyp: hyp.log_prob) - - def remove(self, hyp: Hypothesis) -> None: - """Remove a given hypothesis. - - Caution: - `self` is modified **in-place**. - - Args: - hyp: - The hypothesis to be removed from `self`. - Note: It must be contained in `self`. Otherwise, - an exception is raised. - """ - key = hyp.key - assert key in self, f"{key} does not exist" - del self._data[key] - - def filter(self, threshold: torch.Tensor) -> "HypothesisList": - """Remove all Hypotheses whose log_prob is less than threshold. - - Caution: - `self` is not modified. Instead, a new HypothesisList is returned. - - Returns: - Return a new HypothesisList containing all hypotheses from `self` - with `log_prob` being greater than the given `threshold`. - """ - ans = HypothesisList() - for _, hyp in self._data.items(): - if hyp.log_prob > threshold: - ans.add(hyp) # shallow copy - return ans - - def topk(self, k: int) -> "HypothesisList": - """Return the top-k hypothesis.""" - hyps = list(self._data.items()) - - hyps = sorted(hyps, key=lambda h: h[1].log_prob, reverse=True)[:k] - - ans = HypothesisList(dict(hyps)) - return ans - - def __contains__(self, key: str): - return key in self._data - - def __iter__(self): - return iter(self._data.values()) - - def __len__(self) -> int: - return len(self._data) - - def __str__(self) -> str: - s = [] - for key in self: - s.append(key) - return ", ".join(s) - - -def get_hyps_shape(hyps: List[HypothesisList]) -> k2.RaggedShape: - """Return a ragged shape with axes [utt][num_hyps]. - - Args: - hyps: - len(hyps) == batch_size. It contains the current hypothesis for - each utterance in the batch. - Returns: - Return a ragged shape with 2 axes [utt][num_hyps]. 
Note that - the shape is on CPU. - """ - num_hyps = [len(h) for h in hyps] - - # torch.cumsum() is inclusive sum, so we put a 0 at the beginning - # to get exclusive sum later. - num_hyps.insert(0, 0) - - num_hyps = torch.tensor(num_hyps) - row_splits = torch.cumsum(num_hyps, dim=0, dtype=torch.int32) - ans = k2.ragged.create_ragged_shape2( - row_splits=row_splits, cached_tot_size=row_splits[-1].item() - ) - return ans - - -def modified_beam_search( - model: Transducer, - encoder_out: torch.Tensor, - encoder_out_lens: torch.Tensor, - beam: int = 4, -) -> List[List[int]]: - """Beam search in batch mode with --max-sym-per-frame=1 being hardcoded. - - Args: - model: - The transducer model. - encoder_out: - Output from the encoder. Its shape is (N, T, C). - encoder_out_lens: - A 1-D tensor of shape (N,), containing number of valid frames in - encoder_out before padding. - beam: - Number of active paths during the beam search. - Returns: - Return a list-of-list of token IDs. ans[i] is the decoding results - for the i-th utterance. - """ - assert encoder_out.ndim == 3, encoder_out.shape - assert encoder_out.size(0) >= 1, encoder_out.size(0) - - packed_encoder_out = torch.nn.utils.rnn.pack_padded_sequence( - input=encoder_out, - lengths=encoder_out_lens.cpu(), - batch_first=True, - enforce_sorted=False, - ) - - blank_id = model.decoder.blank_id - unk_id = getattr(model, "unk_id", blank_id) - context_size = model.decoder.context_size - device = next(model.parameters()).device - - batch_size_list = packed_encoder_out.batch_sizes.tolist() - N = encoder_out.size(0) - assert torch.all(encoder_out_lens > 0), encoder_out_lens - assert N == batch_size_list[0], (N, batch_size_list) - - B = [HypothesisList() for _ in range(N)] - for i in range(N): - B[i].add( - Hypothesis( - ys=[blank_id] * context_size, - log_prob=torch.zeros(1, dtype=torch.float32, device=device), - ) - ) - - encoder_out = model.joiner.encoder_proj(packed_encoder_out.data) - - offset = 0 - finalized_B = [] - for batch_size in batch_size_list: - start = offset - end = offset + batch_size - current_encoder_out = encoder_out.data[start:end] - current_encoder_out = current_encoder_out.unsqueeze(1).unsqueeze(1) - # current_encoder_out's shape is (batch_size, 1, 1, encoder_out_dim) - offset = end - - finalized_B = B[batch_size:] + finalized_B - B = B[:batch_size] - - hyps_shape = get_hyps_shape(B).to(device) - - A = [list(b) for b in B] - B = [HypothesisList() for _ in range(batch_size)] - - ys_log_probs = torch.cat( - [hyp.log_prob.reshape(1, 1) for hyps in A for hyp in hyps] - ) # (num_hyps, 1) - - decoder_input = torch.tensor( - [hyp.ys[-context_size:] for hyps in A for hyp in hyps], - device=device, - dtype=torch.int64, - ) # (num_hyps, context_size) - - decoder_out = model.decoder(decoder_input, need_pad=False).unsqueeze(1) - decoder_out = model.joiner.decoder_proj(decoder_out) - # decoder_out is of shape (num_hyps, 1, 1, joiner_dim) - - # Note: For torch 1.7.1 and below, it requires a torch.int64 tensor - # as index, so we use `to(torch.int64)` below. 
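# Illustrative note (added by the editor, not part of the original file):
# hyps_shape.row_ids(1) maps each active hypothesis to its utterance; e.g.
# with row_splits [0, 2, 5] (two hypotheses for utterance 0, three for
# utterance 1) it is [0, 0, 1, 1, 1], so the index_select below repeats each
# utterance's encoder frame once per hypothesis before the joiner is applied.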
- current_encoder_out = torch.index_select( - current_encoder_out, - dim=0, - index=hyps_shape.row_ids(1).to(torch.int64), - ) # (num_hyps, 1, 1, encoder_out_dim) - - logits = model.joiner( - current_encoder_out, - decoder_out, - project_input=False, - ) # (num_hyps, 1, 1, vocab_size) - - logits = logits.squeeze(1).squeeze(1) # (num_hyps, vocab_size) - - log_probs = logits.log_softmax(dim=-1) # (num_hyps, vocab_size) - - log_probs.add_(ys_log_probs) - - vocab_size = log_probs.size(-1) - - log_probs = log_probs.reshape(-1) - - row_splits = hyps_shape.row_splits(1) * vocab_size - log_probs_shape = k2.ragged.create_ragged_shape2( - row_splits=row_splits, cached_tot_size=log_probs.numel() - ) - ragged_log_probs = k2.RaggedTensor( - shape=log_probs_shape, value=log_probs - ) - - for i in range(batch_size): - topk_log_probs, topk_indexes = ragged_log_probs[i].topk(beam) - - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - topk_hyp_indexes = (topk_indexes // vocab_size).tolist() - topk_token_indexes = (topk_indexes % vocab_size).tolist() - - for k in range(len(topk_hyp_indexes)): - hyp_idx = topk_hyp_indexes[k] - hyp = A[i][hyp_idx] - - new_ys = hyp.ys[:] - new_token = topk_token_indexes[k] - if new_token not in (blank_id, unk_id): - new_ys.append(new_token) - - new_log_prob = topk_log_probs[k] - new_hyp = Hypothesis(ys=new_ys, log_prob=new_log_prob) - B[i].add(new_hyp) - - B = B + finalized_B - best_hyps = [b.get_most_probable(length_norm=True) for b in B] - - sorted_ans = [h.ys[context_size:] for h in best_hyps] - ans = [] - unsorted_indices = packed_encoder_out.unsorted_indices.tolist() - for i in range(N): - ans.append(sorted_ans[unsorted_indices[i]]) - - return ans - - -def _deprecated_modified_beam_search( - model: Transducer, - encoder_out: torch.Tensor, - beam: int = 4, -) -> List[int]: - """It limits the maximum number of symbols per frame to 1. - - It decodes only one utterance at a time. We keep it only for reference. - The function :func:`modified_beam_search` should be preferred as it - supports batch decoding. - - - Args: - model: - An instance of `Transducer`. - encoder_out: - A tensor of shape (N, T, C) from the encoder. Support only N==1 for now. - beam: - Beam size. - Returns: - Return the decoded result. 
- """ - - assert encoder_out.ndim == 3 - - # support only batch_size == 1 for now - assert encoder_out.size(0) == 1, encoder_out.size(0) - blank_id = model.decoder.blank_id - unk_id = getattr(model, "unk_id", blank_id) - context_size = model.decoder.context_size - - device = next(model.parameters()).device - - T = encoder_out.size(1) - - B = HypothesisList() - B.add( - Hypothesis( - ys=[blank_id] * context_size, - log_prob=torch.zeros(1, dtype=torch.float32, device=device), - ) - ) - encoder_out = model.joiner.encoder_proj(encoder_out) - - for t in range(T): - # fmt: off - current_encoder_out = encoder_out[:, t:t+1, :].unsqueeze(2) - # current_encoder_out is of shape (1, 1, 1, encoder_out_dim) - # fmt: on - A = list(B) - B = HypothesisList() - - ys_log_probs = torch.cat([hyp.log_prob.reshape(1, 1) for hyp in A]) - # ys_log_probs is of shape (num_hyps, 1) - - decoder_input = torch.tensor( - [hyp.ys[-context_size:] for hyp in A], - device=device, - dtype=torch.int64, - ) - # decoder_input is of shape (num_hyps, context_size) - - decoder_out = model.decoder(decoder_input, need_pad=False).unsqueeze(1) - decoder_out = model.joiner.decoder_proj(decoder_out) - # decoder_output is of shape (num_hyps, 1, 1, joiner_dim) - - current_encoder_out = current_encoder_out.expand( - decoder_out.size(0), 1, 1, -1 - ) # (num_hyps, 1, 1, encoder_out_dim) - - logits = model.joiner( - current_encoder_out, - decoder_out, - project_input=False, - ) - # logits is of shape (num_hyps, 1, 1, vocab_size) - logits = logits.squeeze(1).squeeze(1) - - # now logits is of shape (num_hyps, vocab_size) - log_probs = logits.log_softmax(dim=-1) - - log_probs.add_(ys_log_probs) - - log_probs = log_probs.reshape(-1) - topk_log_probs, topk_indexes = log_probs.topk(beam) - - # topk_hyp_indexes are indexes into `A` - topk_hyp_indexes = topk_indexes // logits.size(-1) - topk_token_indexes = topk_indexes % logits.size(-1) - - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - topk_hyp_indexes = topk_hyp_indexes.tolist() - topk_token_indexes = topk_token_indexes.tolist() - - for i in range(len(topk_hyp_indexes)): - hyp = A[topk_hyp_indexes[i]] - new_ys = hyp.ys[:] - new_token = topk_token_indexes[i] - if new_token not in (blank_id, unk_id): - new_ys.append(new_token) - new_log_prob = topk_log_probs[i] - new_hyp = Hypothesis(ys=new_ys, log_prob=new_log_prob) - B.add(new_hyp) - - best_hyp = B.get_most_probable(length_norm=True) - ys = best_hyp.ys[context_size:] # [context_size:] to remove blanks - - return ys - - -def beam_search( - model: Transducer, - encoder_out: torch.Tensor, - beam: int = 4, -) -> List[int]: - """ - It implements Algorithm 1 in https://arxiv.org/pdf/1211.3711.pdf - - espnet/nets/beam_search_transducer.py#L247 is used as a reference. - - Args: - model: - An instance of `Transducer`. - encoder_out: - A tensor of shape (N, T, C) from the encoder. Support only N==1 for now. - beam: - Beam size. - Returns: - Return the decoded result. 
- """ - assert encoder_out.ndim == 3 - - # support only batch_size == 1 for now - assert encoder_out.size(0) == 1, encoder_out.size(0) - blank_id = model.decoder.blank_id - unk_id = getattr(model, "unk_id", blank_id) - context_size = model.decoder.context_size - - device = next(model.parameters()).device - - decoder_input = torch.tensor( - [blank_id] * context_size, - device=device, - dtype=torch.int64, - ).reshape(1, context_size) - - decoder_out = model.decoder(decoder_input, need_pad=False) - decoder_out = model.joiner.decoder_proj(decoder_out) - - encoder_out = model.joiner.encoder_proj(encoder_out) - - T = encoder_out.size(1) - t = 0 - - B = HypothesisList() - B.add(Hypothesis(ys=[blank_id] * context_size, log_prob=0.0)) - - max_sym_per_utt = 20000 - - sym_per_utt = 0 - - decoder_cache: Dict[str, torch.Tensor] = {} - - while t < T and sym_per_utt < max_sym_per_utt: - # fmt: off - current_encoder_out = encoder_out[:, t:t+1, :].unsqueeze(2) - # fmt: on - A = B - B = HypothesisList() - - joint_cache: Dict[str, torch.Tensor] = {} - - # TODO(fangjun): Implement prefix search to update the `log_prob` - # of hypotheses in A - - while True: - y_star = A.get_most_probable() - A.remove(y_star) - - cached_key = y_star.key - - if cached_key not in decoder_cache: - decoder_input = torch.tensor( - [y_star.ys[-context_size:]], - device=device, - dtype=torch.int64, - ).reshape(1, context_size) - - decoder_out = model.decoder(decoder_input, need_pad=False) - decoder_out = model.joiner.decoder_proj(decoder_out) - decoder_cache[cached_key] = decoder_out - else: - decoder_out = decoder_cache[cached_key] - - cached_key += f"-t-{t}" - if cached_key not in joint_cache: - logits = model.joiner( - current_encoder_out, - decoder_out.unsqueeze(1), - project_input=False, - ) - - # TODO(fangjun): Scale the blank posterior - log_prob = logits.log_softmax(dim=-1) - # log_prob is (1, 1, 1, vocab_size) - log_prob = log_prob.squeeze() - # Now log_prob is (vocab_size,) - joint_cache[cached_key] = log_prob - else: - log_prob = joint_cache[cached_key] - - # First, process the blank symbol - skip_log_prob = log_prob[blank_id] - new_y_star_log_prob = y_star.log_prob + skip_log_prob - - # ys[:] returns a copy of ys - B.add(Hypothesis(ys=y_star.ys[:], log_prob=new_y_star_log_prob)) - - # Second, process other non-blank labels - values, indices = log_prob.topk(beam + 1) - for i, v in zip(indices.tolist(), values.tolist()): - if i in (blank_id, unk_id): - continue - new_ys = y_star.ys + [i] - new_log_prob = y_star.log_prob + v - A.add(Hypothesis(ys=new_ys, log_prob=new_log_prob)) - - # Check whether B contains more than "beam" elements more probable - # than the most probable in A - A_most_probable = A.get_most_probable() - - kept_B = B.filter(A_most_probable.log_prob) - - if len(kept_B) >= beam: - B = kept_B.topk(beam) - break - - t += 1 - - best_hyp = B.get_most_probable(length_norm=True) - ys = best_hyp.ys[context_size:] # [context_size:] to remove blanks - return ys diff --git a/egs/librispeech/ASR/conv_emformer_transducer_stateless/beam_search.py b/egs/librispeech/ASR/conv_emformer_transducer_stateless/beam_search.py new file mode 120000 index 000000000..8554e44cc --- /dev/null +++ b/egs/librispeech/ASR/conv_emformer_transducer_stateless/beam_search.py @@ -0,0 +1 @@ +../pruned_transducer_stateless2/beam_search.py \ No newline at end of file diff --git a/egs/librispeech/ASR/conv_emformer_transducer_stateless/decoder.py b/egs/librispeech/ASR/conv_emformer_transducer_stateless/decoder.py deleted file mode 100644 index 
b6d94aaf1..000000000 --- a/egs/librispeech/ASR/conv_emformer_transducer_stateless/decoder.py +++ /dev/null @@ -1,103 +0,0 @@ -# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang) -# -# See ../../../../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch -import torch.nn as nn -import torch.nn.functional as F -from scaling import ScaledConv1d, ScaledEmbedding - - -class Decoder(nn.Module): - """This class modifies the stateless decoder from the following paper: - - RNN-transducer with stateless prediction network - https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=9054419 - - It removes the recurrent connection from the decoder, i.e., the prediction - network. Different from the above paper, it adds an extra Conv1d - right after the embedding layer. - - TODO: Implement https://arxiv.org/pdf/2109.07513.pdf - """ - - def __init__( - self, - vocab_size: int, - decoder_dim: int, - blank_id: int, - context_size: int, - ): - """ - Args: - vocab_size: - Number of tokens of the modeling unit including blank. - decoder_dim: - Dimension of the input embedding, and of the decoder output. - blank_id: - The ID of the blank symbol. - context_size: - Number of previous words to use to predict the next word. - 1 means bigram; 2 means trigram. n means (n+1)-gram. - """ - super().__init__() - - self.embedding = ScaledEmbedding( - num_embeddings=vocab_size, - embedding_dim=decoder_dim, - padding_idx=blank_id, - ) - self.blank_id = blank_id - - assert context_size >= 1, context_size - self.context_size = context_size - self.vocab_size = vocab_size - if context_size > 1: - self.conv = ScaledConv1d( - in_channels=decoder_dim, - out_channels=decoder_dim, - kernel_size=context_size, - padding=0, - groups=decoder_dim, - bias=False, - ) - - def forward(self, y: torch.Tensor, need_pad: bool = True) -> torch.Tensor: - """ - Args: - y: - A 2-D tensor of shape (N, U). - need_pad: - True to left pad the input. Should be True during training. - False to not pad the input. Should be False during inference. - Returns: - Return a tensor of shape (N, U, decoder_dim). 
- """ - y = y.to(torch.int64) - embedding_out = self.embedding(y) - if self.context_size > 1: - embedding_out = embedding_out.permute(0, 2, 1) - if need_pad is True: - embedding_out = F.pad( - embedding_out, pad=(self.context_size - 1, 0) - ) - else: - # During inference time, there is no need to do extra padding - # as we only need one output - assert embedding_out.size(-1) == self.context_size - embedding_out = self.conv(embedding_out) - embedding_out = embedding_out.permute(0, 2, 1) - embedding_out = F.relu(embedding_out) - return embedding_out diff --git a/egs/librispeech/ASR/conv_emformer_transducer_stateless/decoder.py b/egs/librispeech/ASR/conv_emformer_transducer_stateless/decoder.py new file mode 120000 index 000000000..0793c5709 --- /dev/null +++ b/egs/librispeech/ASR/conv_emformer_transducer_stateless/decoder.py @@ -0,0 +1 @@ +../pruned_transducer_stateless2/decoder.py \ No newline at end of file diff --git a/egs/librispeech/ASR/conv_emformer_transducer_stateless/emformer.py b/egs/librispeech/ASR/conv_emformer_transducer_stateless/emformer.py index 00e8a2489..a30391d11 100644 --- a/egs/librispeech/ASR/conv_emformer_transducer_stateless/emformer.py +++ b/egs/librispeech/ASR/conv_emformer_transducer_stateless/emformer.py @@ -299,7 +299,7 @@ class ConvolutionModule(nn.Module): self, utterance: torch.Tensor, right_context: torch.Tensor, - ) -> torch.Tensor: + ) -> Tuple[torch.Tensor, torch.Tensor]: """Causal convolution module. Args: @@ -362,12 +362,13 @@ class ConvolutionModule(nn.Module): right_context.permute(2, 0, 1), ) + @torch.jit.export def infer( self, utterance: torch.Tensor, right_context: torch.Tensor, - cache: Optional[torch.Tensor] = None, - ) -> torch.Tensor: + cache: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """Causal convolution module applied on both utterance and right_context. Args: @@ -675,6 +676,7 @@ class EmformerAttention(nn.Module): ) return output_right_context_utterance, output_memory[:-1] + @torch.jit.export def infer( self, utterance: torch.Tensor, @@ -910,8 +912,8 @@ class EmformerEncoderLayer(nn.Module): self, right_context_utterance: torch.Tensor, R: int, - conv_cache: Optional[torch.Tensor] = None, - ) -> torch.Tensor: + conv_cache: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor]: """Apply convolution module on utterance in inference mode.""" utterance = right_context_utterance[R:] right_context = right_context_utterance[:R] @@ -1012,7 +1014,7 @@ class EmformerEncoderLayer(nn.Module): utterance: torch.Tensor, right_context: torch.Tensor, memory: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, + attention_mask: torch.Tensor, padding_mask: Optional[torch.Tensor] = None, warmup: float = 1.0, ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: @@ -1085,6 +1087,7 @@ class EmformerEncoderLayer(nn.Module): output_right_context = src[:R] return output_utterance, output_right_context, output_memory + @torch.jit.export def infer( self, utterance: torch.Tensor, @@ -1093,7 +1096,13 @@ class EmformerEncoderLayer(nn.Module): attn_cache: List[torch.Tensor], conv_cache: torch.Tensor, padding_mask: Optional[torch.Tensor] = None, - ) -> Tuple[torch.Tensor, torch.Tensor, List[torch.Tensor], torch.Tensor]: + ) -> Tuple[ + torch.Tensor, + torch.Tensor, + torch.Tensor, + List[torch.Tensor], + torch.Tensor, + ]: """Forward pass for inference. 
B: batch size; @@ -1478,16 +1487,17 @@ class EmformerEncoder(nn.Module): return output, output_lengths + @torch.jit.export def infer( self, x: torch.Tensor, lengths: torch.Tensor, num_processed_frames: torch.Tensor, - states: Tuple[Tuple[List[List[torch.Tensor]], List[torch.Tensor]]], + states: Tuple[List[List[torch.Tensor]], List[torch.Tensor]], ) -> Tuple[ torch.Tensor, torch.Tensor, - Tuple[Tuple[List[List[torch.Tensor]], List[torch.Tensor]]], + Tuple[List[List[torch.Tensor]], List[torch.Tensor]], ]: """Forward pass for streaming inference. @@ -1565,14 +1575,12 @@ class EmformerEncoder(nn.Module): # calcualte padding mask to mask out initial zero caches chunk_mask = make_pad_mask(output_lengths).to(x.device) - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - memory_mask = ( - (num_processed_frames // self.chunk_length).view(x.size(1), 1) - <= torch.arange(self.memory_size, device=x.device).expand( - x.size(1), self.memory_size - ) - ).flip(1) + memory_mask = ( + (num_processed_frames // self.chunk_length).view(x.size(1), 1) + <= torch.arange(self.memory_size, device=x.device).expand( + x.size(1), self.memory_size + ) + ).flip(1) left_context_mask = ( num_processed_frames.view(x.size(1), 1) <= torch.arange(self.left_context_length, device=x.device).expand( @@ -1611,7 +1619,10 @@ class EmformerEncoder(nn.Module): output_attn_caches.append(output_attn_cache) output_conv_caches.append(output_conv_cache) - output_states = [output_attn_caches, output_conv_caches] + output_states: Tuple[List[List[torch.Tensor]], List[torch.Tensor]] = ( + output_attn_caches, + output_conv_caches, + ) return output, output_lengths, output_states @@ -1706,10 +1717,7 @@ class Emformer(EncoderInterface): x = self.encoder_embed(x) x = x.permute(1, 0, 2) # (N, T, C) -> (T, N, C) - # Caution: We assume the subsampling factor is 4! - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - x_lens = ((x_lens - 1) // 2 - 1) // 2 + x_lens = (((x_lens - 1) >> 1) - 1) >> 1 assert x.size(0) == x_lens.max().item() output, output_lengths = self.encoder( @@ -1726,8 +1734,12 @@ class Emformer(EncoderInterface): x: torch.Tensor, x_lens: torch.Tensor, num_processed_frames: torch.Tensor, - states: Optional[List[List[torch.Tensor]]] = None, - ) -> Tuple[torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]: + states: Tuple[List[List[torch.Tensor]], List[torch.Tensor]], + ) -> Tuple[ + torch.Tensor, + torch.Tensor, + Tuple[List[List[torch.Tensor]], List[torch.Tensor]], + ]: """Forward pass for streaming inference. B: batch size; diff --git a/egs/librispeech/ASR/conv_emformer_transducer_stateless/encoder_interface.py b/egs/librispeech/ASR/conv_emformer_transducer_stateless/encoder_interface.py deleted file mode 100644 index 257facce4..000000000 --- a/egs/librispeech/ASR/conv_emformer_transducer_stateless/encoder_interface.py +++ /dev/null @@ -1,43 +0,0 @@ -# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang) -# -# See ../../../../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Tuple - -import torch -import torch.nn as nn - - -class EncoderInterface(nn.Module): - def forward( - self, x: torch.Tensor, x_lens: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ - Args: - x: - A tensor of shape (batch_size, input_seq_len, num_features) - containing the input features. - x_lens: - A tensor of shape (batch_size,) containing the number of frames - in `x` before padding. - Returns: - Return a tuple containing two tensors: - - encoder_out, a tensor of (batch_size, out_seq_len, output_dim) - containing unnormalized probabilities, i.e., the output of a - linear layer. - - encoder_out_lens, a tensor of shape (batch_size,) containing - the number of frames in `encoder_out` before padding. - """ - raise NotImplementedError("Please implement it in a subclass") diff --git a/egs/librispeech/ASR/conv_emformer_transducer_stateless/encoder_interface.py b/egs/librispeech/ASR/conv_emformer_transducer_stateless/encoder_interface.py new file mode 120000 index 000000000..b9aa0ae08 --- /dev/null +++ b/egs/librispeech/ASR/conv_emformer_transducer_stateless/encoder_interface.py @@ -0,0 +1 @@ +../pruned_transducer_stateless2/encoder_interface.py \ No newline at end of file diff --git a/egs/librispeech/ASR/conv_emformer_transducer_stateless/joiner.py b/egs/librispeech/ASR/conv_emformer_transducer_stateless/joiner.py deleted file mode 100644 index 35f75ed2a..000000000 --- a/egs/librispeech/ASR/conv_emformer_transducer_stateless/joiner.py +++ /dev/null @@ -1,67 +0,0 @@ -# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang) -# -# See ../../../../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch -import torch.nn as nn -from scaling import ScaledLinear - - -class Joiner(nn.Module): - def __init__( - self, - encoder_dim: int, - decoder_dim: int, - joiner_dim: int, - vocab_size: int, - ): - super().__init__() - - self.encoder_proj = ScaledLinear(encoder_dim, joiner_dim) - self.decoder_proj = ScaledLinear(decoder_dim, joiner_dim) - self.output_linear = ScaledLinear(joiner_dim, vocab_size) - - def forward( - self, - encoder_out: torch.Tensor, - decoder_out: torch.Tensor, - project_input: bool = True, - ) -> torch.Tensor: - """ - Args: - encoder_out: - Output from the encoder. Its shape is (N, T, s_range, C). - decoder_out: - Output from the decoder. Its shape is (N, T, s_range, C). - project_input: - If true, apply input projections encoder_proj and decoder_proj. - If this is false, it is the user's responsibility to do this - manually. - Returns: - Return a tensor of shape (N, T, s_range, C). 
- """ - assert encoder_out.ndim == decoder_out.ndim == 4 - assert encoder_out.shape[:-1] == decoder_out.shape[:-1] - - if project_input: - logit = self.encoder_proj(encoder_out) + self.decoder_proj( - decoder_out - ) - else: - logit = encoder_out + decoder_out - - logit = self.output_linear(torch.tanh(logit)) - - return logit diff --git a/egs/librispeech/ASR/conv_emformer_transducer_stateless/joiner.py b/egs/librispeech/ASR/conv_emformer_transducer_stateless/joiner.py new file mode 120000 index 000000000..815fd4bb6 --- /dev/null +++ b/egs/librispeech/ASR/conv_emformer_transducer_stateless/joiner.py @@ -0,0 +1 @@ +../pruned_transducer_stateless2/joiner.py \ No newline at end of file diff --git a/egs/librispeech/ASR/conv_emformer_transducer_stateless/model.py b/egs/librispeech/ASR/conv_emformer_transducer_stateless/model.py deleted file mode 100644 index 599bf2506..000000000 --- a/egs/librispeech/ASR/conv_emformer_transducer_stateless/model.py +++ /dev/null @@ -1,193 +0,0 @@ -# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang, Wei Kang) -# -# See ../../../../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import k2 -import torch -import torch.nn as nn -from encoder_interface import EncoderInterface -from scaling import ScaledLinear - -from icefall.utils import add_sos - - -class Transducer(nn.Module): - """It implements https://arxiv.org/pdf/1211.3711.pdf - "Sequence Transduction with Recurrent Neural Networks" - """ - - def __init__( - self, - encoder: EncoderInterface, - decoder: nn.Module, - joiner: nn.Module, - encoder_dim: int, - decoder_dim: int, - joiner_dim: int, - vocab_size: int, - ): - """ - Args: - encoder: - It is the transcription network in the paper. Its accepts - two inputs: `x` of (N, T, encoder_dim) and `x_lens` of shape (N,). - It returns two tensors: `logits` of shape (N, T, encoder_dm) and - `logit_lens` of shape (N,). - decoder: - It is the prediction network in the paper. Its input shape - is (N, U) and its output shape is (N, U, decoder_dim). - It should contain one attribute: `blank_id`. - joiner: - It has two inputs with shapes: (N, T, encoder_dim) and (N, U, decoder_dim). - Its output shape is (N, T, U, vocab_size). Note that its output contains - unnormalized probs, i.e., not processed by log-softmax. - """ - super().__init__() - assert isinstance(encoder, EncoderInterface), type(encoder) - assert hasattr(decoder, "blank_id") - - self.encoder = encoder - self.decoder = decoder - self.joiner = joiner - - self.simple_am_proj = ScaledLinear( - encoder_dim, vocab_size, initial_speed=0.5 - ) - self.simple_lm_proj = ScaledLinear(decoder_dim, vocab_size) - - def forward( - self, - x: torch.Tensor, - x_lens: torch.Tensor, - y: k2.RaggedTensor, - prune_range: int = 5, - am_scale: float = 0.0, - lm_scale: float = 0.0, - warmup: float = 1.0, - ) -> torch.Tensor: - """ - Args: - x: - A 3-D tensor of shape (N, T, C). - x_lens: - A 1-D tensor of shape (N,). It contains the number of frames in `x` - before padding. 
- y: - A ragged tensor with 2 axes [utt][label]. It contains labels of each - utterance. - prune_range: - The prune range for rnnt loss, it means how many symbols(context) - we are considering for each frame to compute the loss. - am_scale: - The scale to smooth the loss with am (output of encoder network) - part - lm_scale: - The scale to smooth the loss with lm (output of predictor network) - part - warmup: - A value warmup >= 0 that determines which modules are active, values - warmup > 1 "are fully warmed up" and all modules will be active. - Returns: - Return the transducer loss. - - Note: - Regarding am_scale & lm_scale, it will make the loss-function one of - the form: - lm_scale * lm_probs + am_scale * am_probs + - (1-lm_scale-am_scale) * combined_probs - """ - assert x.ndim == 3, x.shape - assert x_lens.ndim == 1, x_lens.shape - assert y.num_axes == 2, y.num_axes - - assert x.size(0) == x_lens.size(0) == y.dim0 - - encoder_out, x_lens = self.encoder(x, x_lens, warmup=warmup) - assert torch.all(x_lens > 0) - - # Now for the decoder, i.e., the prediction network - row_splits = y.shape.row_splits(1) - y_lens = row_splits[1:] - row_splits[:-1] - - blank_id = self.decoder.blank_id - sos_y = add_sos(y, sos_id=blank_id) - - # sos_y_padded: [B, S + 1], start with SOS. - sos_y_padded = sos_y.pad(mode="constant", padding_value=blank_id) - - # decoder_out: [B, S + 1, decoder_dim] - decoder_out = self.decoder(sos_y_padded) - - # Note: y does not start with SOS - # y_padded : [B, S] - y_padded = y.pad(mode="constant", padding_value=0) - - y_padded = y_padded.to(torch.int64) - boundary = torch.zeros( - (x.size(0), 4), dtype=torch.int64, device=x.device - ) - boundary[:, 2] = y_lens - boundary[:, 3] = x_lens - - lm = self.simple_lm_proj(decoder_out) - am = self.simple_am_proj(encoder_out) - - with torch.cuda.amp.autocast(enabled=False): - simple_loss, (px_grad, py_grad) = k2.rnnt_loss_smoothed( - lm=lm.float(), - am=am.float(), - symbols=y_padded, - termination_symbol=blank_id, - lm_only_scale=lm_scale, - am_only_scale=am_scale, - boundary=boundary, - reduction="sum", - return_grad=True, - ) - - # ranges : [B, T, prune_range] - ranges = k2.get_rnnt_prune_ranges( - px_grad=px_grad, - py_grad=py_grad, - boundary=boundary, - s_range=prune_range, - ) - - # am_pruned : [B, T, prune_range, encoder_dim] - # lm_pruned : [B, T, prune_range, decoder_dim] - am_pruned, lm_pruned = k2.do_rnnt_pruning( - am=self.joiner.encoder_proj(encoder_out), - lm=self.joiner.decoder_proj(decoder_out), - ranges=ranges, - ) - - # logits : [B, T, prune_range, vocab_size] - - # project_input=False since we applied the decoder's input projections - # prior to do_rnnt_pruning (this is an optimization for speed). 
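# Illustrative note (added by the editor, not part of the original file):
# with project_input=False the Joiner (see joiner.py earlier in this diff)
# skips encoder_proj/decoder_proj and only computes
# output_linear(tanh(am_pruned + lm_pruned)); the projections were applied
# once to the full encoder/decoder outputs before k2.do_rnnt_pruning, which
# is cheaper than projecting the pruned (B, T, prune_range, C) tensors.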
- logits = self.joiner(am_pruned, lm_pruned, project_input=False) - - with torch.cuda.amp.autocast(enabled=False): - pruned_loss = k2.rnnt_loss_pruned( - logits=logits.float(), - symbols=y_padded, - ranges=ranges, - termination_symbol=blank_id, - boundary=boundary, - reduction="sum", - ) - - return (simple_loss, pruned_loss) diff --git a/egs/librispeech/ASR/conv_emformer_transducer_stateless/model.py b/egs/librispeech/ASR/conv_emformer_transducer_stateless/model.py new file mode 120000 index 000000000..ebb6d774d --- /dev/null +++ b/egs/librispeech/ASR/conv_emformer_transducer_stateless/model.py @@ -0,0 +1 @@ +../pruned_transducer_stateless2/model.py \ No newline at end of file diff --git a/egs/librispeech/ASR/conv_emformer_transducer_stateless/optim.py b/egs/librispeech/ASR/conv_emformer_transducer_stateless/optim.py deleted file mode 100644 index 432bf8220..000000000 --- a/egs/librispeech/ASR/conv_emformer_transducer_stateless/optim.py +++ /dev/null @@ -1,331 +0,0 @@ -# Copyright 2022 Xiaomi Corp. (authors: Daniel Povey) -# -# See ../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from typing import List, Optional, Union - -import torch -from torch.optim import Optimizer - - -class Eve(Optimizer): - r""" - Implements Eve algorithm. This is a modified version of AdamW with a special - way of setting the weight-decay / shrinkage-factor, which is designed to make the - rms of the parameters approach a particular target_rms (default: 0.1). This is - for use with networks with 'scaled' versions of modules (see scaling.py), which - will be close to invariant to the absolute scale on the parameter matrix. - - The original Adam algorithm was proposed in `Adam: A Method for Stochastic Optimization`_. - The AdamW variant was proposed in `Decoupled Weight Decay Regularization`_. - Eve is unpublished so far. - - Arguments: - params (iterable): iterable of parameters to optimize or dicts defining - parameter groups - lr (float, optional): learning rate (default: 1e-3) - betas (Tuple[float, float], optional): coefficients used for computing - running averages of gradient and its square (default: (0.9, 0.999)) - eps (float, optional): term added to the denominator to improve - numerical stability (default: 1e-8) - weight_decay (float, optional): weight decay coefficient (default: 3e-4; - this value means that the weight would decay significantly after - about 3k minibatches. Is not multiplied by learning rate, but - is conditional on RMS-value of parameter being > target_rms. - target_rms (float, optional): target root-mean-square value of - parameters, if they fall below this we will stop applying weight decay. - - - .. _Adam\: A Method for Stochastic Optimization: - https://arxiv.org/abs/1412.6980 - .. _Decoupled Weight Decay Regularization: - https://arxiv.org/abs/1711.05101 - .. 
_On the Convergence of Adam and Beyond: - https://openreview.net/forum?id=ryQu7f-RZ - """ - - def __init__( - self, - params, - lr=1e-3, - betas=(0.9, 0.98), - eps=1e-8, - weight_decay=1e-3, - target_rms=0.1, - ): - - if not 0.0 <= lr: - raise ValueError("Invalid learning rate: {}".format(lr)) - if not 0.0 <= eps: - raise ValueError("Invalid epsilon value: {}".format(eps)) - if not 0.0 <= betas[0] < 1.0: - raise ValueError( - "Invalid beta parameter at index 0: {}".format(betas[0]) - ) - if not 0.0 <= betas[1] < 1.0: - raise ValueError( - "Invalid beta parameter at index 1: {}".format(betas[1]) - ) - if not 0 <= weight_decay <= 0.1: - raise ValueError( - "Invalid weight_decay value: {}".format(weight_decay) - ) - if not 0 < target_rms <= 10.0: - raise ValueError("Invalid target_rms value: {}".format(target_rms)) - defaults = dict( - lr=lr, - betas=betas, - eps=eps, - weight_decay=weight_decay, - target_rms=target_rms, - ) - super(Eve, self).__init__(params, defaults) - - def __setstate__(self, state): - super(Eve, self).__setstate__(state) - - @torch.no_grad() - def step(self, closure=None): - """Performs a single optimization step. - - Arguments: - closure (callable, optional): A closure that reevaluates the model - and returns the loss. - """ - loss = None - if closure is not None: - with torch.enable_grad(): - loss = closure() - - for group in self.param_groups: - for p in group["params"]: - if p.grad is None: - continue - - # Perform optimization step - grad = p.grad - if grad.is_sparse: - raise RuntimeError( - "AdamW does not support sparse gradients" - ) - - state = self.state[p] - - # State initialization - if len(state) == 0: - state["step"] = 0 - # Exponential moving average of gradient values - state["exp_avg"] = torch.zeros_like( - p, memory_format=torch.preserve_format - ) - # Exponential moving average of squared gradient values - state["exp_avg_sq"] = torch.zeros_like( - p, memory_format=torch.preserve_format - ) - - exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"] - - beta1, beta2 = group["betas"] - - state["step"] += 1 - bias_correction1 = 1 - beta1 ** state["step"] - bias_correction2 = 1 - beta2 ** state["step"] - - # Decay the first and second moment running average coefficient - exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1) - exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2) - denom = (exp_avg_sq.sqrt() * (bias_correction2 ** -0.5)).add_( - group["eps"] - ) - - step_size = group["lr"] / bias_correction1 - target_rms = group["target_rms"] - weight_decay = group["weight_decay"] - - if p.numel() > 1: - # avoid applying this weight-decay on "scaling factors" - # (which are scalar). - is_above_target_rms = p.norm() > ( - target_rms * (p.numel() ** 0.5) - ) - p.mul_(1 - (weight_decay * is_above_target_rms)) - p.addcdiv_(exp_avg, denom, value=-step_size) - - return loss - - -class LRScheduler(object): - """ - Base-class for learning rate schedulers where the learning-rate depends on both the - batch and the epoch. 
- """ - - def __init__(self, optimizer: Optimizer, verbose: bool = False): - # Attach optimizer - if not isinstance(optimizer, Optimizer): - raise TypeError( - "{} is not an Optimizer".format(type(optimizer).__name__) - ) - self.optimizer = optimizer - self.verbose = verbose - - for group in optimizer.param_groups: - group.setdefault("initial_lr", group["lr"]) - - self.base_lrs = [ - group["initial_lr"] for group in optimizer.param_groups - ] - - self.epoch = 0 - self.batch = 0 - - def state_dict(self): - """Returns the state of the scheduler as a :class:`dict`. - - It contains an entry for every variable in self.__dict__ which - is not the optimizer. - """ - return { - "base_lrs": self.base_lrs, - "epoch": self.epoch, - "batch": self.batch, - } - - def load_state_dict(self, state_dict): - """Loads the schedulers state. - - Args: - state_dict (dict): scheduler state. Should be an object returned - from a call to :meth:`state_dict`. - """ - self.__dict__.update(state_dict) - - def get_last_lr(self) -> List[float]: - """Return last computed learning rate by current scheduler. Will be a list of float.""" - return self._last_lr - - def get_lr(self): - # Compute list of learning rates from self.epoch and self.batch and - # self.base_lrs; this must be overloaded by the user. - # e.g. return [some_formula(self.batch, self.epoch, base_lr) for base_lr in self.base_lrs ] - raise NotImplementedError - - def step_batch(self, batch: Optional[int] = None) -> None: - # Step the batch index, or just set it. If `batch` is specified, it - # must be the batch index from the start of training, i.e. summed over - # all epochs. - # You can call this in any order; if you don't provide 'batch', it should - # of course be called once per batch. - if batch is not None: - self.batch = batch - else: - self.batch = self.batch + 1 - self._set_lrs() - - def step_epoch(self, epoch: Optional[int] = None): - # Step the epoch index, or just set it. If you provide the 'epoch' arg, - # you should call this at the start of the epoch; if you don't provide the 'epoch' - # arg, you should call it at the end of the epoch. - if epoch is not None: - self.epoch = epoch - else: - self.epoch = self.epoch + 1 - self._set_lrs() - - def _set_lrs(self): - values = self.get_lr() - assert len(values) == len(self.optimizer.param_groups) - - for i, data in enumerate(zip(self.optimizer.param_groups, values)): - param_group, lr = data - param_group["lr"] = lr - self.print_lr(self.verbose, i, lr) - self._last_lr = [group["lr"] for group in self.optimizer.param_groups] - - def print_lr(self, is_verbose, group, lr): - """Display the current learning rate.""" - if is_verbose: - print( - f"Epoch={self.epoch}, batch={self.batch}: adjusting learning rate" - f" of group {group} to {lr:.4e}." - ) - - -class Eden(LRScheduler): - """ - Eden scheduler. - lr = initial_lr * (((batch**2 + lr_batches**2) / lr_batches**2) ** -0.25 * - (((epoch**2 + lr_epochs**2) / lr_epochs**2) ** -0.25)) - - E.g. suggest initial-lr = 0.003 (passed to optimizer). - - Args: - optimizer: the optimizer to change the learning rates on - lr_batches: the number of batches after which we start significantly - decreasing the learning rate, suggest 5000. - lr_epochs: the number of epochs after which we start significantly - decreasing the learning rate, suggest 6 if you plan to do e.g. - 20 to 40 epochs, but may need smaller number if dataset is huge - and you will do few epochs. 
- """ - - def __init__( - self, - optimizer: Optimizer, - lr_batches: Union[int, float], - lr_epochs: Union[int, float], - verbose: bool = False, - ): - super(Eden, self).__init__(optimizer, verbose) - self.lr_batches = lr_batches - self.lr_epochs = lr_epochs - - def get_lr(self): - factor = ( - (self.batch ** 2 + self.lr_batches ** 2) / self.lr_batches ** 2 - ) ** -0.25 * ( - ((self.epoch ** 2 + self.lr_epochs ** 2) / self.lr_epochs ** 2) - ** -0.25 - ) - return [x * factor for x in self.base_lrs] - - -def _test_eden(): - m = torch.nn.Linear(100, 100) - optim = Eve(m.parameters(), lr=0.003) - - scheduler = Eden(optim, lr_batches=30, lr_epochs=2, verbose=True) - - for epoch in range(10): - scheduler.step_epoch(epoch) # sets epoch to `epoch` - - for step in range(20): - x = torch.randn(200, 100).detach() - x.requires_grad = True - y = m(x) - dy = torch.randn(200, 100).detach() - f = (y * dy).sum() - f.backward() - - optim.step() - scheduler.step_batch() - optim.zero_grad() - print("last lr = ", scheduler.get_last_lr()) - print("state dict = ", scheduler.state_dict()) - - -if __name__ == "__main__": - _test_eden() diff --git a/egs/librispeech/ASR/conv_emformer_transducer_stateless/optim.py b/egs/librispeech/ASR/conv_emformer_transducer_stateless/optim.py new file mode 120000 index 000000000..e2deb4492 --- /dev/null +++ b/egs/librispeech/ASR/conv_emformer_transducer_stateless/optim.py @@ -0,0 +1 @@ +../pruned_transducer_stateless2/optim.py \ No newline at end of file diff --git a/egs/librispeech/ASR/conv_emformer_transducer_stateless/scaling.py b/egs/librispeech/ASR/conv_emformer_transducer_stateless/scaling.py deleted file mode 100644 index f89d2963e..000000000 --- a/egs/librispeech/ASR/conv_emformer_transducer_stateless/scaling.py +++ /dev/null @@ -1,702 +0,0 @@ -# Copyright 2022 Xiaomi Corp. (authors: Daniel Povey) -# -# See ../../../../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import collections -from itertools import repeat -from typing import Optional, Tuple - -import torch -import torch.nn as nn -from torch import Tensor - - -def _ntuple(n): - def parse(x): - if isinstance(x, collections.Iterable): - return x - return tuple(repeat(x, n)) - - return parse - - -_single = _ntuple(1) -_pair = _ntuple(2) - - -class ActivationBalancerFunction(torch.autograd.Function): - @staticmethod - def forward( - ctx, - x: Tensor, - channel_dim: int, - min_positive: float, # e.g. 0.05 - max_positive: float, # e.g. 0.95 - max_factor: float, # e.g. 0.01 - min_abs: float, # e.g. 0.2 - max_abs: float, # e.g. 
100.0 - ) -> Tensor: - if x.requires_grad: - if channel_dim < 0: - channel_dim += x.ndim - sum_dims = [d for d in range(x.ndim) if d != channel_dim] - xgt0 = x > 0 - proportion_positive = torch.mean( - xgt0.to(x.dtype), dim=sum_dims, keepdim=True - ) - factor1 = ( - (min_positive - proportion_positive).relu() - * (max_factor / min_positive) - if min_positive != 0.0 - else 0.0 - ) - factor2 = ( - (proportion_positive - max_positive).relu() - * (max_factor / (max_positive - 1.0)) - if max_positive != 1.0 - else 0.0 - ) - factor = factor1 + factor2 - if isinstance(factor, float): - factor = torch.zeros_like(proportion_positive) - - mean_abs = torch.mean(x.abs(), dim=sum_dims, keepdim=True) - below_threshold = mean_abs < min_abs - above_threshold = mean_abs > max_abs - - ctx.save_for_backward( - factor, xgt0, below_threshold, above_threshold - ) - ctx.max_factor = max_factor - ctx.sum_dims = sum_dims - return x - - @staticmethod - def backward( - ctx, x_grad: Tensor - ) -> Tuple[Tensor, None, None, None, None, None, None]: - factor, xgt0, below_threshold, above_threshold = ctx.saved_tensors - dtype = x_grad.dtype - scale_factor = ( - (below_threshold.to(dtype) - above_threshold.to(dtype)) - * (xgt0.to(dtype) - 0.5) - * (ctx.max_factor * 2.0) - ) - - neg_delta_grad = x_grad.abs() * (factor + scale_factor) - return x_grad - neg_delta_grad, None, None, None, None, None, None - - -class BasicNorm(torch.nn.Module): - """ - This is intended to be a simpler, and hopefully cheaper, replacement for - LayerNorm. The observation this is based on, is that Transformer-type - networks, especially with pre-norm, sometimes seem to set one of the - feature dimensions to a large constant value (e.g. 50), which "defeats" - the LayerNorm because the output magnitude is then not strongly dependent - on the other (useful) features. Presumably the weight and bias of the - LayerNorm are required to allow it to do this. - - So the idea is to introduce this large constant value as an explicit - parameter, that takes the role of the "eps" in LayerNorm, so the network - doesn't have to do this trick. We make the "eps" learnable. - - Args: - num_channels: the number of channels, e.g. 512. - channel_dim: the axis/dimension corresponding to the channel, - interprted as an offset from the input's ndim if negative. - shis is NOT the num_channels; it should typically be one of - {-2, -1, 0, 1, 2, 3}. - eps: the initial "epsilon" that we add as ballast in: - scale = ((input_vec**2).mean() + epsilon)**-0.5 - Note: our epsilon is actually large, but we keep the name - to indicate the connection with conventional LayerNorm. - learn_eps: if true, we learn epsilon; if false, we keep it - at the initial value. - """ - - def __init__( - self, - num_channels: int, - channel_dim: int = -1, # CAUTION: see documentation. 
- eps: float = 0.25, - learn_eps: bool = True, - ) -> None: - super(BasicNorm, self).__init__() - self.num_channels = num_channels - self.channel_dim = channel_dim - if learn_eps: - self.eps = nn.Parameter(torch.tensor(eps).log().detach()) - else: - self.register_buffer("eps", torch.tensor(eps).log().detach()) - - def forward(self, x: Tensor) -> Tensor: - assert x.shape[self.channel_dim] == self.num_channels - scales = ( - torch.mean(x ** 2, dim=self.channel_dim, keepdim=True) - + self.eps.exp() - ) ** -0.5 - return x * scales - - -class ScaledLinear(nn.Linear): - """ - A modified version of nn.Linear where the parameters are scaled before - use, via: - weight = self.weight * self.weight_scale.exp() - bias = self.bias * self.bias_scale.exp() - - Args: - Accepts the standard args and kwargs that nn.Linear accepts - e.g. in_features, out_features, bias=False. - - initial_scale: you can override this if you want to increase - or decrease the initial magnitude of the module's output - (affects the initialization of weight_scale and bias_scale). - Another option, if you want to do something like this, is - to re-initialize the parameters. - initial_speed: this affects how fast the parameter will - learn near the start of training; you can set it to a - value less than one if you suspect that a module - is contributing to instability near the start of training. - Nnote: regardless of the use of this option, it's best to - use schedulers like Noam that have a warm-up period. - Alternatively you can set it to more than 1 if you want it to - initially train faster. Must be greater than 0. - """ - - def __init__( - self, - *args, - initial_scale: float = 1.0, - initial_speed: float = 1.0, - **kwargs - ): - super(ScaledLinear, self).__init__(*args, **kwargs) - initial_scale = torch.tensor(initial_scale).log() - self.weight_scale = nn.Parameter(initial_scale.clone().detach()) - if self.bias is not None: - self.bias_scale = nn.Parameter(initial_scale.clone().detach()) - else: - self.register_parameter("bias_scale", None) - - self._reset_parameters( - initial_speed - ) # Overrides the reset_parameters in nn.Linear - - def _reset_parameters(self, initial_speed: float): - std = 0.1 / initial_speed - a = (3 ** 0.5) * std - nn.init.uniform_(self.weight, -a, a) - if self.bias is not None: - nn.init.constant_(self.bias, 0.0) - fan_in = self.weight.shape[1] * self.weight[0][0].numel() - scale = fan_in ** -0.5 # 1/sqrt(fan_in) - with torch.no_grad(): - self.weight_scale += torch.tensor(scale / std).log() - - def get_weight(self): - return self.weight * self.weight_scale.exp() - - def get_bias(self): - return None if self.bias is None else self.bias * self.bias_scale.exp() - - def forward(self, input: Tensor) -> Tensor: - return torch.nn.functional.linear( - input, self.get_weight(), self.get_bias() - ) - - -class ScaledConv1d(nn.Conv1d): - # See docs for ScaledLinear - def __init__( - self, - *args, - initial_scale: float = 1.0, - initial_speed: float = 1.0, - **kwargs - ): - super(ScaledConv1d, self).__init__(*args, **kwargs) - initial_scale = torch.tensor(initial_scale).log() - self.weight_scale = nn.Parameter(initial_scale.clone().detach()) - if self.bias is not None: - self.bias_scale = nn.Parameter(initial_scale.clone().detach()) - else: - self.register_parameter("bias_scale", None) - self._reset_parameters( - initial_speed - ) # Overrides the reset_parameters in base class - - def _reset_parameters(self, initial_speed: float): - std = 0.1 / initial_speed - a = (3 ** 0.5) * std - 
nn.init.uniform_(self.weight, -a, a) - if self.bias is not None: - nn.init.constant_(self.bias, 0.0) - fan_in = self.weight.shape[1] * self.weight[0][0].numel() - scale = fan_in ** -0.5 # 1/sqrt(fan_in) - with torch.no_grad(): - self.weight_scale += torch.tensor(scale / std).log() - - def get_weight(self): - return self.weight * self.weight_scale.exp() - - def get_bias(self): - return None if self.bias is None else self.bias * self.bias_scale.exp() - - def forward(self, input: Tensor) -> Tensor: - F = torch.nn.functional - if self.padding_mode != "zeros": - return F.conv1d( - F.pad( - input, - self._reversed_padding_repeated_twice, - mode=self.padding_mode, - ), - self.get_weight(), - self.get_bias(), - self.stride, - _single(0), - self.dilation, - self.groups, - ) - return F.conv1d( - input, - self.get_weight(), - self.get_bias(), - self.stride, - self.padding, - self.dilation, - self.groups, - ) - - -class ScaledConv2d(nn.Conv2d): - # See docs for ScaledLinear - def __init__( - self, - *args, - initial_scale: float = 1.0, - initial_speed: float = 1.0, - **kwargs - ): - super(ScaledConv2d, self).__init__(*args, **kwargs) - initial_scale = torch.tensor(initial_scale).log() - self.weight_scale = nn.Parameter(initial_scale.clone().detach()) - if self.bias is not None: - self.bias_scale = nn.Parameter(initial_scale.clone().detach()) - else: - self.register_parameter("bias_scale", None) - self._reset_parameters( - initial_speed - ) # Overrides the reset_parameters in base class - - def _reset_parameters(self, initial_speed: float): - std = 0.1 / initial_speed - a = (3 ** 0.5) * std - nn.init.uniform_(self.weight, -a, a) - if self.bias is not None: - nn.init.constant_(self.bias, 0.0) - fan_in = self.weight.shape[1] * self.weight[0][0].numel() - scale = fan_in ** -0.5 # 1/sqrt(fan_in) - with torch.no_grad(): - self.weight_scale += torch.tensor(scale / std).log() - - def get_weight(self): - return self.weight * self.weight_scale.exp() - - def get_bias(self): - return None if self.bias is None else self.bias * self.bias_scale.exp() - - def _conv_forward(self, input, weight): - F = torch.nn.functional - if self.padding_mode != "zeros": - return F.conv2d( - F.pad( - input, - self._reversed_padding_repeated_twice, - mode=self.padding_mode, - ), - weight, - self.get_bias(), - self.stride, - _pair(0), - self.dilation, - self.groups, - ) - return F.conv2d( - input, - weight, - self.get_bias(), - self.stride, - self.padding, - self.dilation, - self.groups, - ) - - def forward(self, input: Tensor) -> Tensor: - return self._conv_forward(input, self.get_weight()) - - -class ActivationBalancer(torch.nn.Module): - """ - Modifies the backpropped derivatives of a function to try to encourage, for - each channel, that it is positive at least a proportion `threshold` of the - time. It does this by multiplying negative derivative values by up to - (1+max_factor), and positive derivative values by up to (1-max_factor), - interpolated from 1 at the threshold to those extremal values when none - of the inputs are positive. - - - Args: - channel_dim: the dimension/axis corresponding to the channel, e.g. - -1, 0, 1, 2; will be interpreted as an offset from x.ndim if negative. - min_positive: the minimum, per channel, of the proportion of the time - that (x > 0), below which we start to modify the derivatives. - max_positive: the maximum, per channel, of the proportion of the time - that (x > 0), above which we start to modify the derivatives. 
- max_factor: the maximum factor by which we modify the derivatives for - either the sign constraint or the magnitude constraint; - e.g. with max_factor=0.02, the the derivatives would be multiplied by - values in the range [0.98..1.02]. - min_abs: the minimum average-absolute-value per channel, which - we allow, before we start to modify the derivatives to prevent - this. - max_abs: the maximum average-absolute-value per channel, which - we allow, before we start to modify the derivatives to prevent - this. - """ - - def __init__( - self, - channel_dim: int, - min_positive: float = 0.05, - max_positive: float = 0.95, - max_factor: float = 0.01, - min_abs: float = 0.2, - max_abs: float = 100.0, - ): - super(ActivationBalancer, self).__init__() - self.channel_dim = channel_dim - self.min_positive = min_positive - self.max_positive = max_positive - self.max_factor = max_factor - self.min_abs = min_abs - self.max_abs = max_abs - - def forward(self, x: Tensor) -> Tensor: - return ActivationBalancerFunction.apply( - x, - self.channel_dim, - self.min_positive, - self.max_positive, - self.max_factor, - self.min_abs, - self.max_abs, - ) - - -class DoubleSwishFunction(torch.autograd.Function): - """ - double_swish(x) = x * torch.sigmoid(x-1) - This is a definition, originally motivated by its close numerical - similarity to swish(swish(x)), where swish(x) = x * sigmoid(x). - - Memory-efficient derivative computation: - double_swish(x) = x * s, where s(x) = torch.sigmoid(x-1) - double_swish'(x) = d/dx double_swish(x) = x * s'(x) + x' * s(x) = x * s'(x) + s(x). - Now, s'(x) = s(x) * (1-s(x)). - double_swish'(x) = x * s'(x) + s(x). - = x * s(x) * (1-s(x)) + s(x). - = double_swish(x) * (1-s(x)) + s(x) - ... so we just need to remember s(x) but not x itself. - """ - - @staticmethod - def forward(ctx, x: Tensor) -> Tensor: - x = x.detach() - s = torch.sigmoid(x - 1.0) - y = x * s - ctx.save_for_backward(s, y) - return y - - @staticmethod - def backward(ctx, y_grad: Tensor) -> Tensor: - s, y = ctx.saved_tensors - return (y * (1 - s) + s) * y_grad - - -class DoubleSwish(torch.nn.Module): - def forward(self, x: Tensor) -> Tensor: - """Return double-swish activation function which is an approximation to Swish(Swish(x)), - that we approximate closely with x * sigmoid(x-1). - """ - return DoubleSwishFunction.apply(x) - - -class ScaledEmbedding(nn.Module): - r"""This is a modified version of nn.Embedding that introduces a learnable scale - on the parameters. Note: due to how we initialize it, it's best used with - schedulers like Noam that have a warmup period. - - It is a simple lookup table that stores embeddings of a fixed dictionary and size. - - This module is often used to store word embeddings and retrieve them using indices. - The input to the module is a list of indices, and the output is the corresponding - word embeddings. - - Args: - num_embeddings (int): size of the dictionary of embeddings - embedding_dim (int): the size of each embedding vector - padding_idx (int, optional): If given, pads the output with the embedding vector at :attr:`padding_idx` - (initialized to zeros) whenever it encounters the index. - max_norm (float, optional): If given, each embedding vector with norm larger than :attr:`max_norm` - is renormalized to have norm :attr:`max_norm`. - norm_type (float, optional): The p of the p-norm to compute for the :attr:`max_norm` option. Default ``2``. 
- scale_grad_by_freq (boolean, optional): If given, this will scale gradients by the inverse of frequency of - the words in the mini-batch. Default ``False``. - sparse (bool, optional): If ``True``, gradient w.r.t. :attr:`weight` matrix will be a sparse tensor. - See Notes for more details regarding sparse gradients. - - initial_speed (float, optional): This affects how fast the parameter will - learn near the start of training; you can set it to a value less than - one if you suspect that a module is contributing to instability near - the start of training. Nnote: regardless of the use of this option, - it's best to use schedulers like Noam that have a warm-up period. - Alternatively you can set it to more than 1 if you want it to - initially train faster. Must be greater than 0. - - - Attributes: - weight (Tensor): the learnable weights of the module of shape (num_embeddings, embedding_dim) - initialized from :math:`\mathcal{N}(0, 1)` - - Shape: - - Input: :math:`(*)`, LongTensor of arbitrary shape containing the indices to extract - - Output: :math:`(*, H)`, where `*` is the input shape and :math:`H=\text{embedding\_dim}` - - .. note:: - Keep in mind that only a limited number of optimizers support - sparse gradients: currently it's :class:`optim.SGD` (`CUDA` and `CPU`), - :class:`optim.SparseAdam` (`CUDA` and `CPU`) and :class:`optim.Adagrad` (`CPU`) - - .. note:: - With :attr:`padding_idx` set, the embedding vector at - :attr:`padding_idx` is initialized to all zeros. However, note that this - vector can be modified afterwards, e.g., using a customized - initialization method, and thus changing the vector used to pad the - output. The gradient for this vector from :class:`~torch.nn.Embedding` - is always zero. - - Examples:: - - >>> # an Embedding module containing 10 tensors of size 3 - >>> embedding = nn.Embedding(10, 3) - >>> # a batch of 2 samples of 4 indices each - >>> input = torch.LongTensor([[1,2,4,5],[4,3,2,9]]) - >>> embedding(input) - tensor([[[-0.0251, -1.6902, 0.7172], - [-0.6431, 0.0748, 0.6969], - [ 1.4970, 1.3448, -0.9685], - [-0.3677, -2.7265, -0.1685]], - - [[ 1.4970, 1.3448, -0.9685], - [ 0.4362, -0.4004, 0.9400], - [-0.6431, 0.0748, 0.6969], - [ 0.9124, -2.3616, 1.1151]]]) - - - >>> # example with padding_idx - >>> embedding = nn.Embedding(10, 3, padding_idx=0) - >>> input = torch.LongTensor([[0,2,0,5]]) - >>> embedding(input) - tensor([[[ 0.0000, 0.0000, 0.0000], - [ 0.1535, -2.0309, 0.9315], - [ 0.0000, 0.0000, 0.0000], - [-0.1655, 0.9897, 0.0635]]]) - - """ - __constants__ = [ - "num_embeddings", - "embedding_dim", - "padding_idx", - "scale_grad_by_freq", - "sparse", - ] - - num_embeddings: int - embedding_dim: int - padding_idx: int - scale_grad_by_freq: bool - weight: Tensor - sparse: bool - - def __init__( - self, - num_embeddings: int, - embedding_dim: int, - padding_idx: Optional[int] = None, - scale_grad_by_freq: bool = False, - sparse: bool = False, - initial_speed: float = 1.0, - ) -> None: - super(ScaledEmbedding, self).__init__() - self.num_embeddings = num_embeddings - self.embedding_dim = embedding_dim - if padding_idx is not None: - if padding_idx > 0: - assert ( - padding_idx < self.num_embeddings - ), "Padding_idx must be within num_embeddings" - elif padding_idx < 0: - assert ( - padding_idx >= -self.num_embeddings - ), "Padding_idx must be within num_embeddings" - padding_idx = self.num_embeddings + padding_idx - self.padding_idx = padding_idx - self.scale_grad_by_freq = scale_grad_by_freq - - self.scale = nn.Parameter(torch.zeros(())) # see 
reset_parameters() - self.sparse = sparse - - self.weight = nn.Parameter(torch.Tensor(num_embeddings, embedding_dim)) - self.reset_parameters(initial_speed) - - def reset_parameters(self, initial_speed: float = 1.0) -> None: - std = 0.1 / initial_speed - nn.init.normal_(self.weight, std=std) - nn.init.constant_(self.scale, torch.tensor(1.0 / std).log()) - - if self.padding_idx is not None: - with torch.no_grad(): - self.weight[self.padding_idx].fill_(0) - - def forward(self, input: Tensor) -> Tensor: - F = torch.nn.functional - scale = self.scale.exp() - if input.numel() < self.num_embeddings: - return ( - F.embedding( - input, - self.weight, - self.padding_idx, - None, - 2.0, # None, 2.0 relate to normalization - self.scale_grad_by_freq, - self.sparse, - ) - * scale - ) - else: - return F.embedding( - input, - self.weight * scale, - self.padding_idx, - None, - 2.0, # None, 2.0 relates to normalization - self.scale_grad_by_freq, - self.sparse, - ) - - def extra_repr(self) -> str: - s = "{num_embeddings}, {embedding_dim}, scale={scale}" - if self.padding_idx is not None: - s += ", padding_idx={padding_idx}" - if self.scale_grad_by_freq is not False: - s += ", scale_grad_by_freq={scale_grad_by_freq}" - if self.sparse is not False: - s += ", sparse=True" - return s.format(**self.__dict__) - - -def _test_activation_balancer_sign(): - probs = torch.arange(0, 1, 0.01) - N = 1000 - x = 1.0 * (torch.rand(probs.numel(), N) < probs.unsqueeze(-1)) - x = x.detach() - x.requires_grad = True - m = ActivationBalancer( - channel_dim=0, - min_positive=0.05, - max_positive=0.95, - max_factor=0.2, - min_abs=0.0, - ) - - y_grad = torch.sign(torch.randn(probs.numel(), N)) - - y = m(x) - y.backward(gradient=y_grad) - print("_test_activation_balancer_sign: x = ", x) - print("_test_activation_balancer_sign: y grad = ", y_grad) - print("_test_activation_balancer_sign: x grad = ", x.grad) - - -def _test_activation_balancer_magnitude(): - magnitudes = torch.arange(0, 1, 0.01) - N = 1000 - x = torch.sign(torch.randn(magnitudes.numel(), N)) * magnitudes.unsqueeze( - -1 - ) - x = x.detach() - x.requires_grad = True - m = ActivationBalancer( - channel_dim=0, - min_positive=0.0, - max_positive=1.0, - max_factor=0.2, - min_abs=0.2, - max_abs=0.8, - ) - - y_grad = torch.sign(torch.randn(magnitudes.numel(), N)) - - y = m(x) - y.backward(gradient=y_grad) - print("_test_activation_balancer_magnitude: x = ", x) - print("_test_activation_balancer_magnitude: y grad = ", y_grad) - print("_test_activation_balancer_magnitude: x grad = ", x.grad) - - -def _test_basic_norm(): - num_channels = 128 - m = BasicNorm(num_channels=num_channels, channel_dim=1) - - x = torch.randn(500, num_channels) - - y = m(x) - - assert y.shape == x.shape - x_rms = (x ** 2).mean().sqrt() - y_rms = (y ** 2).mean().sqrt() - print("x rms = ", x_rms) - print("y rms = ", y_rms) - assert y_rms < x_rms - assert y_rms > 0.5 * x_rms - - -def _test_double_swish_deriv(): - x = torch.randn(10, 12, dtype=torch.double) * 0.5 - x.requires_grad = True - m = DoubleSwish() - torch.autograd.gradcheck(m, x) - - -if __name__ == "__main__": - _test_activation_balancer_sign() - _test_activation_balancer_magnitude() - _test_basic_norm() - _test_double_swish_deriv() diff --git a/egs/librispeech/ASR/conv_emformer_transducer_stateless/scaling.py b/egs/librispeech/ASR/conv_emformer_transducer_stateless/scaling.py new file mode 120000 index 000000000..09d802cc4 --- /dev/null +++ b/egs/librispeech/ASR/conv_emformer_transducer_stateless/scaling.py @@ -0,0 +1 @@ 
+../pruned_transducer_stateless2/scaling.py \ No newline at end of file diff --git a/egs/librispeech/ASR/conv_emformer_transducer_stateless/test_emformer.py b/egs/librispeech/ASR/conv_emformer_transducer_stateless/test_emformer.py index 7d55d8e1e..a7c43d11d 100644 --- a/egs/librispeech/ASR/conv_emformer_transducer_stateless/test_emformer.py +++ b/egs/librispeech/ASR/conv_emformer_transducer_stateless/test_emformer.py @@ -570,6 +570,63 @@ def test_state_stack_unstack(): assert torch.allclose(s, s2), f"{s.sum()}, {s2.sum()}" +def test_torchscript_consistency_infer(): + r"""Verify that scripting Emformer does not change the behavior of method `infer`.""" + from emformer import Emformer + + num_features = 80 + chunk_length = 32 + encoder_dim = 512 + num_encoder_layers = 2 + kernel_size = 31 + left_context_length = 32 + right_context_length = 8 + memory_size = 32 + batch_size = 2 + + model = Emformer( + num_features=num_features, + chunk_length=chunk_length, + subsampling_factor=4, + d_model=encoder_dim, + num_encoder_layers=num_encoder_layers, + cnn_module_kernel=kernel_size, + left_context_length=left_context_length, + right_context_length=right_context_length, + memory_size=memory_size, + ).eval() + attn_caches = [ + [ + torch.zeros(memory_size, batch_size, encoder_dim), + torch.zeros(left_context_length // 4, batch_size, encoder_dim), + torch.zeros( + left_context_length // 4, + batch_size, + encoder_dim, + ), + ] + for _ in range(num_encoder_layers) + ] + conv_caches = [ + torch.zeros(batch_size, encoder_dim, kernel_size - 1) + for _ in range(num_encoder_layers) + ] + states = [attn_caches, conv_caches] + x = torch.randn(batch_size, 23, num_features) + x_lens = torch.full((batch_size,), 23) + num_processed_frames = torch.full((batch_size,), 0) + y, y_lens, out_states = model.infer( + x, x_lens, num_processed_frames=num_processed_frames, states=states + ) + + sc_model = torch.jit.script(model).eval() + sc_y, sc_y_lens, sc_out_states = sc_model.infer( + x, x_lens, num_processed_frames=num_processed_frames, states=states + ) + + assert torch.allclose(y, sc_y) + + if __name__ == "__main__": # test_emformer_attention_forward() # test_emformer_attention_infer() @@ -582,4 +639,5 @@ if __name__ == "__main__": # test_emformer_encoder_forward_infer_consistency() # test_emformer_forward() # test_emformer_infer() - test_state_stack_unstack() + # test_state_stack_unstack() + test_torchscript_consistency_infer() diff --git a/egs/librispeech/ASR/pruned_transducer_stateless2/beam_search.py b/egs/librispeech/ASR/pruned_transducer_stateless2/beam_search.py index ce8b04afd..7c936b257 100644 --- a/egs/librispeech/ASR/pruned_transducer_stateless2/beam_search.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless2/beam_search.py @@ -557,7 +557,7 @@ class HypothesisList(object): return ", ".join(s) -def _get_hyps_shape(hyps: List[HypothesisList]) -> k2.RaggedShape: +def get_hyps_shape(hyps: List[HypothesisList]) -> k2.RaggedShape: """Return a ragged shape with axes [utt][num_hyps]. Args: @@ -648,7 +648,7 @@ def modified_beam_search( finalized_B = B[batch_size:] + finalized_B B = B[:batch_size] - hyps_shape = _get_hyps_shape(B).to(device) + hyps_shape = get_hyps_shape(B).to(device) A = [list(b) for b in B] B = [HypothesisList() for _ in range(batch_size)]
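
Aside (not part of the patch above): the new `test_torchscript_consistency_infer` follows the usual pattern for guarding TorchScript compatibility -- build the module, run it in eager mode, script it with `torch.jit.script`, and assert that both paths produce the same outputs on identical inputs. Below is a minimal, self-contained sketch of that pattern on a toy module; the module and names here are illustrative stand-ins, not taken from icefall.

import torch
import torch.nn as nn


class ToyEncoder(nn.Module):
    """A stand-in module; the real test scripts the Emformer encoder."""

    def __init__(self, num_features: int = 80, d_model: int = 512):
        super().__init__()
        self.proj = nn.Linear(num_features, d_model)

    def forward(self, x: torch.Tensor, x_lens: torch.Tensor):
        # Project the features; lengths are returned unchanged.
        return torch.relu(self.proj(x)), x_lens


def main():
    torch.manual_seed(0)
    model = ToyEncoder().eval()

    x = torch.randn(2, 23, 80)
    x_lens = torch.full((2,), 23)

    # Eager-mode reference outputs.
    y, y_lens = model(x, x_lens)

    # The scripted model must behave identically on the same inputs.
    sc_model = torch.jit.script(model).eval()
    sc_y, sc_y_lens = sc_model(x, x_lens)

    assert torch.allclose(y, sc_y)
    assert torch.equal(y_lens, sc_y_lens)
    print("eager and scripted outputs match")


if __name__ == "__main__":
    main()

The same `torch.allclose` comparison is what the new test applies to `Emformer.infer`, and the updated `if __name__ == "__main__"` block in test_emformer.py simply switches which consistency check runs by default.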