clean commit for SURT recipe

2023-06-11 16:32:29 -04:00 · 2023-06-11 16:32:29 -04:00 · 42daafee4e
commit 42daafee4e
parent 674c9af713
25 changed files with 267 additions and 12646 deletions
--- a/egs/libricss/SURT/dprnn_pruned_transducer_stateless7/asr_datamodule.py
+++ b/egs/libricss/SURT/dprnn_pruned_transducer_stateless7/asr_datamodule.py
@ -1,372 +0,0 @@
-# Copyright      2021  Piotr Żelasko
-# Copyright      2022  Xiaomi Corporation     (Author: Mingshuang Luo)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import argparse
-import inspect
-import logging
-from functools import lru_cache
-from pathlib import Path
-from typing import Any, Callable, Dict, List, Optional
-
-import torch
-from lhotse import CutSet, Fbank, FbankConfig, load_manifest, load_manifest_lazy
-from lhotse.dataset import (  # noqa F401 for PrecomputedFeatures
-    CutMix,
-    DynamicBucketingSampler,
-    K2SurtDataset,
-    PrecomputedFeatures,
-    SimpleCutSampler,
-    SpecAugment,
-)
-from lhotse.dataset.input_strategies import OnTheFlyFeatures
-from lhotse.utils import fix_random_seed
-from torch.utils.data import DataLoader
-
-from icefall.utils import str2bool
-
-
-class _SeedWorkers:
-    def __init__(self, seed: int):
-        self.seed = seed
-
-    def __call__(self, worker_id: int):
-        fix_random_seed(self.seed + worker_id)
-
-
-class LibrimixAsrDataModule:
-    """
-    DataModule for k2 ASR experiments.
-    It assumes there is always one train and valid dataloader,
-    but there can be multiple test dataloaders (e.g. LibriSpeech test-clean
-    and test-other).
-
-    It contains all the common data pipeline modules used in ASR
-    experiments, e.g.:
-    - dynamic batch size,
-    - bucketing samplers,
-    - augmentation,
-    - on-the-fly feature extraction
-
-    This class should be derived for specific corpora used in ASR tasks.
-    """
-
-    def __init__(self, args: argparse.Namespace):
-        self.args = args
-
-    @classmethod
-    def add_arguments(cls, parser: argparse.ArgumentParser):
-        group = parser.add_argument_group(
-            title="ASR data related options",
-            description="These options are used for the preparation of "
-            "PyTorch DataLoaders from Lhotse CutSet's -- they control the "
-            "effective batch sizes, sampling strategies, applied data "
-            "augmentations, etc.",
-        )
-        group.add_argument(
-            "--manifest-dir",
-            type=Path,
-            default=Path("data/manifests"),
-            help="Path to directory with train/valid/test cuts.",
-        )
-        group.add_argument(
-            "--max-duration",
-            type=int,
-            default=200.0,
-            help="Maximum pooled recordings duration (seconds) in a "
-            "single batch. You can reduce it if it causes CUDA OOM.",
-        )
-        group.add_argument(
-            "--max-cuts",
-            type=int,
-            default=100,
-            help="Maximum number of cuts in a single batch. You can "
-            "reduce it if it causes CUDA OOM.",
-        )
-        group.add_argument(
-            "--bucketing-sampler",
-            type=str2bool,
-            default=True,
-            help="When enabled, the batches will come from buckets of "
-            "similar duration (saves padding frames).",
-        )
-        group.add_argument(
-            "--num-buckets",
-            type=int,
-            default=30,
-            help="The number of buckets for the DynamicBucketingSampler"
-            "(you might want to increase it for larger datasets).",
-        )
-        group.add_argument(
-            "--on-the-fly-feats",
-            type=str2bool,
-            default=False,
-            help=(
-                "When enabled, use on-the-fly cut mixing and feature "
-                "extraction. Will drop existing precomputed feature manifests "
-                "if available."
-            ),
-        )
-        group.add_argument(
-            "--shuffle",
-            type=str2bool,
-            default=True,
-            help="When enabled (=default), the examples will be "
-            "shuffled for each epoch.",
-        )
-        group.add_argument(
-            "--drop-last",
-            type=str2bool,
-            default=True,
-            help="Whether to drop last batch. Used by sampler.",
-        )
-        group.add_argument(
-            "--return-cuts",
-            type=str2bool,
-            default=True,
-            help="When enabled, each batch will have the "
-            "field: batch['supervisions']['cut'] with the cuts that "
-            "were used to construct it.",
-        )
-
-        group.add_argument(
-            "--num-workers",
-            type=int,
-            default=2,
-            help="The number of training dataloader workers that "
-            "collect the batches.",
-        )
-
-        group.add_argument(
-            "--enable-spec-aug",
-            type=str2bool,
-            default=True,
-            help="When enabled, use SpecAugment for training dataset.",
-        )
-
-        group.add_argument(
-            "--spec-aug-time-warp-factor",
-            type=int,
-            default=80,
-            help="Used only when --enable-spec-aug is True. "
-            "It specifies the factor for time warping in SpecAugment. "
-            "Larger values mean more warping. "
-            "A value less than 1 means to disable time warp.",
-        )
-
-        group.add_argument(
-            "--enable-musan",
-            type=str2bool,
-            default=True,
-            help="When enabled, select noise from MUSAN and mix it"
-            "with training dataset. ",
-        )
-
-    def train_dataloaders(
-        self,
-        cuts_train: CutSet,
-        sampler_state_dict: Optional[Dict[str, Any]] = None,
-    ) -> DataLoader:
-        """
-        Args:
-          cuts_train:
-            CutSet for training.
-          sampler_state_dict:
-            The state dict for the training sampler.
-        """
-        transforms = []
-        if self.args.enable_musan:
-            logging.info("Enable MUSAN")
-            logging.info("About to get Musan cuts")
-            cuts_musan = load_manifest(self.args.manifest_dir / "musan_cuts.jsonl.gz")
-            transforms.append(
-                CutMix(cuts=cuts_musan, prob=0.5, snr=(10, 20), preserve_id=True)
-            )
-        else:
-            logging.info("Disable MUSAN")
-
-        input_transforms = []
-        if self.args.enable_spec_aug:
-            logging.info("Enable SpecAugment")
-            logging.info(f"Time warp factor: {self.args.spec_aug_time_warp_factor}")
-            # Set the value of num_frame_masks according to Lhotse's version.
-            # In different Lhotse's versions, the default of num_frame_masks is
-            # different.
-            num_frame_masks = 10
-            num_frame_masks_parameter = inspect.signature(
-                SpecAugment.__init__
-            ).parameters["num_frame_masks"]
-            if num_frame_masks_parameter.default == 1:
-                num_frame_masks = 2
-            logging.info(f"Num frame mask: {num_frame_masks}")
-            input_transforms.append(
-                SpecAugment(
-                    time_warp_factor=self.args.spec_aug_time_warp_factor,
-                    num_frame_masks=num_frame_masks,
-                    features_mask_size=27,
-                    num_feature_masks=2,
-                    frames_mask_size=100,
-                )
-            )
-        else:
-            logging.info("Disable SpecAugment")
-
-        logging.info("About to create train dataset")
-        train = K2SurtDataset(
-            input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80)))
-            if self.args.on_the_fly_feats
-            else PrecomputedFeatures(),
-            cut_transforms=transforms,
-            input_transforms=input_transforms,
-            return_cuts=self.args.return_cuts,
-        )
-
-        if self.args.bucketing_sampler:
-            logging.info("Using DynamicBucketingSampler.")
-            train_sampler = DynamicBucketingSampler(
-                cuts_train,
-                max_duration=self.args.max_duration,
-                quadratic_duration=30.0,
-                max_cuts=self.args.max_cuts,
-                shuffle=self.args.shuffle,
-                num_buckets=self.args.num_buckets,
-                drop_last=self.args.drop_last,
-            )
-        else:
-            logging.info("Using SingleCutSampler.")
-            train_sampler = SimpleCutSampler(
-                cuts_train,
-                max_duration=self.args.max_duration,
-                max_cuts=self.args.max_cuts,
-                shuffle=self.args.shuffle,
-            )
-        logging.info("About to create train dataloader")
-
-        if sampler_state_dict is not None:
-            logging.info("Loading sampler state dict")
-            train_sampler.load_state_dict(sampler_state_dict)
-
-        # 'seed' is derived from the current random state, which will have
-        # previously been set in the main process.
-        seed = torch.randint(0, 100000, ()).item()
-        worker_init_fn = _SeedWorkers(seed)
-
-        train_dl = DataLoader(
-            train,
-            sampler=train_sampler,
-            batch_size=None,
-            num_workers=self.args.num_workers,
-            persistent_workers=False,
-            worker_init_fn=worker_init_fn,
-        )
-
-        return train_dl
-
-    def valid_dataloaders(self, cuts_valid: CutSet) -> DataLoader:
-        transforms = []
-
-        logging.info("About to create dev dataset")
-        validate = K2SurtDataset(
-            input_strategy=OnTheFlyFeatures(
-                OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80)))
-            )
-            if self.args.on_the_fly_feats
-            else PrecomputedFeatures(),
-            cut_transforms=transforms,
-            return_cuts=self.args.return_cuts,
-        )
-        valid_sampler = DynamicBucketingSampler(
-            cuts_valid,
-            max_duration=self.args.max_duration,
-            max_cuts=self.args.max_cuts,
-            shuffle=False,
-        )
-        logging.info("About to create dev dataloader")
-        valid_dl = DataLoader(
-            validate,
-            sampler=valid_sampler,
-            batch_size=None,
-            num_workers=2,
-            persistent_workers=False,
-        )
-
-        return valid_dl
-
-    def test_dataloaders(self, cuts: CutSet) -> DataLoader:
-        logging.debug("About to create test dataset")
-        test = K2SurtDataset(
-            input_strategy=OnTheFlyFeatures(
-                OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80)))
-            )
-            if self.args.on_the_fly_feats
-            else PrecomputedFeatures(),
-            return_cuts=self.args.return_cuts,
-        )
-        sampler = DynamicBucketingSampler(
-            cuts,
-            max_duration=self.args.max_duration,
-            max_cuts=self.args.max_cuts,
-            shuffle=False,
-        )
-        logging.debug("About to create test dataloader")
-        test_dl = DataLoader(
-            test,
-            batch_size=None,
-            sampler=sampler,
-            num_workers=self.args.num_workers,
-        )
-        return test_dl
-
-    @lru_cache()
-    def train_cuts(self, reverberated: bool = False) -> CutSet:
-        logging.info("About to get train cuts")
-        rvb_affix = "_rvb" if reverberated else "_norvb"
-        cs = load_manifest_lazy(
-            self.args.manifest_dir / f"cuts_train{rvb_affix}_v1.jsonl.gz"
-        )
-        # Trim to supervision groups
-        cs = cs.trim_to_supervision_groups(max_pause=1.0)
-        cs = cs.filter(lambda c: c.duration >= 1.0 and c.duration <= 30.0)
-        return cs
-
-    @lru_cache()
-    def dev_cuts(self, reverberated: bool = False) -> CutSet:
-        logging.info("About to get dev cuts")
-        rvb_affix = "_rvb" if reverberated else "_norvb"
-        cs = load_manifest_lazy(
-            self.args.manifest_dir / f"cuts_dev{rvb_affix}_v1.jsonl.gz"
-        )
-        cs = cs.filter(lambda c: c.duration >= 0.1)
-        return cs
-
-    @lru_cache()
-    def train_cuts_2spk(self, reverberated: bool = False) -> CutSet:
-        logging.info("About to get 2-spk train cuts")
-        rvb_affix = "_rvb" if reverberated else "_norvb"
-        cs = load_manifest_lazy(
-            self.args.manifest_dir / f"cuts_train_2spk{rvb_affix}.jsonl.gz"
-        )
-        cs = cs.filter(lambda c: c.duration >= 1.0 and c.duration <= 30.0)
-        return cs
-
-    @lru_cache()
-    def libricss_cuts(self, split="dev", type="sdm") -> CutSet:
-        logging.info(f"About to get LibriCSS {split} {type} cuts")
-        cs = load_manifest_lazy(
-            self.args.manifest_dir / f"cuts_{split}_libricss-{type}.jsonl.gz"
-        )
-        return cs
--- a/egs/libricss/SURT/dprnn_pruned_transducer_stateless7/beam_search.py
+++ b/egs/libricss/SURT/dprnn_pruned_transducer_stateless7/beam_search.py
@ -1,885 +0,0 @@
-# Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang
-#                                                  Xiaoyu Yang)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import warnings
-from dataclasses import dataclass, field
-from typing import Dict, List, Optional, Tuple, Union
-
-import k2
-import sentencepiece as spm
-import torch
-from model import SURT
-
-from icefall import NgramLm, NgramLmStateCost
-from icefall.decode import Nbest, one_best_decoding
-from icefall.lm_wrapper import LmScorer
-from icefall.utils import (
-    DecodingResults,
-    add_eos,
-    add_sos,
-    get_texts,
-    get_texts_with_timestamp,
-)
-
-
-def fast_beam_search_one_best(
-    model: SURT,
-    decoding_graph: k2.Fsa,
-    encoder_out: torch.Tensor,
-    encoder_out_lens: torch.Tensor,
-    beam: float,
-    max_states: int,
-    max_contexts: int,
-    temperature: float = 1.0,
-    return_timestamps: bool = False,
-) -> Union[List[List[int]], DecodingResults]:
-    """It limits the maximum number of symbols per frame to 1.
-
-    A lattice is first obtained using fast beam search, and then
-    the shortest path within the lattice is used as the final output.
-
-    Args:
-      model:
-        An instance of `SURT`.
-      decoding_graph:
-        Decoding graph used for decoding, may be a TrivialGraph or a LG.
-      encoder_out:
-        A tensor of shape (N, T, C) from the encoder.
-      encoder_out_lens:
-        A tensor of shape (N,) containing the number of frames in `encoder_out`
-        before padding.
-      beam:
-        Beam value, similar to the beam used in Kaldi..
-      max_states:
-        Max states per stream per frame.
-      max_contexts:
-        Max contexts pre stream per frame.
-      temperature:
-        Softmax temperature.
-      return_timestamps:
-        Whether to return timestamps.
-    Returns:
-      If return_timestamps is False, return the decoded result.
-      Else, return a DecodingResults object containing
-      decoded result and corresponding timestamps.
-    """
-    lattice = fast_beam_search(
-        model=model,
-        decoding_graph=decoding_graph,
-        encoder_out=encoder_out,
-        encoder_out_lens=encoder_out_lens,
-        beam=beam,
-        max_states=max_states,
-        max_contexts=max_contexts,
-        temperature=temperature,
-    )
-
-    best_path = one_best_decoding(lattice)
-
-    if not return_timestamps:
-        return get_texts(best_path)
-    else:
-        return get_texts_with_timestamp(best_path)
-
-
-def fast_beam_search(
-    model: SURT,
-    decoding_graph: k2.Fsa,
-    encoder_out: torch.Tensor,
-    encoder_out_lens: torch.Tensor,
-    beam: float,
-    max_states: int,
-    max_contexts: int,
-    temperature: float = 1.0,
-) -> k2.Fsa:
-    """It limits the maximum number of symbols per frame to 1.
-
-    Args:
-      model:
-        An instance of `SURT`.
-      decoding_graph:
-        Decoding graph used for decoding, may be a TrivialGraph or a LG.
-      encoder_out:
-        A tensor of shape (N, T, C) from the encoder.
-      encoder_out_lens:
-        A tensor of shape (N,) containing the number of frames in `encoder_out`
-        before padding.
-      beam:
-        Beam value, similar to the beam used in Kaldi..
-      max_states:
-        Max states per stream per frame.
-      max_contexts:
-        Max contexts pre stream per frame.
-      temperature:
-        Softmax temperature.
-    Returns:
-      Return an FsaVec with axes [utt][state][arc] containing the decoded
-      lattice. Note: When the input graph is a TrivialGraph, the returned
-      lattice is actually an acceptor.
-    """
-    assert encoder_out.ndim == 3
-
-    context_size = model.decoder.context_size
-    vocab_size = model.decoder.vocab_size
-
-    B, T, C = encoder_out.shape
-
-    config = k2.RnntDecodingConfig(
-        vocab_size=vocab_size,
-        decoder_history_len=context_size,
-        beam=beam,
-        max_contexts=max_contexts,
-        max_states=max_states,
-    )
-    individual_streams = []
-    for i in range(B):
-        individual_streams.append(k2.RnntDecodingStream(decoding_graph))
-    decoding_streams = k2.RnntDecodingStreams(individual_streams, config)
-
-    encoder_out = model.joiner.encoder_proj(encoder_out)
-
-    for t in range(T):
-        # shape is a RaggedShape of shape (B, context)
-        # contexts is a Tensor of shape (shape.NumElements(), context_size)
-        shape, contexts = decoding_streams.get_contexts()
-        # `nn.Embedding()` in torch below v1.7.1 supports only torch.int64
-        contexts = contexts.to(torch.int64)
-        # decoder_out is of shape (shape.NumElements(), 1, decoder_out_dim)
-        decoder_out = model.decoder(contexts, need_pad=False)
-        decoder_out = model.joiner.decoder_proj(decoder_out)
-        # current_encoder_out is of shape
-        # (shape.NumElements(), 1, joiner_dim)
-        # fmt: off
-        current_encoder_out = torch.index_select(
-            encoder_out[:, t:t + 1, :], 0, shape.row_ids(1).to(torch.int64)
-        )
-        # fmt: on
-        logits = model.joiner(
-            current_encoder_out.unsqueeze(2),
-            decoder_out.unsqueeze(1),
-            project_input=False,
-        )
-        logits = logits.squeeze(1).squeeze(1)
-        log_probs = (logits / temperature).log_softmax(dim=-1)
-        decoding_streams.advance(log_probs)
-    decoding_streams.terminate_and_flush_to_streams()
-    lattice = decoding_streams.format_output(encoder_out_lens.tolist())
-
-    return lattice
-
-
-def greedy_search(
-    model: SURT,
-    encoder_out: torch.Tensor,
-    max_sym_per_frame: int,
-    return_timestamps: bool = False,
-) -> Union[List[int], DecodingResults]:
-    """Greedy search for a single utterance.
-    Args:
-      model:
-        An instance of `SURT`.
-      encoder_out:
-        A tensor of shape (N, T, C) from the encoder. Support only N==1 for now.
-      max_sym_per_frame:
-        Maximum number of symbols per frame. If it is set to 0, the WER
-        would be 100%.
-      return_timestamps:
-        Whether to return timestamps.
-    Returns:
-      If return_timestamps is False, return the decoded result.
-      Else, return a DecodingResults object containing
-      decoded result and corresponding timestamps.
-    """
-    assert encoder_out.ndim == 4
-
-    # support only batch_size == 1 for now
-    assert encoder_out.size(0) == 1, encoder_out.size(0)
-
-    blank_id = model.decoder.blank_id
-    context_size = model.decoder.context_size
-    unk_id = getattr(model, "unk_id", blank_id)
-
-    device = next(model.parameters()).device
-
-    decoder_input = torch.tensor(
-        [-1] * (context_size - 1) + [blank_id], device=device, dtype=torch.int64
-    ).reshape(1, context_size)
-
-    decoder_out = model.decoder(decoder_input, need_pad=False)
-    decoder_out = model.joiner.decoder_proj(decoder_out)
-
-    encoder_out = model.joiner.encoder_proj(encoder_out)
-
-    T = encoder_out.size(1)
-    t = 0
-    hyp = [blank_id] * context_size
-
-    # timestamp[i] is the frame index after subsampling
-    # on which hyp[i] is decoded
-    timestamp = []
-
-    # Maximum symbols per utterance.
-    max_sym_per_utt = 1000
-
-    # symbols per frame
-    sym_per_frame = 0
-
-    # symbols per utterance decoded so far
-    sym_per_utt = 0
-
-    while t < T and sym_per_utt < max_sym_per_utt:
-        if sym_per_frame >= max_sym_per_frame:
-            sym_per_frame = 0
-            t += 1
-            continue
-
-        # fmt: off
-        current_encoder_out = encoder_out[:, t:t+1, :].unsqueeze(2)
-        # fmt: on
-        logits = model.joiner(
-            current_encoder_out, decoder_out.unsqueeze(1), project_input=False
-        )
-        # logits is (1, 1, 1, vocab_size)
-
-        y = logits.argmax().item()
-        if y not in (blank_id, unk_id):
-            hyp.append(y)
-            timestamp.append(t)
-            decoder_input = torch.tensor([hyp[-context_size:]], device=device).reshape(
-                1, context_size
-            )
-
-            decoder_out = model.decoder(decoder_input, need_pad=False)
-            decoder_out = model.joiner.decoder_proj(decoder_out)
-
-            sym_per_utt += 1
-            sym_per_frame += 1
-        else:
-            sym_per_frame = 0
-            t += 1
-    hyp = hyp[context_size:]  # remove blanks
-
-    if not return_timestamps:
-        return hyp
-    else:
-        return DecodingResults(
-            hyps=[hyp],
-            timestamps=[timestamp],
-        )
-
-
-def greedy_search_batch(
-    model: SURT,
-    encoder_out: torch.Tensor,
-    encoder_out_lens: torch.Tensor,
-    return_timestamps: bool = False,
-) -> Union[List[List[int]], DecodingResults]:
-    """Greedy search in batch mode. It hardcodes --max-sym-per-frame=1.
-    Args:
-      model:
-        The SURT model.
-      encoder_out:
-        Output from the encoder. Its shape is (N, T, C), where N >= 1.
-      encoder_out_lens:
-        A 1-D tensor of shape (N,), containing number of valid frames in
-        encoder_out before padding.
-      return_timestamps:
-        Whether to return timestamps.
-    Returns:
-      If return_timestamps is False, return the decoded result.
-      Else, return a DecodingResults object containing
-      decoded result and corresponding timestamps.
-    """
-    assert encoder_out.ndim == 3
-    assert encoder_out.size(0) >= 1, encoder_out.size(0)
-
-    packed_encoder_out = torch.nn.utils.rnn.pack_padded_sequence(
-        input=encoder_out,
-        lengths=encoder_out_lens.cpu(),
-        batch_first=True,
-        enforce_sorted=False,
-    )
-
-    device = next(model.parameters()).device
-
-    blank_id = model.decoder.blank_id
-    unk_id = getattr(model, "unk_id", blank_id)
-    context_size = model.decoder.context_size
-
-    batch_size_list = packed_encoder_out.batch_sizes.tolist()
-    N = encoder_out.size(0)
-    assert torch.all(encoder_out_lens > 0), encoder_out_lens
-    assert N == batch_size_list[0], (N, batch_size_list)
-
-    hyps = [[-1] * (context_size - 1) + [blank_id] for _ in range(N)]
-
-    # timestamp[n][i] is the frame index after subsampling
-    # on which hyp[n][i] is decoded
-    timestamps = [[] for _ in range(N)]
-
-    decoder_input = torch.tensor(
-        hyps,
-        device=device,
-        dtype=torch.int64,
-    )  # (N, context_size)
-
-    decoder_out = model.decoder(decoder_input, need_pad=False)
-    decoder_out = model.joiner.decoder_proj(decoder_out)
-    # decoder_out: (N, 1, decoder_out_dim)
-
-    encoder_out = model.joiner.encoder_proj(packed_encoder_out.data)
-
-    offset = 0
-    for (t, batch_size) in enumerate(batch_size_list):
-        start = offset
-        end = offset + batch_size
-        current_encoder_out = encoder_out.data[start:end]
-        current_encoder_out = current_encoder_out.unsqueeze(1).unsqueeze(1)
-        # current_encoder_out's shape: (batch_size, 1, 1, encoder_out_dim)
-        offset = end
-
-        decoder_out = decoder_out[:batch_size]
-
-        logits = model.joiner(
-            current_encoder_out, decoder_out.unsqueeze(1), project_input=False
-        )
-        # logits'shape (batch_size, 1, 1, vocab_size)
-
-        logits = logits.squeeze(1).squeeze(1)  # (batch_size, vocab_size)
-        assert logits.ndim == 2, logits.shape
-        y = logits.argmax(dim=1).tolist()
-        emitted = False
-        for i, v in enumerate(y):
-            if v not in (blank_id, unk_id):
-                hyps[i].append(v)
-                timestamps[i].append(t)
-                emitted = True
-        if emitted:
-            # update decoder output
-            decoder_input = [h[-context_size:] for h in hyps[:batch_size]]
-            decoder_input = torch.tensor(
-                decoder_input,
-                device=device,
-                dtype=torch.int64,
-            )
-            decoder_out = model.decoder(decoder_input, need_pad=False)
-            decoder_out = model.joiner.decoder_proj(decoder_out)
-
-    sorted_ans = [h[context_size:] for h in hyps]
-    ans = []
-    ans_timestamps = []
-    unsorted_indices = packed_encoder_out.unsorted_indices.tolist()
-    for i in range(N):
-        ans.append(sorted_ans[unsorted_indices[i]])
-        ans_timestamps.append(timestamps[unsorted_indices[i]])
-
-    if not return_timestamps:
-        return ans
-    else:
-        return DecodingResults(
-            hyps=ans,
-            timestamps=ans_timestamps,
-        )
-
-
-def modified_beam_search(
-    model: SURT,
-    encoder_out: torch.Tensor,
-    encoder_out_lens: torch.Tensor,
-    beam: int = 4,
-    temperature: float = 1.0,
-    return_timestamps: bool = False,
-) -> Union[List[List[int]], DecodingResults]:
-    """Beam search in batch mode with --max-sym-per-frame=1 being hardcoded.
-
-    Args:
-      model:
-        The SURT model.
-      encoder_out:
-        Output from the encoder. Its shape is (N, T, C).
-      encoder_out_lens:
-        A 1-D tensor of shape (N,), containing number of valid frames in
-        encoder_out before padding.
-      beam:
-        Number of active paths during the beam search.
-      temperature:
-        Softmax temperature.
-      return_timestamps:
-        Whether to return timestamps.
-    Returns:
-      If return_timestamps is False, return the decoded result.
-      Else, return a DecodingResults object containing
-      decoded result and corresponding timestamps.
-    """
-    assert encoder_out.ndim == 3, encoder_out.shape
-    assert encoder_out.size(0) >= 1, encoder_out.size(0)
-
-    packed_encoder_out = torch.nn.utils.rnn.pack_padded_sequence(
-        input=encoder_out,
-        lengths=encoder_out_lens.cpu(),
-        batch_first=True,
-        enforce_sorted=False,
-    )
-
-    blank_id = model.decoder.blank_id
-    unk_id = getattr(model, "unk_id", blank_id)
-    context_size = model.decoder.context_size
-    device = next(model.parameters()).device
-
-    batch_size_list = packed_encoder_out.batch_sizes.tolist()
-    N = encoder_out.size(0)
-    assert torch.all(encoder_out_lens > 0), encoder_out_lens
-    assert N == batch_size_list[0], (N, batch_size_list)
-
-    B = [HypothesisList() for _ in range(N)]
-    for i in range(N):
-        B[i].add(
-            Hypothesis(
-                ys=[blank_id] * context_size,
-                log_prob=torch.zeros(1, dtype=torch.float32, device=device),
-                timestamp=[],
-            )
-        )
-
-    encoder_out = model.joiner.encoder_proj(packed_encoder_out.data)
-
-    offset = 0
-    finalized_B = []
-    for (t, batch_size) in enumerate(batch_size_list):
-        start = offset
-        end = offset + batch_size
-        current_encoder_out = encoder_out.data[start:end]
-        current_encoder_out = current_encoder_out.unsqueeze(1).unsqueeze(1)
-        # current_encoder_out's shape is (batch_size, 1, 1, encoder_out_dim)
-        offset = end
-
-        finalized_B = B[batch_size:] + finalized_B
-        B = B[:batch_size]
-
-        hyps_shape = get_hyps_shape(B).to(device)
-
-        A = [list(b) for b in B]
-        B = [HypothesisList() for _ in range(batch_size)]
-
-        ys_log_probs = torch.cat(
-            [hyp.log_prob.reshape(1, 1) for hyps in A for hyp in hyps]
-        )  # (num_hyps, 1)
-
-        decoder_input = torch.tensor(
-            [hyp.ys[-context_size:] for hyps in A for hyp in hyps],
-            device=device,
-            dtype=torch.int64,
-        )  # (num_hyps, context_size)
-
-        decoder_out = model.decoder(decoder_input, need_pad=False).unsqueeze(1)
-        decoder_out = model.joiner.decoder_proj(decoder_out)
-        # decoder_out is of shape (num_hyps, 1, 1, joiner_dim)
-
-        # Note: For torch 1.7.1 and below, it requires a torch.int64 tensor
-        # as index, so we use `to(torch.int64)` below.
-        current_encoder_out = torch.index_select(
-            current_encoder_out,
-            dim=0,
-            index=hyps_shape.row_ids(1).to(torch.int64),
-        )  # (num_hyps, 1, 1, encoder_out_dim)
-
-        logits = model.joiner(
-            current_encoder_out,
-            decoder_out,
-            project_input=False,
-        )  # (num_hyps, 1, 1, vocab_size)
-
-        logits = logits.squeeze(1).squeeze(1)  # (num_hyps, vocab_size)
-
-        log_probs = (logits / temperature).log_softmax(dim=-1)  # (num_hyps, vocab_size)
-
-        log_probs.add_(ys_log_probs)
-
-        vocab_size = log_probs.size(-1)
-
-        log_probs = log_probs.reshape(-1)
-
-        row_splits = hyps_shape.row_splits(1) * vocab_size
-        log_probs_shape = k2.ragged.create_ragged_shape2(
-            row_splits=row_splits, cached_tot_size=log_probs.numel()
-        )
-        ragged_log_probs = k2.RaggedTensor(shape=log_probs_shape, value=log_probs)
-
-        for i in range(batch_size):
-            topk_log_probs, topk_indexes = ragged_log_probs[i].topk(beam)
-
-            with warnings.catch_warnings():
-                warnings.simplefilter("ignore")
-                topk_hyp_indexes = (topk_indexes // vocab_size).tolist()
-                topk_token_indexes = (topk_indexes % vocab_size).tolist()
-
-            for k in range(len(topk_hyp_indexes)):
-                hyp_idx = topk_hyp_indexes[k]
-                hyp = A[i][hyp_idx]
-
-                new_ys = hyp.ys[:]
-                new_token = topk_token_indexes[k]
-                new_timestamp = hyp.timestamp[:]
-                if new_token not in (blank_id, unk_id):
-                    new_ys.append(new_token)
-                    new_timestamp.append(t)
-
-                new_log_prob = topk_log_probs[k]
-                new_hyp = Hypothesis(
-                    ys=new_ys, log_prob=new_log_prob, timestamp=new_timestamp
-                )
-                B[i].add(new_hyp)
-
-    B = B + finalized_B
-    best_hyps = [b.get_most_probable(length_norm=True) for b in B]
-
-    sorted_ans = [h.ys[context_size:] for h in best_hyps]
-    sorted_timestamps = [h.timestamp for h in best_hyps]
-    ans = []
-    ans_timestamps = []
-    unsorted_indices = packed_encoder_out.unsorted_indices.tolist()
-    for i in range(N):
-        ans.append(sorted_ans[unsorted_indices[i]])
-        ans_timestamps.append(sorted_timestamps[unsorted_indices[i]])
-
-    if not return_timestamps:
-        return ans
-    else:
-        return DecodingResults(
-            hyps=ans,
-            timestamps=ans_timestamps,
-        )
-
-
-def beam_search(
-    model: SURT,
-    encoder_out: torch.Tensor,
-    beam: int = 4,
-    temperature: float = 1.0,
-    return_timestamps: bool = False,
-) -> Union[List[int], DecodingResults]:
-    """
-    It implements Algorithm 1 in https://arxiv.org/pdf/1211.3711.pdf
-
-    espnet/nets/beam_search_SURT.py#L247 is used as a reference.
-    (NOTE(review): this file name looks like the result of a global
-    rename; confirm the intended espnet file before relying on it.)
-
-    Args:
-      model:
-        An instance of `SURT`.
-      encoder_out:
-        A tensor of shape (N, T, C) from the encoder. Support only N==1 for now.
-      beam:
-        Beam size.
-      temperature:
-        Softmax temperature. The joiner logits are divided by it before
-        `log_softmax`.
-      return_timestamps:
-        Whether to return timestamps.
-
-    Returns:
-      If return_timestamps is False, return the decoded result.
-      Else, return a DecodingResults object containing
-      decoded result and corresponding timestamps.
-    """
-    assert encoder_out.ndim == 3
-
-    # support only batch_size == 1 for now
-    assert encoder_out.size(0) == 1, encoder_out.size(0)
-    blank_id = model.decoder.blank_id
-    # Models without an `unk_id` attribute fall back to treating only blank
-    # as the non-emitting symbol.
-    unk_id = getattr(model, "unk_id", blank_id)
-    context_size = model.decoder.context_size
-
-    device = next(model.parameters()).device
-
-    # NOTE(review): `decoder_input`/`decoder_out` computed here are reassigned
-    # inside the loop before being read (`decoder_cache` starts empty, so the
-    # first iteration always recomputes them); this pass looks redundant.
-    decoder_input = torch.tensor(
-        [blank_id] * context_size,
-        device=device,
-        dtype=torch.int64,
-    ).reshape(1, context_size)
-
-    decoder_out = model.decoder(decoder_input, need_pad=False)
-    decoder_out = model.joiner.decoder_proj(decoder_out)
-
-    # Project once up front; the joiner is later called with
-    # project_input=False.
-    encoder_out = model.joiner.encoder_proj(encoder_out)
-
-    T = encoder_out.size(1)
-    t = 0
-
-    # B holds the hypotheses carried from one frame to the next. It is seeded
-    # with a single hypothesis made of `context_size` blanks.
-    B = HypothesisList()
-    B.add(Hypothesis(ys=[blank_id] * context_size, log_prob=0.0, timestamp=[]))
-
-    max_sym_per_utt = 20000
-
-    # NOTE(review): `sym_per_utt` is never incremented below, so the
-    # `max_sym_per_utt` cap in the loop condition currently has no effect.
-    sym_per_utt = 0
-
-    # Decoder outputs keyed by hypothesis key; shared across all frames.
-    decoder_cache: Dict[str, torch.Tensor] = {}
-
-    while t < T and sym_per_utt < max_sym_per_utt:
-        # fmt: off
-        current_encoder_out = encoder_out[:, t:t+1, :].unsqueeze(2)
-        # fmt: on
-        A = B
-        B = HypothesisList()
-
-        # Joiner outputs keyed by "<hypothesis key>-t-<t>"; rebuilt per frame.
-        joint_cache: Dict[str, torch.Tensor] = {}
-
-        # TODO(fangjun): Implement prefix search to update the `log_prob`
-        # of hypotheses in A
-
-        while True:
-            # Expand the currently most probable hypothesis of A.
-            y_star = A.get_most_probable()
-            A.remove(y_star)
-
-            cached_key = y_star.key
-
-            if cached_key not in decoder_cache:
-                decoder_input = torch.tensor(
-                    [y_star.ys[-context_size:]],
-                    device=device,
-                    dtype=torch.int64,
-                ).reshape(1, context_size)
-
-                decoder_out = model.decoder(decoder_input, need_pad=False)
-                decoder_out = model.joiner.decoder_proj(decoder_out)
-                decoder_cache[cached_key] = decoder_out
-            else:
-                decoder_out = decoder_cache[cached_key]
-
-            cached_key += f"-t-{t}"
-            if cached_key not in joint_cache:
-                logits = model.joiner(
-                    current_encoder_out,
-                    decoder_out.unsqueeze(1),
-                    project_input=False,
-                )
-
-                # TODO(fangjun): Scale the blank posterior
-                log_prob = (logits / temperature).log_softmax(dim=-1)
-                # log_prob is (1, 1, 1, vocab_size)
-                log_prob = log_prob.squeeze()
-                # Now log_prob is (vocab_size,)
-                joint_cache[cached_key] = log_prob
-            else:
-                log_prob = joint_cache[cached_key]
-
-            # First, process the blank symbol
-            skip_log_prob = log_prob[blank_id]
-            new_y_star_log_prob = y_star.log_prob + skip_log_prob
-
-            # ys[:] returns a copy of ys
-            B.add(
-                Hypothesis(
-                    ys=y_star.ys[:],
-                    log_prob=new_y_star_log_prob,
-                    timestamp=y_star.timestamp[:],
-                )
-            )
-
-            # Second, process other non-blank labels
-            # beam + 1 candidates are requested because blank/unk entries
-            # among them are skipped below.
-            values, indices = log_prob.topk(beam + 1)
-            for i, v in zip(indices.tolist(), values.tolist()):
-                if i in (blank_id, unk_id):
-                    continue
-                new_ys = y_star.ys + [i]
-                new_log_prob = y_star.log_prob + v
-                new_timestamp = y_star.timestamp + [t]
-                A.add(
-                    Hypothesis(
-                        ys=new_ys,
-                        log_prob=new_log_prob,
-                        timestamp=new_timestamp,
-                    )
-                )
-
-            # Check whether B contains more than "beam" elements more probable
-            # than the most probable in A
-            # NOTE(review): if A is empty at this point (e.g. every candidate
-            # above was blank/unk and A held nothing else),
-            # `get_most_probable` calls max() on an empty sequence and raises.
-            A_most_probable = A.get_most_probable()
-
-            kept_B = B.filter(A_most_probable.log_prob)
-
-            if len(kept_B) >= beam:
-                B = kept_B.topk(beam)
-                break
-
-        t += 1
-
-    best_hyp = B.get_most_probable(length_norm=True)
-    ys = best_hyp.ys[context_size:]  # [context_size:] to remove blanks
-
-    if not return_timestamps:
-        return ys
-    else:
-        return DecodingResults(hyps=[ys], timestamps=[best_hyp.timestamp])
-
-
-@dataclass
-class Hypothesis:
-    # Token ids decoded so far; every new prediction is appended at the end.
-    ys: List[int]
-
-    # Log-probability of `ys`, kept as a tensor holding a single value.
-    log_prob: torch.Tensor
-
-    # timestamp[i] is the frame index (after subsampling) at which ys[i]
-    # was decoded.
-    timestamp: List[int] = field(default_factory=list)
-
-    # LM score of the next token given the current `ys`.
-    lm_score: Optional[torch.Tensor] = None
-
-    # RNNLM states (h and c of an LSTM).
-    state: Optional[Tuple[torch.Tensor, torch.Tensor]] = None
-
-    # State of the n-gram LM.
-    state_cost: Optional[NgramLmStateCost] = None
-
-    @property
-    def key(self) -> str:
-        """Return `self.ys` rendered as an underscore-separated string."""
-        return "_".join(str(token) for token in self.ys)
-
-
-class HypothesisList(object):
-    """A collection of hypotheses indexed by `Hypothesis.key`.
-
-    Two hypotheses with the same token sequence share one entry; adding a
-    duplicate merges its probability into the existing entry.
-    """
-
-    def __init__(self, data: Optional[Dict[str, Hypothesis]] = None) -> None:
-        """
-        Args:
-          data:
-            A dict of Hypotheses. Its key is its `value.key`. The dict is
-            stored as-is (not copied).
-        """
-        if data is None:
-            self._data = {}
-        else:
-            self._data = data
-
-    @property
-    def data(self) -> Dict[str, Hypothesis]:
-        return self._data
-
-    def add(self, hyp: Hypothesis) -> None:
-        """Add a Hypothesis to `self`.
-
-        If `hyp` already exists in `self`, its probability is updated using
-        `log-sum-exp` with the existing one.
-
-        Args:
-          hyp:
-            The hypothesis to be added.
-        """
-        key = hyp.key
-        if key in self:
-            old_hyp = self._data[key]  # shallow copy
-            # The stored tensor is updated in place via `out=`, so every
-            # holder of this tensor observes the merged probability.
-            torch.logaddexp(old_hyp.log_prob, hyp.log_prob, out=old_hyp.log_prob)
-        else:
-            self._data[key] = hyp
-
-    def get_most_probable(self, length_norm: bool = False) -> Hypothesis:
-        """Get the most probable hypothesis, i.e., the one with
-        the largest `log_prob`.
-
-        Args:
-          length_norm:
-            If True, the `log_prob` of a hypothesis is normalized by the
-            number of tokens in it.
-        Returns:
-          Return the hypothesis that has the largest `log_prob`.
-        Raises:
-          ValueError: if `self` is empty (raised by `max`).
-        """
-        if length_norm:
-            return max(self._data.values(), key=lambda hyp: hyp.log_prob / len(hyp.ys))
-        else:
-            return max(self._data.values(), key=lambda hyp: hyp.log_prob)
-
-    def remove(self, hyp: Hypothesis) -> None:
-        """Remove a given hypothesis.
-
-        Caution:
-          `self` is modified **in-place**.
-
-        Args:
-          hyp:
-            The hypothesis to be removed from `self`.
-            Note: It must be contained in `self`. Otherwise,
-            an exception is raised.
-        """
-        key = hyp.key
-        assert key in self, f"{key} does not exist"
-        del self._data[key]
-
-    def filter(self, threshold: torch.Tensor) -> "HypothesisList":
-        """Keep only the hypotheses whose log_prob is strictly greater than
-        the given threshold.
-
-        Caution:
-          `self` is not modified. Instead, a new HypothesisList is returned.
-          The returned list references the same Hypothesis objects.
-
-        Returns:
-          Return a new HypothesisList containing all hypotheses from `self`
-          with `log_prob` being greater than the given `threshold`.
-        """
-        ans = HypothesisList()
-        for hyp in self._data.values():
-            if hyp.log_prob > threshold:
-                ans.add(hyp)  # shallow copy
-        return ans
-
-    def topk(self, k: int) -> "HypothesisList":
-        """Return a new HypothesisList with the `k` entries of largest
-        `log_prob` (fewer if `self` has fewer than `k` entries)."""
-        hyps = sorted(
-            self._data.items(), key=lambda item: item[1].log_prob, reverse=True
-        )[:k]
-        return HypothesisList(dict(hyps))
-
-    def __contains__(self, key: str):
-        return key in self._data
-
-    def __iter__(self):
-        # Iteration yields the Hypothesis objects, not their keys.
-        return iter(self._data.values())
-
-    def __len__(self) -> int:
-        return len(self._data)
-
-    def __str__(self) -> str:
-        # Join the keys directly: iterating `self` yields Hypothesis objects,
-        # which `str.join` cannot handle.
-        return ", ".join(self._data)
-
-
-def get_hyps_shape(hyps: List[HypothesisList]) -> k2.RaggedShape:
-    """Build a ragged shape with axes [utt][num_hyps].
-
-    Args:
-      hyps:
-        One HypothesisList per utterance, so len(hyps) == batch_size. Each
-        entry holds the current hypotheses of that utterance.
-    Returns:
-      Return a ragged shape with 2 axes [utt][num_hyps]. The row splits are
-      built from a tensor created without a device argument, so the shape
-      is on CPU.
-    """
-    # A leading 0 turns the inclusive cumulative sum below into row splits
-    # (i.e. an exclusive sum followed by the total).
-    counts = torch.tensor([0] + [len(hyp_list) for hyp_list in hyps])
-    row_splits = counts.cumsum(dim=0, dtype=torch.int32)
-    return k2.ragged.create_ragged_shape2(
-        row_splits=row_splits, cached_tot_size=row_splits[-1].item()
-    )
--- a/egs/libricss/SURT/dprnn_pruned_transducer_stateless7/decode.py
+++ b/egs/libricss/SURT/dprnn_pruned_transducer_stateless7/decode.py
@ -1,770 +0,0 @@
-#!/usr/bin/env python3
-#
-# Copyright 2021-2022 Xiaomi Corporation (Author: Fangjun Kuang,
-#                                                 Zengwei Yao)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Usage:
-(1) greedy search
-./dprnn_pruned_transducer_stateless7/decode.py \
-    --epoch 28 \
-    --avg 15 \
-    --exp-dir ./dprnn_pruned_transducer_stateless7/exp \
-    --max-duration 600 \
-    --decoding-method greedy_search
-
-(2) beam search (not recommended)
-./dprnn_pruned_transducer_stateless7/decode.py \
-    --epoch 28 \
-    --avg 15 \
-    --exp-dir ./dprnn_pruned_transducer_stateless7/exp \
-    --max-duration 600 \
-    --decoding-method beam_search \
-    --beam-size 4
-
-(3) modified beam search
-./dprnn_pruned_transducer_stateless7/decode.py \
-    --epoch 28 \
-    --avg 15 \
-    --exp-dir ./dprnn_pruned_transducer_stateless7/exp \
-    --max-duration 600 \
-    --decoding-method modified_beam_search \
-    --beam-size 4
-
-(4) fast beam search (one best)
-./dprnn_pruned_transducer_stateless7/decode.py \
-    --epoch 28 \
-    --avg 15 \
-    --exp-dir ./dprnn_pruned_transducer_stateless7/exp \
-    --max-duration 600 \
-    --decoding-method fast_beam_search \
-    --beam 20.0 \
-    --max-contexts 8 \
-    --max-states 64
-"""
-
-
-import argparse
-import logging
-from collections import defaultdict
-from itertools import chain, groupby, repeat
-from pathlib import Path
-from typing import Dict, List, Optional, Tuple
-
-import k2
-import sentencepiece as spm
-import torch
-import torch.nn as nn
-from asr_datamodule import LibrimixAsrDataModule
-from beam_search import (
-    beam_search,
-    fast_beam_search_one_best,
-    greedy_search,
-    greedy_search_batch,
-    modified_beam_search,
-)
-from lhotse.utils import EPSILON
-from train import add_model_arguments, get_params, get_surt_model
-
-from icefall.checkpoint import (
-    average_checkpoints,
-    average_checkpoints_with_averaged_model,
-    find_checkpoints,
-    load_checkpoint,
-)
-from icefall.lexicon import Lexicon
-from icefall.utils import (
-    AttributeDict,
-    setup_logger,
-    store_transcripts,
-    str2bool,
-    write_surt_error_stats,
-)
-
-
-def get_parser():
-    """Build the command-line parser for the decoding script.
-
-    Registers checkpoint-selection, decoding-method and search options, then
-    lets `add_model_arguments` append the model options.
-
-    Returns:
-      The configured `argparse.ArgumentParser`.
-    """
-    parser = argparse.ArgumentParser(
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter
-    )
-
-    # ---- Checkpoint selection and averaging ----
-    parser.add_argument(
-        "--epoch",
-        type=int,
-        default=30,
-        help="""It specifies the checkpoint to use for decoding.
-        Note: Epoch counts from 1.
-        You can specify --avg to use more checkpoints for model averaging.""",
-    )
-
-    parser.add_argument(
-        "--iter",
-        type=int,
-        default=0,
-        help="""If positive, --epoch is ignored and it
-        will use the checkpoint exp_dir/checkpoint-iter.pt.
-        You can specify --avg to use more checkpoints for model averaging.
-        """,
-    )
-
-    parser.add_argument(
-        "--avg",
-        type=int,
-        default=9,
-        help="Number of checkpoints to average. Automatically select "
-        "consecutive checkpoints before the checkpoint specified by "
-        "'--epoch' and '--iter'",
-    )
-
-    parser.add_argument(
-        "--use-averaged-model",
-        type=str2bool,
-        default=True,
-        help="Whether to load averaged model. Currently it only supports "
-        "using --epoch. If True, it would decode with the averaged model "
-        "over the epoch range from `epoch-avg` (excluded) to `epoch`."
-        "Actually only the models with epoch number of `epoch-avg` and "
-        "`epoch` are loaded for averaging. ",
-    )
-
-    # ---- Paths ----
-    # NOTE(review): this default points at a different recipe directory than
-    # the one this script lives in; confirm it is intended.
-    parser.add_argument(
-        "--exp-dir",
-        type=str,
-        default="conv_lstm_transducer_stateless_ctc/exp",
-        help="The experiment dir",
-    )
-
-    parser.add_argument(
-        "--bpe-model",
-        type=str,
-        default="data/lang_bpe_500/bpe.model",
-        help="Path to the BPE model",
-    )
-
-    parser.add_argument(
-        "--lang-dir",
-        type=Path,
-        default="data/lang_bpe_500",
-        help="The lang dir containing word table and LG graph",
-    )
-
-    # ---- Decoding method and search hyper-parameters ----
-    parser.add_argument(
-        "--decoding-method",
-        type=str,
-        default="greedy_search",
-        help="""Possible values are:
-          - greedy_search
-          - beam_search
-          - modified_beam_search
-          - fast_beam_search
-        If you use fast_beam_search_nbest_LG, you have to specify
-        `--lang-dir`, which should contain `LG.pt`.
-        """,
-    )
-
-    parser.add_argument(
-        "--beam-size",
-        type=int,
-        default=4,
-        help="""An integer indicating how many candidates we will keep for each
-        frame. Used only when --decoding-method is beam_search or
-        modified_beam_search.""",
-    )
-
-    parser.add_argument(
-        "--beam",
-        type=float,
-        default=20.0,
-        help="""A floating point value to calculate the cutoff score during beam
-        search (i.e., `cutoff = max-score - beam`), which is the same as the
-        `beam` in Kaldi.
-        Used only when --decoding-method is fast_beam_search,
-        fast_beam_search_nbest, fast_beam_search_nbest_LG,
-        and fast_beam_search_nbest_oracle
-        """,
-    )
-
-    parser.add_argument(
-        "--ngram-lm-scale",
-        type=float,
-        default=0.01,
-        help="""
-        Used only when --decoding_method is fast_beam_search_nbest_LG.
-        It specifies the scale for n-gram LM scores.
-        """,
-    )
-
-    parser.add_argument(
-        "--max-contexts",
-        type=int,
-        default=8,
-        help="""Used only when --decoding-method is
-        fast_beam_search, fast_beam_search_nbest, fast_beam_search_nbest_LG,
-        and fast_beam_search_nbest_oracle""",
-    )
-
-    parser.add_argument(
-        "--max-states",
-        type=int,
-        default=64,
-        help="""Used only when --decoding-method is
-        fast_beam_search, fast_beam_search_nbest, fast_beam_search_nbest_LG,
-        and fast_beam_search_nbest_oracle""",
-    )
-
-    parser.add_argument(
-        "--context-size",
-        type=int,
-        default=2,
-        help="The context size in the decoder. 1 means bigram; 2 means tri-gram",
-    )
-    parser.add_argument(
-        "--max-sym-per-frame",
-        type=int,
-        default=1,
-        help="""Maximum number of symbols per frame.
-        Used only when --decoding_method is greedy_search""",
-    )
-
-    parser.add_argument(
-        "--num-paths",
-        type=int,
-        default=200,
-        help="""Number of paths for nbest decoding.
-        Used only when the decoding method is fast_beam_search_nbest,
-        fast_beam_search_nbest_LG, and fast_beam_search_nbest_oracle""",
-    )
-
-    parser.add_argument(
-        "--nbest-scale",
-        type=float,
-        default=0.5,
-        help="""Scale applied to lattice scores when computing nbest paths.
-        Used only when the decoding method is fast_beam_search_nbest,
-        fast_beam_search_nbest_LG, and fast_beam_search_nbest_oracle""",
-    )
-
-    parser.add_argument(
-        "--save-masks",
-        type=str2bool,
-        default=False,
-        help="""If true, save masks generated by unmixing module.""",
-    )
-
-    # Model options are registered by the training script's helper.
-    add_model_arguments(parser)
-
-    return parser
-
-
-def decode_one_batch(
-    params: AttributeDict,
-    model: nn.Module,
-    sp: spm.SentencePieceProcessor,
-    batch: dict,
-    word_table: Optional[k2.SymbolTable] = None,
-    decoding_graph: Optional[k2.Fsa] = None,
-) -> Tuple[Dict[str, List[List[str]]], dict]:
-    """Decode one batch and return the result together with the per-cut
-    masked features. The result dict has the following format:
-
-        - key: It indicates the setting used for decoding. For example,
-               if greedy_search is used, it would be "greedy_search"
-               If beam search with a beam size of 7 is used, it would be
-               "beam_size_7"
-        - value: It contains the decoding result. `len(value)` equals to
-                 batch size times `params.num_channels`, because the
-                 channels are stacked along the batch axis before
-                 recognition (all cuts of channel 0 first, then channel 1,
-                 and so on).
-    Args:
-      params:
-        It's the return value of :func:`get_params`.
-      model:
-        The neural model.
-      sp:
-        The BPE model.
-      batch:
-        It is the return value from iterating
-        `lhotse.dataset.K2SpeechRecognitionDataset`. See its documentation
-        for the format of the `batch`.
-      word_table:
-        The word symbol table. Not used by this function.
-      decoding_graph:
-        The decoding graph. Can be either a `k2.trivial_graph` or HLG, Used
-        only when --decoding_method is fast_beam_search, fast_beam_search_nbest,
-        fast_beam_search_nbest_oracle, and fast_beam_search_nbest_LG.
-    Returns:
-      Return a tuple `(hyps_dict, masks_dict)`. `hyps_dict` is the decoding
-      result described above. `masks_dict` maps each cut ID to a numpy array
-      holding that cut's per-channel masked features, trimmed to the cut's
-      feature length and concatenated along the last axis.
-    """
-    device = next(model.parameters()).device
-    feature = batch["inputs"]
-    assert feature.ndim == 3
-
-    feature = feature.to(device)
-    feature_lens = batch["input_lens"].to(device)
-
-    # Apply the mask encoder
-    B, T, F = feature.shape
-    processed = model.mask_encoder(feature)  # B,T,F*num_channels
-    masks = processed.view(B, T, F, params.num_channels).unbind(dim=-1)
-    x_masked = [feature * m for m in masks]
-
-    # For each cut we trim every channel to the length of the corresponding
-    # feature and concatenate the channels along the last axis. The result is
-    # stored in a dict, where the key is the cut ID.
-    # Note: what is stored is the masked feature (feature * mask), not the raw
-    # mask produced by the mask encoder.
-    masks_dict = {}
-    for i in range(B):
-        mask = torch.cat(
-            [x_masked[j][i, : feature_lens[i]] for j in range(params.num_channels)],
-            dim=-1,
-        )
-        mask = mask.cpu().numpy()
-        masks_dict[batch["cuts"][i].id] = mask
-
-    # Recognition
-    # Stack the inputs along the batch axis
-    h = torch.cat(x_masked, dim=0)
-    h_lens = torch.cat([feature_lens for _ in range(params.num_channels)], dim=0)
-    encoder_out, encoder_out_lens = model.encoder(x=h, x_lens=h_lens)
-
-    hyps = []
-    if params.decoding_method == "fast_beam_search":
-        hyp_tokens = fast_beam_search_one_best(
-            model=model,
-            decoding_graph=decoding_graph,
-            encoder_out=encoder_out,
-            encoder_out_lens=encoder_out_lens,
-            beam=params.beam,
-            max_contexts=params.max_contexts,
-            max_states=params.max_states,
-        )
-        for hyp in sp.decode(hyp_tokens):
-            hyps.append(hyp.split())
-    elif params.decoding_method == "greedy_search" and params.max_sym_per_frame == 1:
-        hyp_tokens = greedy_search_batch(
-            model=model,
-            encoder_out=encoder_out,
-            encoder_out_lens=encoder_out_lens,
-        )
-        for hyp in sp.decode(hyp_tokens):
-            hyps.append(hyp.split())
-    elif params.decoding_method == "modified_beam_search":
-        hyp_tokens = modified_beam_search(
-            model=model,
-            encoder_out=encoder_out,
-            encoder_out_lens=encoder_out_lens,
-            beam=params.beam_size,
-        )
-        for hyp in sp.decode(hyp_tokens):
-            hyps.append(hyp.split())
-    else:
-        # The remaining methods are decoded one utterance at a time. Any
-        # method other than greedy_search / beam_search reaching this branch
-        # raises ValueError.
-        batch_size = encoder_out.size(0)
-
-        for i in range(batch_size):
-            # fmt: off
-            encoder_out_i = encoder_out[i:i+1, :encoder_out_lens[i]]
-            # fmt: on
-            if params.decoding_method == "greedy_search":
-                hyp = greedy_search(
-                    model=model,
-                    encoder_out=encoder_out_i,
-                    max_sym_per_frame=params.max_sym_per_frame,
-                )
-            elif params.decoding_method == "beam_search":
-                hyp = beam_search(
-                    model=model,
-                    encoder_out=encoder_out_i,
-                    beam=params.beam_size,
-                )
-            else:
-                raise ValueError(
-                    f"Unsupported decoding method: {params.decoding_method}"
-                )
-            hyps.append(sp.decode(hyp).split())
-
-    # Build the key that names the decoding setting in the returned dict.
-    if params.decoding_method == "greedy_search":
-        return {"greedy_search": hyps}, masks_dict
-    elif "fast_beam_search" in params.decoding_method:
-        key = f"beam_{params.beam}_"
-        key += f"max_contexts_{params.max_contexts}_"
-        key += f"max_states_{params.max_states}"
-        if "nbest" in params.decoding_method:
-            key += f"_num_paths_{params.num_paths}_"
-            key += f"nbest_scale_{params.nbest_scale}"
-            if "LG" in params.decoding_method:
-                key += f"_ngram_lm_scale_{params.ngram_lm_scale}"
-
-        return {key: hyps}, masks_dict
-    else:
-        return {f"beam_size_{params.beam_size}": hyps}, masks_dict
-
-
-def decode_dataset(
-    dl: torch.utils.data.DataLoader,
-    params: AttributeDict,
-    model: nn.Module,
-    sp: spm.SentencePieceProcessor,
-    word_table: Optional[k2.SymbolTable] = None,
-    decoding_graph: Optional[k2.Fsa] = None,
-) -> Tuple[Dict[str, List[Tuple[str, List[str], List[str]]]], dict]:
-    """Decode dataset.
-
-    Args:
-      dl:
-        PyTorch's dataloader containing the dataset to decode.
-      params:
-        It is returned by :func:`get_params`.
-      model:
-        The neural model.
-      sp:
-        The BPE model.
-      word_table:
-        The word symbol table.
-      decoding_graph:
-        The decoding graph. Can be either a `k2.trivial_graph` or HLG, Used
-        only when --decoding_method is fast_beam_search, fast_beam_search_nbest,
-        fast_beam_search_nbest_oracle, and fast_beam_search_nbest_LG.
-    Returns:
-      Return a tuple `(results, masks)`.
-      `results` is a dict whose key names the decoding setting (it is the
-      key produced by :func:`decode_one_batch`, e.g. "greedy_search").
-      Its value is a list of tuples. Each tuple contains three elements:
-      the cut ID, the reference words, and the predicted words. There is
-      one tuple per (cut, channel) pair.
-      `masks` merges the second return value of :func:`decode_one_batch`
-      over all batches.
-    """
-    num_cuts = 0
-
-    try:
-        num_batches = len(dl)
-    except TypeError:
-        # Dataloaders without a length are reported as "?".
-        num_batches = "?"
-
-    if params.decoding_method == "greedy_search":
-        log_interval = 50
-    else:
-        log_interval = 20
-
-    results = defaultdict(list)
-    masks = {}
-    for batch_idx, batch in enumerate(dl):
-        # The dataloader returns text as a list of cuts, each of which is a list of channel
-        # text. We flatten this to a list where all channels are together, i.e., it looks like
-        # [utt1_ch1, utt2_ch1, ..., uttN_ch1, utt1_ch2, ...., uttN_ch2].
-        texts = [val for tup in zip(*batch["text"]) for val in tup]
-        cut_ids = [cut.id for cut in batch["cuts"]]
-
-        # Repeat cut_ids list N times, where N is the number of channels,
-        # so that it lines up with the channel-major order of `texts`.
-        cut_ids = list(chain.from_iterable(repeat(cut_ids, params.num_channels)))
-
-        hyps_dict, masks_dict = decode_one_batch(
-            params=params,
-            model=model,
-            sp=sp,
-            decoding_graph=decoding_graph,
-            word_table=word_table,
-            batch=batch,
-        )
-        masks.update(masks_dict)
-
-        for name, hyps in hyps_dict.items():
-            assert len(hyps) == len(texts), f"{len(hyps)} vs {len(texts)}"
-            results[name].extend(
-                (cut_id, ref_text.split(), hyp_words)
-                for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts)
-            )
-
-        num_cuts += len(texts)
-
-        if batch_idx % log_interval == 0:
-            batch_str = f"{batch_idx}/{num_batches}"
-
-            logging.info(f"batch {batch_str}, cuts processed until now is {num_cuts}")
-    return results, masks
-
-
-def save_results(
-    params: AttributeDict,
-    test_set_name: str,
-    results_dict: Dict[str, List[Tuple[str, List[str], List[str]]]],
-):
-    """Write transcripts, error statistics and a WER summary for one test set.
-
-    Args:
-      params:
-        Must provide `res_dir` (output directory) and `suffix` (appended to
-        every file name).
-      test_set_name:
-        Name of the test set; used in file names and log messages.
-      results_dict:
-        Maps a decoding-setting key to a list of
-        `(cut_id, ref_words, hyp_words)` tuples, one per (cut, channel).
-        If it is empty, nothing is written and a warning is logged.
-    """
-    if not results_dict:
-        # Without this guard the summary file name below would reference an
-        # unbound loop variable.
-        logging.warning(f"No decoding results to save for {test_set_name}")
-        return
-
-    test_set_wers = dict()
-    for key, results in results_dict.items():
-        recog_path = (
-            params.res_dir / f"recogs-{test_set_name}-{key}-{params.suffix}.txt"
-        )
-        # groupby only merges adjacent items, so sort first.
-        results = sorted(results)
-        # Combine results by cut_id. This means that we combine different channels for
-        # ref and hyp of the same cut into list. Example:
-        # (cut1, ref1, hyp1), (cut1, ref2, hyp2), (cut2, ref3, hyp3) ->
-        # (cut1, [ref1, ref2], [hyp1, hyp2]), (cut2, [ref3], [hyp3])
-        # Also, each ref and hyp is currently a list of words. We join them into a string.
-        results_grouped = []
-        for cut_id, items in groupby(results, lambda x: x[0]):
-            items = list(items)
-            refs = [" ".join(item[1]) for item in items]
-            hyps = [" ".join(item[2]) for item in items]
-            results_grouped.append((cut_id, refs, hyps))
-
-        store_transcripts(filename=recog_path, texts=results_grouped)
-        logging.info(f"The transcripts are stored in {recog_path}")
-
-        # The following prints out WERs, per-word error statistics and aligned
-        # ref/hyp pairs.
-        errs_filename = (
-            params.res_dir / f"errs-{test_set_name}-{key}-{params.suffix}.txt"
-        )
-        with open(errs_filename, "w") as f:
-            wer = write_surt_error_stats(
-                f, f"{test_set_name}-{key}", results_grouped, enable_log=True
-            )
-            test_set_wers[key] = wer
-
-        logging.info("Wrote detailed error stats to {}".format(errs_filename))
-
-    # The summary file name embeds the key of the last setting processed
-    # above; this is kept so existing output paths do not change.
-    last_key = key
-
-    test_set_wers = sorted(test_set_wers.items(), key=lambda x: x[1])
-    errs_info = (
-        params.res_dir / f"wer-summary-{test_set_name}-{last_key}-{params.suffix}.txt"
-    )
-    with open(errs_info, "w") as f:
-        print("settings\tWER", file=f)
-        for key, val in test_set_wers:
-            print("{}\t{}".format(key, val), file=f)
-
-    # Settings are listed from lowest to highest WER; only the first line
-    # carries the "best for" note.
-    s = "\nFor {}, WER of different settings are:\n".format(test_set_name)
-    note = "\tbest for {}".format(test_set_name)
-    for key, val in test_set_wers:
-        s += "{}\t{}{}\n".format(key, val, note)
-        note = ""
-    logging.info(s)
-
-
-def save_masks(
-    params: AttributeDict,
-    test_set_name: str,
-    masks: List[torch.Tensor],
-):
-    """Serialize `masks` with `torch.save` under `params.res_dir`.
-
-    The file is named `masks-<test_set_name>.txt`; despite the extension it
-    holds whatever `torch.save` writes, not plain text.
-
-    NOTE(review): `decode_dataset` in this file collects masks in a dict
-    keyed by cut ID, so the `List[torch.Tensor]` annotation may be stale;
-    confirm against the call site.
-    """
-    destination = params.res_dir / f"masks-{test_set_name}.txt"
-    torch.save(masks, destination)
-    logging.info(f"The masks are stored in {destination}")
-
-
-@torch.no_grad()
-def main():
-    parser = get_parser()
-    LibrimixAsrDataModule.add_arguments(parser)
-    args = parser.parse_args()
-    args.exp_dir = Path(args.exp_dir)
-
-    params = get_params()
-    params.update(vars(args))
-
-    assert params.decoding_method in (
-        "greedy_search",
-        "beam_search",
-        "fast_beam_search",
-        "fast_beam_search_nbest",
-        "fast_beam_search_nbest_LG",
-        "fast_beam_search_nbest_oracle",
-        "modified_beam_search",
-    )
-    params.res_dir = params.exp_dir / params.decoding_method
-
-    if params.iter > 0:
-        params.suffix = f"iter-{params.iter}-avg-{params.avg}"
-    else:
-        params.suffix = f"epoch-{params.epoch}-avg-{params.avg}"
-
-    if "fast_beam_search" in params.decoding_method:
-        params.suffix += f"-beam-{params.beam}"
-        params.suffix += f"-max-contexts-{params.max_contexts}"
-        params.suffix += f"-max-states-{params.max_states}"
-        if "nbest" in params.decoding_method:
-            params.suffix += f"-nbest-scale-{params.nbest_scale}"
-            params.suffix += f"-num-paths-{params.num_paths}"
-            if "LG" in params.decoding_method:
-                params.suffix += f"-ngram-lm-scale-{params.ngram_lm_scale}"
-    elif "beam_search" in params.decoding_method:
-        params.suffix += f"-{params.decoding_method}-beam-size-{params.beam_size}"
-    else:
-        params.suffix += f"-context-{params.context_size}"
-        params.suffix += f"-max-sym-per-frame-{params.max_sym_per_frame}"
-
-    if params.use_averaged_model:
-        params.suffix += "-use-averaged-model"
-
-    setup_logger(f"{params.res_dir}/log-decode-{params.suffix}")
-    logging.info("Decoding started")
-
-    device = torch.device("cpu")
-    if torch.cuda.is_available():
-        device = torch.device("cuda", 0)
-
-    logging.info(f"Device: {device}")
-
-    sp = spm.SentencePieceProcessor()
-    sp.load(params.bpe_model)
-
-    # <blk> and <unk> are defined in local/train_bpe_model.py
-    params.blank_id = sp.piece_to_id("<blk>")
-    params.unk_id = sp.piece_to_id("<unk>")
-    params.vocab_size = sp.get_piece_size()
-
-    logging.info(params)
-
-    logging.info("About to create model")
-    model = get_surt_model(params)
-    assert model.encoder.decode_chunk_size == params.decode_chunk_len // 2, (
-        model.encoder.decode_chunk_size,
-        params.decode_chunk_len,
-    )
-
-    if not params.use_averaged_model:
-        if params.iter > 0:
-            filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
-                : params.avg
-            ]
-            if len(filenames) == 0:
-                raise ValueError(
-                    f"No checkpoints found for"
-                    f" --iter {params.iter}, --avg {params.avg}"
-                )
-            elif len(filenames) < params.avg:
-                raise ValueError(
-                    f"Not enough checkpoints ({len(filenames)}) found for"
-                    f" --iter {params.iter}, --avg {params.avg}"
-                )
-            logging.info(f"averaging {filenames}")
-            model.to(device)
-            model.load_state_dict(average_checkpoints(filenames, device=device))
-        elif params.avg == 1:
-            load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
-        else:
-            start = params.epoch - params.avg + 1
-            filenames = []
-            for i in range(start, params.epoch + 1):
-                if i >= 1:
-                    filenames.append(f"{params.exp_dir}/epoch-{i}.pt")
-            logging.info(f"averaging {filenames}")
-            model.to(device)
-            model.load_state_dict(average_checkpoints(filenames, device=device))
-    else:
-        if params.iter > 0:
-            filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
-                : params.avg + 1
-            ]
-            if len(filenames) == 0:
-                raise ValueError(
-                    f"No checkpoints found for"
-                    f" --iter {params.iter}, --avg {params.avg}"
-                )
-            elif len(filenames) < params.avg + 1:
-                raise ValueError(
-                    f"Not enough checkpoints ({len(filenames)}) found for"
-                    f" --iter {params.iter}, --avg {params.avg}"
-                )
-            filename_start = filenames[-1]
-            filename_end = filenames[0]
-            logging.info(
-                "Calculating the averaged model over iteration checkpoints"
-                f" from {filename_start} (excluded) to {filename_end}"
-            )
-            model.to(device)
-            model.load_state_dict(
-                average_checkpoints_with_averaged_model(
-                    filename_start=filename_start,
-                    filename_end=filename_end,
-                    device=device,
-                )
-            )
-        else:
-            assert params.avg > 0, params.avg
-            start = params.epoch - params.avg
-            assert start >= 1, start
-            filename_start = f"{params.exp_dir}/epoch-{start}.pt"
-            filename_end = f"{params.exp_dir}/epoch-{params.epoch}.pt"
-            logging.info(
-                f"Calculating the averaged model over epoch range from "
-                f"{start} (excluded) to {params.epoch}"
-            )
-            model.to(device)
-            model.load_state_dict(
-                average_checkpoints_with_averaged_model(
-                    filename_start=filename_start,
-                    filename_end=filename_end,
-                    device=device,
-                )
-            )
-
-    model.to(device)
-    model.eval()
-
-    if "fast_beam_search" in params.decoding_method:
-        if params.decoding_method == "fast_beam_search_nbest_LG":
-            lexicon = Lexicon(params.lang_dir)
-            word_table = lexicon.word_table
-            lg_filename = params.lang_dir / "LG.pt"
-            logging.info(f"Loading {lg_filename}")
-            decoding_graph = k2.Fsa.from_dict(
-                torch.load(lg_filename, map_location=device)
-            )
-            decoding_graph.scores *= params.ngram_lm_scale
-        else:
-            word_table = None
-            decoding_graph = k2.trivial_graph(params.vocab_size - 1, device=device)
-    else:
-        decoding_graph = None
-        word_table = None
-
-    num_param = sum([p.numel() for p in model.parameters()])
-    logging.info(f"Number of model parameters: {num_param}")
-
-    # we need cut ids to display recognition results.
-    args.return_cuts = True
-    librimix = LibrimixAsrDataModule(args)
-
-    dev_cuts = librimix.dev_cuts(reverberated=False)
-    dev_dl = librimix.test_dataloaders(dev_cuts)
-
-    test_sets = ["dev"]
-    test_dl = [dev_dl]
-
-    for test_set, test_dl in zip(test_sets, test_dl):
-        results_dict, masks = decode_dataset(
-            dl=test_dl,
-            params=params,
-            model=model,
-            sp=sp,
-            word_table=word_table,
-            decoding_graph=decoding_graph,
-        )
-
-        save_results(
-            params=params,
-            test_set_name=test_set,
-            results_dict=results_dict,
-        )
-
-        if params.save_masks:
-            save_masks(
-                params=params,
-                test_set_name=test_set,
-                masks=masks,
-            )
-
-    logging.info("Done!")
-
-
-if __name__ == "__main__":
-    main()
--- a/egs/libricss/SURT/dprnn_pruned_transducer_stateless7/decode_libricss.py
+++ b/egs/libricss/SURT/dprnn_pruned_transducer_stateless7/decode_libricss.py
@ -1,791 +0,0 @@
-#!/usr/bin/env python3
-#
-# Copyright 2021-2022 Xiaomi Corporation (Author: Fangjun Kuang,
-#                                                 Zengwei Yao)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Usage:
-(1) greedy search
-./conv_lstm_transducer_stateless_ctc/decode.py \
-    --epoch 28 \
-    --avg 15 \
-    --exp-dir ./conv_lstm_transducer_stateless_ctc/exp \
-    --max-duration 600 \
-    --decoding-method greedy_search
-
-(2) beam search (not recommended)
-./conv_lstm_transducer_stateless_ctc/decode.py \
-    --epoch 28 \
-    --avg 15 \
-    --exp-dir ./conv_lstm_transducer_stateless_ctc/exp \
-    --max-duration 600 \
-    --decoding-method beam_search \
-    --beam-size 4
-
-(3) modified beam search
-./conv_lstm_transducer_stateless_ctc/decode.py \
-    --epoch 28 \
-    --avg 15 \
-    --exp-dir ./conv_lstm_transducer_stateless_ctc/exp \
-    --max-duration 600 \
-    --decoding-method modified_beam_search \
-    --beam-size 4
-
-(4) fast beam search (one best)
-./conv_lstm_transducer_stateless_ctc/decode.py \
-    --epoch 28 \
-    --avg 15 \
-    --exp-dir ./conv_lstm_transducer_stateless_ctc/exp \
-    --max-duration 600 \
-    --decoding-method fast_beam_search \
-    --beam 20.0 \
-    --max-contexts 8 \
-    --max-states 64
-"""
-
-
-import argparse
-import logging
-from collections import defaultdict
-from itertools import chain, groupby, repeat
-from pathlib import Path
-from typing import Dict, List, Optional, Tuple
-
-import k2
-import sentencepiece as spm
-import torch
-import torch.nn as nn
-from asr_datamodule import LibrimixAsrDataModule
-from beam_search import (
-    beam_search,
-    fast_beam_search_one_best,
-    greedy_search,
-    greedy_search_batch,
-    modified_beam_search,
-)
-from lhotse.utils import EPSILON
-from train import add_model_arguments, get_params, get_surt_model
-
-from icefall.checkpoint import (
-    average_checkpoints,
-    average_checkpoints_with_averaged_model,
-    find_checkpoints,
-    load_checkpoint,
-)
-from icefall.lexicon import Lexicon
-from icefall.utils import (
-    AttributeDict,
-    setup_logger,
-    store_transcripts,
-    str2bool,
-    write_surt_error_stats,
-)
-
-OVERLAP_RATIOS = ["0L", "0S", "OV10", "OV20", "OV30", "OV40"]
-
-
-def get_parser():
-    parser = argparse.ArgumentParser(
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter
-    )
-
-    parser.add_argument(
-        "--epoch",
-        type=int,
-        default=30,
-        help="""It specifies the checkpoint to use for decoding.
-        Note: Epoch counts from 1.
-        You can specify --avg to use more checkpoints for model averaging.""",
-    )
-
-    parser.add_argument(
-        "--iter",
-        type=int,
-        default=0,
-        help="""If positive, --epoch is ignored and it
-        will use the checkpoint exp_dir/checkpoint-iter.pt.
-        You can specify --avg to use more checkpoints for model averaging.
-        """,
-    )
-
-    parser.add_argument(
-        "--avg",
-        type=int,
-        default=9,
-        help="Number of checkpoints to average. Automatically select "
-        "consecutive checkpoints before the checkpoint specified by "
-        "'--epoch' and '--iter'",
-    )
-
-    parser.add_argument(
-        "--use-averaged-model",
-        type=str2bool,
-        default=True,
-        help="Whether to load averaged model. Currently it only supports "
-        "using --epoch. If True, it would decode with the averaged model "
-        "over the epoch range from `epoch-avg` (excluded) to `epoch`."
-        "Actually only the models with epoch number of `epoch-avg` and "
-        "`epoch` are loaded for averaging. ",
-    )
-
-    parser.add_argument(
-        "--exp-dir",
-        type=str,
-        default="conv_lstm_transducer_stateless_ctc/exp",
-        help="The experiment dir",
-    )
-
-    parser.add_argument(
-        "--bpe-model",
-        type=str,
-        default="data/lang_bpe_500/bpe.model",
-        help="Path to the BPE model",
-    )
-
-    parser.add_argument(
-        "--lang-dir",
-        type=Path,
-        default="data/lang_bpe_500",
-        help="The lang dir containing word table and LG graph",
-    )
-
-    parser.add_argument(
-        "--decoding-method",
-        type=str,
-        default="greedy_search",
-        help="""Possible values are:
-          - greedy_search
-          - beam_search
-          - modified_beam_search
-          - fast_beam_search
-        If you use fast_beam_search_nbest_LG, you have to specify
-        `--lang-dir`, which should contain `LG.pt`.
-        """,
-    )
-
-    parser.add_argument(
-        "--beam-size",
-        type=int,
-        default=4,
-        help="""An integer indicating how many candidates we will keep for each
-        frame. Used only when --decoding-method is beam_search or
-        modified_beam_search.""",
-    )
-
-    parser.add_argument(
-        "--beam",
-        type=float,
-        default=20.0,
-        help="""A floating point value to calculate the cutoff score during beam
-        search (i.e., `cutoff = max-score - beam`), which is the same as the
-        `beam` in Kaldi.
-        Used only when --decoding-method is fast_beam_search,
-        fast_beam_search_nbest, fast_beam_search_nbest_LG,
-        and fast_beam_search_nbest_oracle
-        """,
-    )
-
-    parser.add_argument(
-        "--ngram-lm-scale",
-        type=float,
-        default=0.01,
-        help="""
-        Used only when --decoding_method is fast_beam_search_nbest_LG.
-        It specifies the scale for n-gram LM scores.
-        """,
-    )
-
-    parser.add_argument(
-        "--max-contexts",
-        type=int,
-        default=8,
-        help="""Used only when --decoding-method is
-        fast_beam_search, fast_beam_search_nbest, fast_beam_search_nbest_LG,
-        and fast_beam_search_nbest_oracle""",
-    )
-
-    parser.add_argument(
-        "--max-states",
-        type=int,
-        default=64,
-        help="""Used only when --decoding-method is
-        fast_beam_search, fast_beam_search_nbest, fast_beam_search_nbest_LG,
-        and fast_beam_search_nbest_oracle""",
-    )
-
-    parser.add_argument(
-        "--context-size",
-        type=int,
-        default=2,
-        help="The context size in the decoder. 1 means bigram; 2 means tri-gram",
-    )
-    parser.add_argument(
-        "--max-sym-per-frame",
-        type=int,
-        default=1,
-        help="""Maximum number of symbols per frame.
-        Used only when --decoding_method is greedy_search""",
-    )
-
-    parser.add_argument(
-        "--num-paths",
-        type=int,
-        default=200,
-        help="""Number of paths for nbest decoding.
-        Used only when the decoding method is fast_beam_search_nbest,
-        fast_beam_search_nbest_LG, and fast_beam_search_nbest_oracle""",
-    )
-
-    parser.add_argument(
-        "--nbest-scale",
-        type=float,
-        default=0.5,
-        help="""Scale applied to lattice scores when computing nbest paths.
-        Used only when the decoding method is fast_beam_search_nbest,
-        fast_beam_search_nbest_LG, and fast_beam_search_nbest_oracle""",
-    )
-
-    parser.add_argument(
-        "--save-masks",
-        type=str2bool,
-        default=False,
-        help="""If true, save masks generated by unmixing module.""",
-    )
-
-    add_model_arguments(parser)
-
-    return parser
-
-
-def decode_one_batch(
-    params: AttributeDict,
-    model: nn.Module,
-    sp: spm.SentencePieceProcessor,
-    batch: dict,
-    word_table: Optional[k2.SymbolTable] = None,
-    decoding_graph: Optional[k2.Fsa] = None,
-) -> Dict[str, List[List[str]]]:
-    """Decode one batch and return the result in a dict. The dict has the
-    following format:
-
-        - key: It indicates the setting used for decoding. For example,
-               if greedy_search is used, it would be "greedy_search"
-               If beam search with a beam size of 7 is used, it would be
-               "beam_7"
-        - value: It contains the decoding result. `len(value)` equals to
-                 batch size. `value[i]` is the decoding result for the i-th
-                 utterance in the given batch.
-    Args:
-      params:
-        It's the return value of :func:`get_params`.
-      model:
-        The neural model.
-      sp:
-        The BPE model.
-      batch:
-        It is the return value from iterating
-        `lhotse.dataset.K2SpeechRecognitionDataset`. See its documentation
-        for the format of the `batch`.
-      word_table:
-        The word symbol table.
-      decoding_graph:
-        The decoding graph. Can be either a `k2.trivial_graph` or HLG, Used
-        only when --decoding_method is fast_beam_search, fast_beam_search_nbest,
-        fast_beam_search_nbest_oracle, and fast_beam_search_nbest_LG.
-    Returns:
-      Return the decoding result. See above description for the format of
-      the returned dict.
-    """
-    device = next(model.parameters()).device
-    feature = batch["inputs"]
-    assert feature.ndim == 3
-
-    feature = feature.to(device)
-    feature_lens = batch["input_lens"].to(device)
-
-    # Apply the mask encoder
-    B, T, F = feature.shape
-    processed = model.mask_encoder(feature)  # B,T,F*num_channels
-    masks = processed.view(B, T, F, params.num_channels).unbind(dim=-1)
-    x_masked = [feature * m for m in masks]
-
-    # Recognition
-    # Stack the inputs along the batch axis
-    h = torch.cat(x_masked, dim=0)
-    h_lens = torch.cat([feature_lens for _ in range(params.num_channels)], dim=0)
-    encoder_out, encoder_out_lens = model.encoder(x=h, x_lens=h_lens)
-
-    def _group_channels(hyps: List[str]) -> List[List[str]]:
-        """
-        Currently we have a batch of size M*B, where M is the number of
-        channels and B is the batch size. We need to group the hypotheses
-        into B groups, each of which contains M hypotheses.
-
-        Example:
-            hyps = ['a1', 'b1', 'c1', 'a2', 'b2', 'c2']
-            _group_channels(hyps) = [['a1', 'a2'], ['b1', 'b2'], ['c1', 'c2']]
-        """
-        assert len(hyps) == B * params.num_channels
-        out_hyps = []
-        for i in range(B):
-            out_hyps.append(hyps[i::B])
-        return out_hyps
-
-    hyps = []
-    if params.decoding_method == "fast_beam_search":
-        hyp_tokens = fast_beam_search_one_best(
-            model=model,
-            decoding_graph=decoding_graph,
-            encoder_out=encoder_out,
-            encoder_out_lens=encoder_out_lens,
-            beam=params.beam,
-            max_contexts=params.max_contexts,
-            max_states=params.max_states,
-        )
-        for hyp in sp.decode(hyp_tokens):
-            hyps.append(hyp)
-    elif params.decoding_method == "greedy_search" and params.max_sym_per_frame == 1:
-        hyp_tokens = greedy_search_batch(
-            model=model,
-            encoder_out=encoder_out,
-            encoder_out_lens=encoder_out_lens,
-        )
-        for hyp in sp.decode(hyp_tokens):
-            hyps.append(hyp)
-    elif params.decoding_method == "modified_beam_search":
-        hyp_tokens = modified_beam_search(
-            model=model,
-            encoder_out=encoder_out,
-            encoder_out_lens=encoder_out_lens,
-            beam=params.beam_size,
-        )
-        for hyp in sp.decode(hyp_tokens):
-            hyps.append(hyp)
-    else:
-        batch_size = encoder_out.size(0)
-
-        for i in range(batch_size):
-            # fmt: off
-            encoder_out_i = encoder_out[i:i+1, :encoder_out_lens[i]]
-            # fmt: on
-            if params.decoding_method == "greedy_search":
-                hyp = greedy_search(
-                    model=model,
-                    encoder_out=encoder_out_i,
-                    max_sym_per_frame=params.max_sym_per_frame,
-                )
-            elif params.decoding_method == "beam_search":
-                hyp = beam_search(
-                    model=model,
-                    encoder_out=encoder_out_i,
-                    beam=params.beam_size,
-                )
-            else:
-                raise ValueError(
-                    f"Unsupported decoding method: {params.decoding_method}"
-                )
-            hyps.append(sp.decode(hyp))
-
-    if params.decoding_method == "greedy_search":
-        return {"greedy_search": _group_channels(hyps)}
-    elif "fast_beam_search" in params.decoding_method:
-        key = f"beam_{params.beam}_"
-        key += f"max_contexts_{params.max_contexts}_"
-        key += f"max_states_{params.max_states}"
-        if "nbest" in params.decoding_method:
-            key += f"_num_paths_{params.num_paths}_"
-            key += f"nbest_scale_{params.nbest_scale}"
-            if "LG" in params.decoding_method:
-                key += f"_ngram_lm_scale_{params.ngram_lm_scale}"
-
-        return {key: _group_channels(hyps)}
-    else:
-        return {f"beam_size_{params.beam_size}": _group_channels(hyps)}
-
-
-def decode_dataset(
-    dl: torch.utils.data.DataLoader,
-    params: AttributeDict,
-    model: nn.Module,
-    sp: spm.SentencePieceProcessor,
-    word_table: Optional[k2.SymbolTable] = None,
-    decoding_graph: Optional[k2.Fsa] = None,
-) -> Dict[str, List[Tuple[str, List[str], List[str]]]]:
-    """Decode dataset.
-
-    Args:
-      dl:
-        PyTorch's dataloader containing the dataset to decode.
-      params:
-        It is returned by :func:`get_params`.
-      model:
-        The neural model.
-      sp:
-        The BPE model.
-      word_table:
-        The word symbol table.
-      decoding_graph:
-        The decoding graph. Can be either a `k2.trivial_graph` or HLG, Used
-        only when --decoding_method is fast_beam_search, fast_beam_search_nbest,
-        fast_beam_search_nbest_oracle, and fast_beam_search_nbest_LG.
-    Returns:
-      Return a dict, whose key may be "greedy_search" if greedy search
-      is used, or it may be "beam_7" if beam size of 7 is used.
-      Its value is a list of tuples. Each tuple contains two elements:
-      The first is the reference transcript, and the second is the
-      predicted result.
-    """
-    num_cuts = 0
-
-    try:
-        num_batches = len(dl)
-    except TypeError:
-        num_batches = "?"
-
-    if params.decoding_method == "greedy_search":
-        log_interval = 50
-    else:
-        log_interval = 20
-
-    results = defaultdict(list)
-    for batch_idx, batch in enumerate(dl):
-        cut_ids = [cut.id for cut in batch["cuts"]]
-        cuts_batch = batch["cuts"]
-
-        hyps_dict = decode_one_batch(
-            params=params,
-            model=model,
-            sp=sp,
-            decoding_graph=decoding_graph,
-            word_table=word_table,
-            batch=batch,
-        )
-
-        for name, hyps in hyps_dict.items():
-            this_batch = []
-            for cut_id, hyp_words in zip(cut_ids, hyps):
-                # Reference is a list of supervision texts sorted by start time.
-                ref_words = [
-                    s.text.strip()
-                    for s in sorted(
-                        cuts_batch[cut_id].supervisions, key=lambda s: s.start
-                    )
-                ]
-                this_batch.append((cut_id, ref_words, hyp_words))
-
-            results[name].extend(this_batch)
-
-        num_cuts += len(cut_ids)
-
-        if batch_idx % log_interval == 0:
-            batch_str = f"{batch_idx}/{num_batches}"
-
-            logging.info(f"batch {batch_str}, cuts processed until now is {num_cuts}")
-    return results
-
-
-def save_results(
-    params: AttributeDict,
-    test_set_name: str,
-    results_dict: Dict[str, List[Tuple[str, List[str], List[str]]]],
-):
-    test_set_wers = dict()
-    for key, results in results_dict.items():
-        recog_path = (
-            params.res_dir / f"recogs-{test_set_name}-{key}-{params.suffix}.txt"
-        )
-        results = sorted(results)
-        store_transcripts(filename=recog_path, texts=results)
-        logging.info(f"The transcripts are stored in {recog_path}")
-
-        # The following prints out WERs, per-word error statistics and aligned
-        # ref/hyp pairs.
-        errs_filename = (
-            params.res_dir / f"errs-{test_set_name}-{key}-{params.suffix}.txt"
-        )
-        with open(errs_filename, "w") as f:
-            wer = write_surt_error_stats(
-                f,
-                f"{test_set_name}-{key}",
-                results,
-                enable_log=True,
-                num_channels=params.num_channels,
-            )
-            test_set_wers[key] = wer
-
-        logging.info("Wrote detailed error stats to {}".format(errs_filename))
-
-    test_set_wers = sorted(test_set_wers.items(), key=lambda x: x[1])
-    errs_info = (
-        params.res_dir / f"wer-summary-{test_set_name}-{key}-{params.suffix}.txt"
-    )
-    with open(errs_info, "w") as f:
-        print("settings\tWER", file=f)
-        for key, val in test_set_wers:
-            print("{}\t{}".format(key, val), file=f)
-
-    s = "\nFor {}, WER of different settings are:\n".format(test_set_name)
-    note = "\tbest for {}".format(test_set_name)
-    for key, val in test_set_wers:
-        s += "{}\t{}{}\n".format(key, val, note)
-        note = ""
-    logging.info(s)
-
-
-def save_masks(
-    params: AttributeDict,
-    test_set_name: str,
-    masks: List[torch.Tensor],
-):
-    masks_path = params.res_dir / f"masks-{test_set_name}.txt"
-    torch.save(masks, masks_path)
-    logging.info(f"The masks are stored in {masks_path}")
-
-
-@torch.no_grad()
-def main():
-    parser = get_parser()
-    LibrimixAsrDataModule.add_arguments(parser)
-    args = parser.parse_args()
-    args.exp_dir = Path(args.exp_dir)
-
-    params = get_params()
-    params.update(vars(args))
-
-    assert params.decoding_method in (
-        "greedy_search",
-        "beam_search",
-        "fast_beam_search",
-        "fast_beam_search_nbest",
-        "fast_beam_search_nbest_LG",
-        "fast_beam_search_nbest_oracle",
-        "modified_beam_search",
-    )
-    params.res_dir = params.exp_dir / params.decoding_method
-
-    if params.iter > 0:
-        params.suffix = f"iter-{params.iter}-avg-{params.avg}"
-    else:
-        params.suffix = f"epoch-{params.epoch}-avg-{params.avg}"
-
-    if "fast_beam_search" in params.decoding_method:
-        params.suffix += f"-beam-{params.beam}"
-        params.suffix += f"-max-contexts-{params.max_contexts}"
-        params.suffix += f"-max-states-{params.max_states}"
-        if "nbest" in params.decoding_method:
-            params.suffix += f"-nbest-scale-{params.nbest_scale}"
-            params.suffix += f"-num-paths-{params.num_paths}"
-            if "LG" in params.decoding_method:
-                params.suffix += f"-ngram-lm-scale-{params.ngram_lm_scale}"
-    elif "beam_search" in params.decoding_method:
-        params.suffix += f"-{params.decoding_method}-beam-size-{params.beam_size}"
-    else:
-        params.suffix += f"-context-{params.context_size}"
-        params.suffix += f"-max-sym-per-frame-{params.max_sym_per_frame}"
-
-    if params.use_averaged_model:
-        params.suffix += "-use-averaged-model"
-
-    setup_logger(f"{params.res_dir}/log-decode-{params.suffix}")
-    logging.info("Decoding started")
-
-    device = torch.device("cpu")
-    if torch.cuda.is_available():
-        device = torch.device("cuda", 0)
-
-    logging.info(f"Device: {device}")
-
-    sp = spm.SentencePieceProcessor()
-    sp.load(params.bpe_model)
-
-    # <blk> and <unk> are defined in local/train_bpe_model.py
-    params.blank_id = sp.piece_to_id("<blk>")
-    params.unk_id = sp.piece_to_id("<unk>")
-    params.vocab_size = sp.get_piece_size()
-
-    logging.info(params)
-
-    logging.info("About to create model")
-    model = get_surt_model(params)
-    assert model.encoder.decode_chunk_size == params.decode_chunk_len // 2, (
-        model.encoder.decode_chunk_size,
-        params.decode_chunk_len,
-    )
-
-    if not params.use_averaged_model:
-        if params.iter > 0:
-            filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
-                : params.avg
-            ]
-            if len(filenames) == 0:
-                raise ValueError(
-                    f"No checkpoints found for"
-                    f" --iter {params.iter}, --avg {params.avg}"
-                )
-            elif len(filenames) < params.avg:
-                raise ValueError(
-                    f"Not enough checkpoints ({len(filenames)}) found for"
-                    f" --iter {params.iter}, --avg {params.avg}"
-                )
-            logging.info(f"averaging {filenames}")
-            model.to(device)
-            model.load_state_dict(average_checkpoints(filenames, device=device))
-        elif params.avg == 1:
-            load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
-        else:
-            start = params.epoch - params.avg + 1
-            filenames = []
-            for i in range(start, params.epoch + 1):
-                if i >= 1:
-                    filenames.append(f"{params.exp_dir}/epoch-{i}.pt")
-            logging.info(f"averaging {filenames}")
-            model.to(device)
-            model.load_state_dict(average_checkpoints(filenames, device=device))
-    else:
-        if params.iter > 0:
-            filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
-                : params.avg + 1
-            ]
-            if len(filenames) == 0:
-                raise ValueError(
-                    f"No checkpoints found for"
-                    f" --iter {params.iter}, --avg {params.avg}"
-                )
-            elif len(filenames) < params.avg + 1:
-                raise ValueError(
-                    f"Not enough checkpoints ({len(filenames)}) found for"
-                    f" --iter {params.iter}, --avg {params.avg}"
-                )
-            filename_start = filenames[-1]
-            filename_end = filenames[0]
-            logging.info(
-                "Calculating the averaged model over iteration checkpoints"
-                f" from {filename_start} (excluded) to {filename_end}"
-            )
-            model.to(device)
-            model.load_state_dict(
-                average_checkpoints_with_averaged_model(
-                    filename_start=filename_start,
-                    filename_end=filename_end,
-                    device=device,
-                )
-            )
-        else:
-            assert params.avg > 0, params.avg
-            start = params.epoch - params.avg
-            assert start >= 1, start
-            filename_start = f"{params.exp_dir}/epoch-{start}.pt"
-            filename_end = f"{params.exp_dir}/epoch-{params.epoch}.pt"
-            logging.info(
-                f"Calculating the averaged model over epoch range from "
-                f"{start} (excluded) to {params.epoch}"
-            )
-            model.to(device)
-            model.load_state_dict(
-                average_checkpoints_with_averaged_model(
-                    filename_start=filename_start,
-                    filename_end=filename_end,
-                    device=device,
-                )
-            )
-
-    model.to(device)
-    model.eval()
-
-    if "fast_beam_search" in params.decoding_method:
-        if params.decoding_method == "fast_beam_search_nbest_LG":
-            lexicon = Lexicon(params.lang_dir)
-            word_table = lexicon.word_table
-            lg_filename = params.lang_dir / "LG.pt"
-            logging.info(f"Loading {lg_filename}")
-            decoding_graph = k2.Fsa.from_dict(
-                torch.load(lg_filename, map_location=device)
-            )
-            decoding_graph.scores *= params.ngram_lm_scale
-        else:
-            word_table = None
-            decoding_graph = k2.trivial_graph(params.vocab_size - 1, device=device)
-    else:
-        decoding_graph = None
-        word_table = None
-
-    num_param = sum([p.numel() for p in model.parameters()])
-    logging.info(f"Number of model parameters: {num_param}")
-
-    # we need cut ids to display recognition results.
-    args.return_cuts = True
-    librimix = LibrimixAsrDataModule(args)
-
-    dev_cuts = librimix.libricss_cuts(split="dev", type="ihm-mix").to_eager()
-    dev_cuts_grouped = [dev_cuts.filter(lambda x: ol in x.id) for ol in OVERLAP_RATIOS]
-    test_cuts = librimix.libricss_cuts(split="test", type="ihm-mix").to_eager()
-    test_cuts_grouped = [
-        test_cuts.filter(lambda x: ol in x.id) for ol in OVERLAP_RATIOS
-    ]
-
-    for dev_set, ol in zip(dev_cuts_grouped, OVERLAP_RATIOS):
-        dev_dl = librimix.test_dataloaders(dev_set)
-        results_dict = decode_dataset(
-            dl=dev_dl,
-            params=params,
-            model=model,
-            sp=sp,
-            word_table=word_table,
-            decoding_graph=decoding_graph,
-        )
-
-        save_results(
-            params=params,
-            test_set_name=f"dev_{ol}",
-            results_dict=results_dict,
-        )
-
-    # if params.save_masks:
-    #     save_masks(
-    #         params=params,
-    #         test_set_name=f"dev_{ol}",
-    #         masks=masks,
-    #     )
-
-    # for test_set, ol in zip(test_cuts_grouped, OVERLAP_RATIOS):
-    #     test_dl = librimix.test_dataloaders(test_set)
-    #     results_dict = decode_dataset(
-    #         dl=test_dl,
-    #         params=params,
-    #         model=model,
-    #         sp=sp,
-    #         word_table=word_table,
-    #         decoding_graph=decoding_graph,
-    #     )
-
-    #     save_results(
-    #         params=params,
-    #         test_set_name=f"test_{ol}",
-    #         results_dict=results_dict,
-    #     )
-
-    # if params.save_masks:
-    #     save_masks(
-    #         params=params,
-    #         test_set_name=f"test_{ol}",
-    #         masks=masks,
-    #     )
-
-    logging.info("Done!")
-
-
-if __name__ == "__main__":
-    main()
--- a/egs/libricss/SURT/dprnn_pruned_transducer_stateless7/decode_stream.py
+++ b/egs/libricss/SURT/dprnn_pruned_transducer_stateless7/decode_stream.py
@ -1,151 +0,0 @@
-# Copyright    2022  Xiaomi Corp.        (authors: Wei Kang,
-#                                                  Zengwei Yao)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import math
-from typing import List, Optional, Tuple
-
-import k2
-import torch
-from beam_search import Hypothesis, HypothesisList
-
-from icefall.utils import AttributeDict
-
-
-class DecodeStream(object):
-    def __init__(
-        self,
-        params: AttributeDict,
-        cut_id: str,
-        initial_states: List[torch.Tensor],
-        decoding_graph: Optional[k2.Fsa] = None,
-        device: torch.device = torch.device("cpu"),
-    ) -> None:
-        """
-        Args:
-          initial_states:
-            Initial decode states of the model, e.g. the return value of
-            `get_init_state` in conformer.py
-          decoding_graph:
-            Decoding graph used for decoding, may be a TrivialGraph or a HLG.
-            Used only when decoding_method is fast_beam_search.
-          device:
-            The device to run this stream.
-        """
-        if params.decoding_method == "fast_beam_search":
-            assert decoding_graph is not None
-            assert device == decoding_graph.device
-
-        self.params = params
-        self.cut_id = cut_id
-        self.LOG_EPS = math.log(1e-10)
-
-        self.states = initial_states
-
-        # It contains a 2-D tensors representing the feature frames.
-        self.features: torch.Tensor = None
-
-        self.num_frames: int = 0
-        # how many frames have been processed. (before subsampling).
-        # we only modify this value in `func:get_feature_frames`.
-        self.num_processed_frames: int = 0
-
-        self._done: bool = False
-
-        # The transcript of current utterance.
-        self.ground_truth: str = ""
-
-        # The decoding result (partial or final) of current utterance.
-        self.hyp: List = []
-
-        # how many frames have been processed, after subsampling (i.e. a
-        # cumulative sum of the second return value of
-        # encoder.streaming_forward
-        self.done_frames: int = 0
-
-        # It has two steps of feature subsampling in zipformer: out_lens=((x_lens-7)//2+1)//2
-        # 1) feature embedding: out_lens=(x_lens-7)//2
-        # 2) output subsampling: out_lens=(out_lens+1)//2
-        self.pad_length = 7
-
-        if params.decoding_method == "greedy_search":
-            self.hyp = [params.blank_id] * params.context_size
-        elif params.decoding_method == "modified_beam_search":
-            self.hyps = HypothesisList()
-            self.hyps.add(
-                Hypothesis(
-                    ys=[params.blank_id] * params.context_size,
-                    log_prob=torch.zeros(1, dtype=torch.float32, device=device),
-                )
-            )
-        elif params.decoding_method == "fast_beam_search":
-            # The rnnt_decoding_stream for fast_beam_search.
-            self.rnnt_decoding_stream: k2.RnntDecodingStream = k2.RnntDecodingStream(
-                decoding_graph
-            )
-        else:
-            raise ValueError(f"Unsupported decoding method: {params.decoding_method}")
-
-    @property
-    def done(self) -> bool:
-        """Return True if all the features are processed."""
-        return self._done
-
-    @property
-    def id(self) -> str:
-        return self.cut_id
-
-    def set_features(
-        self,
-        features: torch.Tensor,
-        tail_pad_len: int = 0,
-    ) -> None:
-        """Set features tensor of current utterance."""
-        assert features.dim() == 2, features.dim()
-        self.features = torch.nn.functional.pad(
-            features,
-            (0, 0, 0, self.pad_length + tail_pad_len),
-            mode="constant",
-            value=self.LOG_EPS,
-        )
-        self.num_frames = self.features.size(0)
-
-    def get_feature_frames(self, chunk_size: int) -> Tuple[torch.Tensor, int]:
-        """Consume chunk_size frames of features"""
-        chunk_length = chunk_size + self.pad_length
-
-        ret_length = min(self.num_frames - self.num_processed_frames, chunk_length)
-
-        ret_features = self.features[
-            self.num_processed_frames : self.num_processed_frames + ret_length  # noqa
-        ]
-
-        self.num_processed_frames += chunk_size
-        if self.num_processed_frames >= self.num_frames:
-            self._done = True
-
-        return ret_features, ret_length
-
-    def decoding_result(self) -> List[int]:
-        """Obtain current decoding result."""
-        if self.params.decoding_method == "greedy_search":
-            return self.hyp[self.params.context_size :]  # noqa
-        elif self.params.decoding_method == "modified_beam_search":
-            best_hyp = self.hyps.get_most_probable(length_norm=True)
-            return best_hyp.ys[self.params.context_size :]  # noqa
-        else:
-            assert self.params.decoding_method == "fast_beam_search"
-            return self.hyp
--- a/egs/libricss/SURT/dprnn_pruned_transducer_stateless7/decoder.py
+++ b/egs/libricss/SURT/dprnn_pruned_transducer_stateless7/decoder.py
@ -1,102 +0,0 @@
-# Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-
-class Decoder(nn.Module):
-    """This class modifies the stateless decoder from the following paper:
-
-        RNN-transducer with stateless prediction network
-        https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=9054419
-
-    It removes the recurrent connection from the decoder, i.e., the prediction
-    network. Different from the above paper, it adds an extra Conv1d
-    right after the embedding layer.
-
-    TODO: Implement https://arxiv.org/pdf/2109.07513.pdf
-    """
-
-    def __init__(
-        self,
-        vocab_size: int,
-        decoder_dim: int,
-        blank_id: int,
-        context_size: int,
-    ):
-        """
-        Args:
-          vocab_size:
-            Number of tokens of the modeling unit including blank.
-          decoder_dim:
-            Dimension of the input embedding, and of the decoder output.
-          blank_id:
-            The ID of the blank symbol.
-          context_size:
-            Number of previous words to use to predict the next word.
-            1 means bigram; 2 means trigram. n means (n+1)-gram.
-        """
-        super().__init__()
-
-        self.embedding = nn.Embedding(
-            num_embeddings=vocab_size,
-            embedding_dim=decoder_dim,
-            padding_idx=blank_id,
-        )
-        self.blank_id = blank_id
-
-        assert context_size >= 1, context_size
-        self.context_size = context_size
-        self.vocab_size = vocab_size
-        if context_size > 1:
-            self.conv = nn.Conv1d(
-                in_channels=decoder_dim,
-                out_channels=decoder_dim,
-                kernel_size=context_size,
-                padding=0,
-                groups=decoder_dim // 4,  # group size == 4
-                bias=False,
-            )
-
-    def forward(self, y: torch.Tensor, need_pad: bool = True) -> torch.Tensor:
-        """
-        Args:
-          y:
-            A 2-D tensor of shape (N, U).
-          need_pad:
-            True to left pad the input. Should be True during training.
-            False to not pad the input. Should be False during inference.
-        Returns:
-          Return a tensor of shape (N, U, decoder_dim).
-        """
-        y = y.to(torch.int64)
-        # this stuff about clamp() is a temporary fix for a mismatch
-        # at utterance start, we use negative ids in beam_search.py
-        embedding_out = self.embedding(y.clamp(min=0)) * (y >= 0).unsqueeze(-1)
-        if self.context_size > 1:
-            embedding_out = embedding_out.permute(0, 2, 1)
-            if need_pad is True:
-                embedding_out = F.pad(embedding_out, pad=(self.context_size - 1, 0))
-            else:
-                # During inference time, there is no need to do extra padding
-                # as we only need one output
-                assert embedding_out.size(-1) == self.context_size
-            embedding_out = self.conv(embedding_out)
-            embedding_out = embedding_out.permute(0, 2, 1)
-        embedding_out = F.relu(embedding_out)
-        return embedding_out
--- a/egs/libricss/SURT/dprnn_pruned_transducer_stateless7/dprnn.py
+++ b/egs/libricss/SURT/dprnn_pruned_transducer_stateless7/dprnn.py
@ -1,304 +0,0 @@
-import random
-from typing import Optional, Tuple
-
-import torch
-import torch.nn as nn
-from einops import rearrange
-from scaling import ActivationBalancer, BasicNorm, DoubleSwish, ScaledLinear, ScaledLSTM
-from torch.autograd import Variable
-
-EPS = torch.finfo(torch.get_default_dtype()).eps
-
-
-def _pad_segment(input, segment_size):
-    # Source: https://github.com/espnet/espnet/blob/master/espnet2/enh/layers/dprnn.py#L342
-    # input is the features: (B, N, T)
-    batch_size, dim, seq_len = input.shape
-    segment_stride = segment_size // 2
-
-    rest = segment_size - (segment_stride + seq_len % segment_size) % segment_size
-    if rest > 0:
-        pad = Variable(torch.zeros(batch_size, dim, rest)).type(input.type())
-        input = torch.cat([input, pad], 2)
-
-    pad_aux = Variable(torch.zeros(batch_size, dim, segment_stride)).type(input.type())
-    input = torch.cat([pad_aux, input, pad_aux], 2)
-
-    return input, rest
-
-
-def split_feature(input, segment_size):
-    # Source: https://github.com/espnet/espnet/blob/master/espnet2/enh/layers/dprnn.py#L358
-    # split the feature into chunks of segment size
-    # input is the features: (B, N, T)
-
-    input, rest = _pad_segment(input, segment_size)
-    batch_size, dim, seq_len = input.shape
-    segment_stride = segment_size // 2
-
-    segments1 = (
-        input[:, :, :-segment_stride]
-        .contiguous()
-        .view(batch_size, dim, -1, segment_size)
-    )
-    segments2 = (
-        input[:, :, segment_stride:]
-        .contiguous()
-        .view(batch_size, dim, -1, segment_size)
-    )
-    segments = (
-        torch.cat([segments1, segments2], 3)
-        .view(batch_size, dim, -1, segment_size)
-        .transpose(2, 3)
-    )
-
-    return segments.contiguous(), rest
-
-
-def merge_feature(input, rest):
-    # Source: https://github.com/espnet/espnet/blob/master/espnet2/enh/layers/dprnn.py#L385
-    # merge the splitted features into full utterance
-    # input is the features: (B, N, L, K)
-
-    batch_size, dim, segment_size, _ = input.shape
-    segment_stride = segment_size // 2
-    input = (
-        input.transpose(2, 3).contiguous().view(batch_size, dim, -1, segment_size * 2)
-    )  # B, N, K, L
-
-    input1 = (
-        input[:, :, :, :segment_size]
-        .contiguous()
-        .view(batch_size, dim, -1)[:, :, segment_stride:]
-    )
-    input2 = (
-        input[:, :, :, segment_size:]
-        .contiguous()
-        .view(batch_size, dim, -1)[:, :, :-segment_stride]
-    )
-
-    output = input1 + input2
-    if rest > 0:
-        output = output[:, :, :-rest]
-
-    return output.contiguous()  # B, N, T
-
-
-class RNNEncoderLayer(nn.Module):
-    """
-    RNNEncoderLayer is made up of lstm and feedforward networks.
-    Args:
-      input_size:
-        The number of expected features in the input (required).
-      hidden_size:
-        The hidden dimension of rnn layer.
-      dropout:
-        The dropout value (default=0.1).
-      layer_dropout:
-        The dropout value for model-level warmup (default=0.075).
-    """
-
-    def __init__(
-        self,
-        input_size: int,
-        hidden_size: int,
-        dropout: float = 0.1,
-        bidirectional: bool = False,
-    ) -> None:
-        super(RNNEncoderLayer, self).__init__()
-        self.input_size = input_size
-        self.hidden_size = hidden_size
-
-        assert hidden_size >= input_size, (hidden_size, input_size)
-        self.lstm = ScaledLSTM(
-            input_size=input_size,
-            hidden_size=hidden_size // 2 if bidirectional else hidden_size,
-            proj_size=0,
-            num_layers=1,
-            dropout=0.0,
-            batch_first=True,
-            bidirectional=bidirectional,
-        )
-        self.norm_final = BasicNorm(input_size)
-
-        # try to ensure the output is close to zero-mean (or at least, zero-median).  # noqa
-        self.balancer = ActivationBalancer(
-            num_channels=input_size,
-            channel_dim=-1,
-            min_positive=0.45,
-            max_positive=0.55,
-            max_abs=6.0,
-        )
-        self.dropout = nn.Dropout(dropout)
-
-    def forward(
-        self,
-        src: torch.Tensor,
-        states: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
-        warmup: float = 1.0,
-    ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
-        """
-        Pass the input through the encoder layer.
-        Args:
-          src:
-            The sequence to the encoder layer (required).
-            Its shape is (S, N, E), where S is the sequence length,
-            N is the batch size, and E is the feature number.
-          states:
-            A tuple of 2 tensors (optional). It is for streaming inference.
-            states[0] is the hidden states of all layers,
-              with shape of (1, N, input_size);
-            states[1] is the cell states of all layers,
-              with shape of (1, N, hidden_size).
-        """
-        src_orig = src
-
-        # alpha = 1.0 means fully use this encoder layer, 0.0 would mean
-        # completely bypass it.
-        alpha = warmup if self.training else 1.0
-
-        # lstm module
-        src_lstm, new_states = self.lstm(src, states)
-        src = self.dropout(src_lstm) + src
-        src = self.norm_final(self.balancer(src))
-
-        if alpha != 1.0:
-            src = alpha * src + (1 - alpha) * src_orig
-
-        return src
-
-
-# dual-path RNN
-class DPRNN(nn.Module):
-    """Deep dual-path RNN.
-    Source: https://github.com/espnet/espnet/blob/master/espnet2/enh/layers/dprnn.py
-
-    args:
-        input_size: int, dimension of the input feature. The input should have shape
-                    (batch, seq_len, input_size).
-        hidden_size: int, dimension of the hidden state.
-        output_size: int, dimension of the output size.
-        dropout: float, dropout ratio. Default is 0.
-        num_blocks: int, number of stacked RNN layers. Default is 1.
-    """
-
-    def __init__(
-        self,
-        feature_dim,
-        input_size,
-        hidden_size,
-        output_size,
-        dropout=0.1,
-        num_blocks=1,
-        segment_size=50,
-        chunk_width_randomization=False,
-    ):
-        super().__init__()
-
-        self.input_size = input_size
-        self.output_size = output_size
-        self.hidden_size = hidden_size
-
-        self.segment_size = segment_size
-        self.chunk_width_randomization = chunk_width_randomization
-
-        self.input_embed = nn.Sequential(
-            ScaledLinear(feature_dim, input_size),
-            BasicNorm(input_size),
-            ActivationBalancer(
-                num_channels=input_size,
-                channel_dim=-1,
-                min_positive=0.45,
-                max_positive=0.55,
-            ),
-        )
-
-        # dual-path RNN
-        self.row_rnn = nn.ModuleList([])
-        self.col_rnn = nn.ModuleList([])
-        for _ in range(num_blocks):
-            # intra-RNN is non-causal
-            self.row_rnn.append(
-                RNNEncoderLayer(
-                    input_size, hidden_size, dropout=dropout, bidirectional=True
-                )
-            )
-            self.col_rnn.append(
-                RNNEncoderLayer(
-                    input_size, hidden_size, dropout=dropout, bidirectional=False
-                )
-            )
-
-        # output layer
-        self.out_embed = nn.Sequential(
-            ScaledLinear(input_size, output_size),
-            BasicNorm(output_size),
-            ActivationBalancer(
-                num_channels=output_size,
-                channel_dim=-1,
-                min_positive=0.45,
-                max_positive=0.55,
-            ),
-        )
-
-    def forward(self, input):
-        # input shape: B, T, F
-        input = self.input_embed(input)
-        B, T, D = input.shape
-
-        if self.chunk_width_randomization and self.training:
-            segment_size = random.randint(self.segment_size // 2, self.segment_size)
-        else:
-            segment_size = self.segment_size
-        input, rest = split_feature(input.transpose(1, 2), segment_size)
-        # input shape: batch, N, dim1, dim2
-        # apply RNN on dim1 first and then dim2
-        # output shape: B, output_size, dim1, dim2
-        # input = input.to(device)
-        batch_size, _, dim1, dim2 = input.shape
-        output = input
-        for i in range(len(self.row_rnn)):
-            row_input = (
-                output.permute(0, 3, 2, 1)
-                .contiguous()
-                .view(batch_size * dim2, dim1, -1)
-            )  # B*dim2, dim1, N
-            output = self.row_rnn[i](row_input)  # B*dim2, dim1, H
-            output = (
-                output.view(batch_size, dim2, dim1, -1).permute(0, 3, 2, 1).contiguous()
-            )  # B, N, dim1, dim2
-
-            col_input = (
-                output.permute(0, 2, 3, 1)
-                .contiguous()
-                .view(batch_size * dim1, dim2, -1)
-            )  # B*dim1, dim2, N
-            output = self.col_rnn[i](col_input)  # B*dim1, dim2, H
-            output = (
-                output.view(batch_size, dim1, dim2, -1).permute(0, 3, 1, 2).contiguous()
-            )  # B, N, dim1, dim2
-
-        output = merge_feature(output, rest)
-        output = output.transpose(1, 2)
-        output = self.out_embed(output)
-
-        # Apply ReLU to the output
-        output = torch.relu(output)
-
-        return output
-
-
-if __name__ == "__main__":
-
-    model = DPRNN(
-        80,
-        256,
-        256,
-        160,
-        dropout=0.1,
-        num_blocks=3,
-        segment_size=20,
-        chunk_width_randomization=True,
-    )
-    input = torch.randn(2, 1002, 80)
-    print(model(input).shape)
--- a/egs/libricss/SURT/dprnn_pruned_transducer_stateless7/encoder_interface.py
+++ b/egs/libricss/SURT/dprnn_pruned_transducer_stateless7/encoder_interface.py
@ -1,43 +0,0 @@
-# Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from typing import Tuple
-
-import torch
-import torch.nn as nn
-
-
-class EncoderInterface(nn.Module):
-    def forward(
-        self, x: torch.Tensor, x_lens: torch.Tensor
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        """
-        Args:
-          x:
-            A tensor of shape (batch_size, input_seq_len, num_features)
-            containing the input features.
-          x_lens:
-            A tensor of shape (batch_size,) containing the number of frames
-            in `x` before padding.
-        Returns:
-          Return a tuple containing two tensors:
-            - encoder_out, a tensor of (batch_size, out_seq_len, output_dim)
-              containing unnormalized probabilities, i.e., the output of a
-              linear layer.
-            - encoder_out_lens, a tensor of shape (batch_size,) containing
-              the number of frames in `encoder_out` before padding.
-        """
-        raise NotImplementedError("Please implement it in a subclass")
--- a/egs/libricss/SURT/dprnn_pruned_transducer_stateless7/export.py
+++ b/egs/libricss/SURT/dprnn_pruned_transducer_stateless7/export.py
@ -1,320 +0,0 @@
-#!/usr/bin/env python3
-#
-# Copyright 2021 Xiaomi Corporation (Author: Fangjun Kuang)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# This script converts several saved checkpoints
-# to a single one using model averaging.
-"""
-
-Usage:
-
-(1) Export to torchscript model using torch.jit.script()
-
-./pruned_transducer_stateless7_streaming/export.py \
-  --exp-dir ./pruned_transducer_stateless7_streaming/exp \
-  --bpe-model data/lang_bpe_500/bpe.model \
-  --epoch 30 \
-  --avg 9 \
-  --jit 1
-
-It will generate a file `cpu_jit.pt` in the given `exp_dir`. You can later
-load it by `torch.jit.load("cpu_jit.pt")`.
-
-Note `cpu` in the name `cpu_jit.pt` means the parameters when loaded into Python
-are on CPU. You can use `to("cuda")` to move them to a CUDA device.
-
-Check
-https://github.com/k2-fsa/sherpa
-for how to use the exported models outside of icefall.
-
-(2) Export `model.state_dict()`
-
-./pruned_transducer_stateless7_streaming/export.py \
-  --exp-dir ./pruned_transducer_stateless7_streaming/exp \
-  --bpe-model data/lang_bpe_500/bpe.model \
-  --epoch 20 \
-  --avg 10
-
-It will generate a file `pretrained.pt` in the given `exp_dir`. You can later
-load it by `icefall.checkpoint.load_checkpoint()`.
-
-To use the generated file with `pruned_transducer_stateless7_streaming/decode.py`,
-you can do:
-
-    cd /path/to/exp_dir
-    ln -s pretrained.pt epoch-9999.pt
-
-    cd /path/to/egs/librispeech/ASR
-    ./pruned_transducer_stateless7_streaming/decode.py \
-        --exp-dir ./pruned_transducer_stateless7_streaming/exp \
-        --epoch 9999 \
-        --avg 1 \
-        --max-duration 600 \
-        --decoding-method greedy_search \
-        --bpe-model data/lang_bpe_500/bpe.model
-
-Check ./pretrained.py for its usage.
-
-Note: If you don't want to train a model from scratch, we have
-provided one for you. You can get it at
-
-https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless7-2022-11-11
-
-with the following commands:
-
-    sudo apt-get install git-lfs
-    git lfs install
-    git clone https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless7-2022-11-11
-    # You will find the pre-trained model in icefall-asr-librispeech-pruned-transducer-stateless7-2022-11-11/exp
-"""
-
-import argparse
-import logging
-from pathlib import Path
-
-import sentencepiece as spm
-import torch
-import torch.nn as nn
-from scaling_converter import convert_scaled_to_non_scaled
-from train import add_model_arguments, get_params, get_transducer_model
-
-from icefall.checkpoint import (
-    average_checkpoints,
-    average_checkpoints_with_averaged_model,
-    find_checkpoints,
-    load_checkpoint,
-)
-from icefall.utils import str2bool
-
-
-def get_parser():
-    parser = argparse.ArgumentParser(
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter
-    )
-
-    parser.add_argument(
-        "--epoch",
-        type=int,
-        default=30,
-        help="""It specifies the checkpoint to use for decoding.
-        Note: Epoch counts from 1.
-        You can specify --avg to use more checkpoints for model averaging.""",
-    )
-
-    parser.add_argument(
-        "--iter",
-        type=int,
-        default=0,
-        help="""If positive, --epoch is ignored and it
-        will use the checkpoint exp_dir/checkpoint-iter.pt.
-        You can specify --avg to use more checkpoints for model averaging.
-        """,
-    )
-
-    parser.add_argument(
-        "--avg",
-        type=int,
-        default=9,
-        help="Number of checkpoints to average. Automatically select "
-        "consecutive checkpoints before the checkpoint specified by "
-        "'--epoch' and '--iter'",
-    )
-
-    parser.add_argument(
-        "--use-averaged-model",
-        type=str2bool,
-        default=True,
-        help="Whether to load averaged model. Currently it only supports "
-        "using --epoch. If True, it would decode with the averaged model "
-        "over the epoch range from `epoch-avg` (excluded) to `epoch`."
-        "Actually only the models with epoch number of `epoch-avg` and "
-        "`epoch` are loaded for averaging. ",
-    )
-
-    parser.add_argument(
-        "--exp-dir",
-        type=str,
-        default="pruned_transducer_stateless7_streaming/exp",
-        help="""It specifies the directory where all training related
-        files, e.g., checkpoints, log, etc, are saved
-        """,
-    )
-
-    parser.add_argument(
-        "--bpe-model",
-        type=str,
-        default="data/lang_bpe_500/bpe.model",
-        help="Path to the BPE model",
-    )
-
-    parser.add_argument(
-        "--jit",
-        type=str2bool,
-        default=False,
-        help="""True to save a model after applying torch.jit.script.
-        It will generate a file named cpu_jit.pt
-
-        Check ./jit_pretrained.py for how to use it.
-        """,
-    )
-
-    parser.add_argument(
-        "--context-size",
-        type=int,
-        default=2,
-        help="The context size in the decoder. 1 means bigram; 2 means tri-gram",
-    )
-
-    add_model_arguments(parser)
-
-    return parser
-
-
-@torch.no_grad()
-def main():
-    """Load (averaged) checkpoints and export the model.
-
-    The checkpoints are selected from --iter/--epoch/--avg, either by plain
-    averaging or, with --use-averaged-model, through
-    `average_checkpoints_with_averaged_model`. The model is then moved to
-    CPU, put in eval mode and written to `exp_dir` as `cpu_jit.pt`
-    (TorchScript, when --jit is true) or as `pretrained.pt` (a dict holding
-    the state dict under the key "model").
-
-    Raises:
-      ValueError: if --iter is positive and fewer checkpoints than required
-        are found.
-    """
-    args = get_parser().parse_args()
-    args.exp_dir = Path(args.exp_dir)
-
-    params = get_params()
-    params.update(vars(args))
-
-    # Checkpoints are loaded on the first GPU when one is available.
-    device = torch.device("cpu")
-    if torch.cuda.is_available():
-        device = torch.device("cuda", 0)
-
-    logging.info(f"device: {device}")
-
-    sp = spm.SentencePieceProcessor()
-    sp.load(params.bpe_model)
-
-    # <blk> is defined in local/train_bpe_model.py
-    params.blank_id = sp.piece_to_id("<blk>")
-    params.vocab_size = sp.get_piece_size()
-
-    logging.info(params)
-
-    logging.info("About to create model")
-    model = get_transducer_model(params)
-
-    model.to(device)
-
-    if not params.use_averaged_model:
-        # Plain averaging of the selected checkpoints.
-        if params.iter > 0:
-            # Keep the first `avg` entries returned by find_checkpoints.
-            filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
-                : params.avg
-            ]
-            if len(filenames) == 0:
-                raise ValueError(
-                    f"No checkpoints found for"
-                    f" --iter {params.iter}, --avg {params.avg}"
-                )
-            elif len(filenames) < params.avg:
-                raise ValueError(
-                    f"Not enough checkpoints ({len(filenames)}) found for"
-                    f" --iter {params.iter}, --avg {params.avg}"
-                )
-            logging.info(f"averaging {filenames}")
-            model.to(device)
-            model.load_state_dict(average_checkpoints(filenames, device=device))
-        elif params.avg == 1:
-            # Nothing to average: load the single epoch checkpoint.
-            load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
-        else:
-            # Average epochs (epoch - avg + 1) .. epoch, skipping indices
-            # below 1.
-            start = params.epoch - params.avg + 1
-            filenames = []
-            for i in range(start, params.epoch + 1):
-                if i >= 1:
-                    filenames.append(f"{params.exp_dir}/epoch-{i}.pt")
-            logging.info(f"averaging {filenames}")
-            model.to(device)
-            model.load_state_dict(average_checkpoints(filenames, device=device))
-    else:
-        # Only the two end-point checkpoints are passed to
-        # average_checkpoints_with_averaged_model.
-        if params.iter > 0:
-            # One checkpoint more than `avg` is needed: the start point is
-            # excluded from the average (see the log message below).
-            filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
-                : params.avg + 1
-            ]
-            if len(filenames) == 0:
-                raise ValueError(
-                    f"No checkpoints found for"
-                    f" --iter {params.iter}, --avg {params.avg}"
-                )
-            elif len(filenames) < params.avg + 1:
-                raise ValueError(
-                    f"Not enough checkpoints ({len(filenames)}) found for"
-                    f" --iter {params.iter}, --avg {params.avg}"
-                )
-            # The last list entry is used as the (excluded) start point and
-            # the first entry as the end point.
-            filename_start = filenames[-1]
-            filename_end = filenames[0]
-            logging.info(
-                "Calculating the averaged model over iteration checkpoints"
-                f" from {filename_start} (excluded) to {filename_end}"
-            )
-            model.to(device)
-            model.load_state_dict(
-                average_checkpoints_with_averaged_model(
-                    filename_start=filename_start,
-                    filename_end=filename_end,
-                    device=device,
-                )
-            )
-        else:
-            assert params.avg > 0, params.avg
-            start = params.epoch - params.avg
-            assert start >= 1, start
-            filename_start = f"{params.exp_dir}/epoch-{start}.pt"
-            filename_end = f"{params.exp_dir}/epoch-{params.epoch}.pt"
-            logging.info(
-                f"Calculating the averaged model over epoch range from "
-                f"{start} (excluded) to {params.epoch}"
-            )
-            model.to(device)
-            model.load_state_dict(
-                average_checkpoints_with_averaged_model(
-                    filename_start=filename_start,
-                    filename_end=filename_end,
-                    device=device,
-                )
-            )
-
-    # Export from CPU and in eval mode.
-    model.to("cpu")
-    model.eval()
-
-    if params.jit is True:
-        convert_scaled_to_non_scaled(model, inplace=True)
-        # We won't use the forward() method of the model in C++, so just ignore
-        # it here.
-        # Otherwise, one of its arguments is a ragged tensor and is not
-        # torch scriptable.
-        model.__class__.forward = torch.jit.ignore(model.__class__.forward)
-        logging.info("Using torch.jit.script")
-        model = torch.jit.script(model)
-        filename = params.exp_dir / "cpu_jit.pt"
-        model.save(str(filename))
-        logging.info(f"Saved to {filename}")
-    else:
-        logging.info("Not using torchscript. Export model.state_dict()")
-        # Save it using a format so that it can be loaded
-        # by :func:`load_checkpoint`
-        filename = params.exp_dir / "pretrained.pt"
-        torch.save({"model": model.state_dict()}, str(filename))
-        logging.info(f"Saved to {filename}")
-
-
-if __name__ == "__main__":
-    # Log format: timestamp, level, [file:line], then the message.
-    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
-
-    # Configure INFO-level logging before running the export.
-    logging.basicConfig(format=formatter, level=logging.INFO)
-    main()
--- a/egs/libricss/SURT/dprnn_pruned_transducer_stateless7/joiner.py
+++ b/egs/libricss/SURT/dprnn_pruned_transducer_stateless7/joiner.py
@ -1,65 +0,0 @@
-# Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import torch.nn as nn
-
-
-class Joiner(nn.Module):
-    """Fuse encoder and decoder representations into vocabulary logits.
-
-    The two inputs are (optionally) projected to a shared ``joiner_dim``
-    space, added together, passed through ``tanh`` and finally mapped to
-    ``vocab_size`` outputs.
-    """
-
-    def __init__(
-        self,
-        encoder_dim: int,
-        decoder_dim: int,
-        joiner_dim: int,
-        vocab_size: int,
-    ):
-        super().__init__()
-
-        # Input projections into the joiner space, followed by the output layer.
-        self.encoder_proj = nn.Linear(encoder_dim, joiner_dim)
-        self.decoder_proj = nn.Linear(decoder_dim, joiner_dim)
-        self.output_linear = nn.Linear(joiner_dim, vocab_size)
-
-    def forward(
-        self,
-        encoder_out: torch.Tensor,
-        decoder_out: torch.Tensor,
-        project_input: bool = True,
-    ) -> torch.Tensor:
-        """
-        Args:
-          encoder_out:
-            Output from the encoder. It must be 2-D or 4-D; in the 4-D case
-            its shape is (N, T, s_range, C).
-          decoder_out:
-            Output from the decoder. It must have the same number of
-            dimensions and the same leading dimensions as `encoder_out`.
-          project_input:
-            If true, encoder_proj and decoder_proj are applied here.
-            If false, the caller must already have applied them, and the two
-            inputs are simply added.
-        Returns:
-          Return a tensor with the same leading dimensions as the inputs and
-          a last dimension of size vocab_size.
-        """
-        assert encoder_out.ndim == decoder_out.ndim
-        assert encoder_out.ndim in (2, 4)
-        assert encoder_out.shape[:-1] == decoder_out.shape[:-1]
-
-        if not project_input:
-            combined = encoder_out + decoder_out
-        else:
-            combined = self.encoder_proj(encoder_out) + self.decoder_proj(decoder_out)
-
-        return self.output_linear(torch.tanh(combined))
--- a/egs/libricss/SURT/dprnn_pruned_transducer_stateless7/model.py
+++ b/egs/libricss/SURT/dprnn_pruned_transducer_stateless7/model.py
@ -1,304 +0,0 @@
-# Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang, Wei Kang)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from typing import List, Tuple
-
-import k2
-import torch
-import torch.nn as nn
-from encoder_interface import EncoderInterface
-
-from icefall.utils import add_sos
-
-
-class SURT(nn.Module):
-    """It implements Streaming Unmixing and Recognition Transducer (SURT).
-    https://arxiv.org/abs/2011.13148
-
-    The input features are multiplied by `num_channels` masks produced by
-    `mask_encoder`; the masked copies are stacked along the batch axis and
-    passed through a shared encoder/decoder/joiner to obtain the simple
-    transducer loss, the pruned transducer loss and a CTC loss.
-    """
-
-    def __init__(
-        self,
-        mask_encoder: nn.Module,
-        encoder: EncoderInterface,
-        decoder: nn.Module,
-        joiner: nn.Module,
-        num_channels: int,
-        encoder_dim: int,
-        decoder_dim: int,
-        joiner_dim: int,
-        vocab_size: int,
-    ):
-        """
-        Args:
-          mask_encoder:
-            It is the masking network. It generates a mask for each channel of the
-            encoder. These masks are applied to the input features, and then passed
-            to the transcription network.
-          encoder:
-            It is the transcription network in the paper. It accepts
-            two inputs: `x` of (N, T, encoder_dim) and `x_lens` of shape (N,).
-            It returns two tensors: `logits` of shape (N, T, encoder_dim) and
-            `logit_lens` of shape (N,).
-          decoder:
-            It is the prediction network in the paper. Its input shape
-            is (N, U) and its output shape is (N, U, decoder_dim).
-            It should contain one attribute: `blank_id`.
-          joiner:
-            It has two inputs with shapes: (N, T, encoder_dim) and (N, U, decoder_dim).
-            Its output shape is (N, T, U, vocab_size). Note that its output contains
-            unnormalized probs, i.e., not processed by log-softmax.
-          num_channels:
-            It is the number of channels that the input features will be split into.
-            In general, it should be equal to the maximum number of simultaneously
-            active speakers. For most real scenarios, using 2 channels is sufficient.
-          encoder_dim:
-            Input size of `simple_am_proj` and of the linear layer in `ctc_output`.
-          decoder_dim:
-            Input size of `simple_lm_proj`.
-          joiner_dim:
-            Accepted for interface compatibility; it is not used in this
-            constructor.
-          vocab_size:
-            Output size of `simple_am_proj`, `simple_lm_proj` and `ctc_output`.
-        """
-        super().__init__()
-        assert isinstance(encoder, EncoderInterface), type(encoder)
-        assert hasattr(decoder, "blank_id")
-
-        self.mask_encoder = mask_encoder
-        self.encoder = encoder
-        self.decoder = decoder
-        self.joiner = joiner
-        self.num_channels = num_channels
-
-        # Projections used only for the "simple" (un-pruned) transducer loss.
-        self.simple_am_proj = nn.Linear(
-            encoder_dim,
-            vocab_size,
-        )
-        self.simple_lm_proj = nn.Linear(decoder_dim, vocab_size)
-
-        # Auxiliary CTC head on top of the encoder output; emits log-probs.
-        self.ctc_output = nn.Sequential(
-            nn.Dropout(p=0.1),
-            nn.Linear(encoder_dim, vocab_size),
-            nn.LogSoftmax(dim=-1),
-        )
-
-    def forward_helper(
-        self,
-        x: torch.Tensor,
-        x_lens: torch.Tensor,
-        y: k2.RaggedTensor,
-        prune_range: int = 5,
-        am_scale: float = 0.0,
-        lm_scale: float = 0.0,
-        reduction: str = "sum",
-        beam_size: int = 10,
-        use_double_scores: bool = False,
-        subsampling_factor: int = 1,
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        """
-        Compute transducer loss for one branch of the SURT model.
-
-        The arguments have the same meaning as in :func:`forward`, except
-        that `x`/`x_lens` are the already-masked features passed to the
-        encoder.
-
-        Returns:
-          Return a tuple (simple_loss, pruned_loss, ctc_loss). The two
-          transducer losses follow `reduction`; the CTC loss is always
-          computed with reduction="none".
-        """
-        encoder_out, x_lens = self.encoder(x, x_lens)
-        assert torch.all(x_lens > 0)
-
-        # compute ctc log-probs
-        ctc_output = self.ctc_output(encoder_out)
-
-        # For the decoder, i.e., the prediction network
-        row_splits = y.shape.row_splits(1)
-        y_lens = row_splits[1:] - row_splits[:-1]
-
-        blank_id = self.decoder.blank_id
-        sos_y = add_sos(y, sos_id=blank_id)
-
-        # sos_y_padded: [B, S + 1], start with SOS.
-        sos_y_padded = sos_y.pad(mode="constant", padding_value=blank_id)
-
-        # decoder_out: [B, S + 1, decoder_dim]
-        decoder_out = self.decoder(sos_y_padded)
-
-        # Note: y does not start with SOS
-        # y_padded : [B, S]
-        y_padded = y.pad(mode="constant", padding_value=0)
-
-        y_padded = y_padded.to(torch.int64)
-        # Columns 2 and 3 hold the label and frame lengths; columns 0 and 1
-        # stay at zero.
-        boundary = torch.zeros((x.size(0), 4), dtype=torch.int64, device=x.device)
-        boundary[:, 2] = y_lens
-        boundary[:, 3] = x_lens
-
-        lm = self.simple_lm_proj(decoder_out)
-        am = self.simple_am_proj(encoder_out)
-
-        # The loss is computed in float32 even when autocast is active.
-        with torch.cuda.amp.autocast(enabled=False):
-            simple_loss, (px_grad, py_grad) = k2.rnnt_loss_smoothed(
-                lm=lm.float(),
-                am=am.float(),
-                symbols=y_padded,
-                termination_symbol=blank_id,
-                lm_only_scale=lm_scale,
-                am_only_scale=am_scale,
-                boundary=boundary,
-                reduction=reduction,
-                return_grad=True,
-            )
-
-        # ranges : [B, T, prune_range]
-        ranges = k2.get_rnnt_prune_ranges(
-            px_grad=px_grad,
-            py_grad=py_grad,
-            boundary=boundary,
-            s_range=prune_range,
-        )
-
-        # am_pruned : [B, T, prune_range, encoder_dim]
-        # lm_pruned : [B, T, prune_range, decoder_dim]
-        am_pruned, lm_pruned = k2.do_rnnt_pruning(
-            am=self.joiner.encoder_proj(encoder_out),
-            lm=self.joiner.decoder_proj(decoder_out),
-            ranges=ranges,
-        )
-
-        # logits : [B, T, prune_range, vocab_size]
-
-        # project_input=False since we applied the joiner's input projections
-        # prior to do_rnnt_pruning (this is an optimization for speed).
-        logits = self.joiner(am_pruned, lm_pruned, project_input=False)
-
-        with torch.cuda.amp.autocast(enabled=False):
-            pruned_loss = k2.rnnt_loss_pruned(
-                logits=logits.float(),
-                symbols=y_padded,
-                ranges=ranges,
-                termination_symbol=blank_id,
-                boundary=boundary,
-                reduction=reduction,
-            )
-
-        # Compute ctc loss
-        # Each row is (sequence index, start frame 0, number of frames).
-        supervision_segments = torch.stack(
-            (
-                torch.arange(len(x_lens), device="cpu"),
-                torch.zeros_like(x_lens, device="cpu"),
-                torch.clone(x_lens).detach().cpu(),
-            ),
-            dim=1,
-        ).to(torch.int32)
-        # We need to sort supervision_segments in decreasing order of num_frames
-        # NOTE(review): only supervision_segments is reordered here; `y` (used
-        # to build decoding_graph) and the returned ctc_loss are not permuted
-        # with `indices` -- confirm graphs and segments are paired as intended.
-        indices = torch.argsort(supervision_segments[:, 2], descending=True)
-        supervision_segments = supervision_segments[indices]
-
-        # Works with a BPE model
-        decoding_graph = k2.ctc_graph(y, modified=False, device=x.device)
-        dense_fsa_vec = k2.DenseFsaVec(
-            ctc_output,
-            supervision_segments,
-            allow_truncate=subsampling_factor - 1,
-        )
-        ctc_loss = k2.ctc_loss(
-            decoding_graph=decoding_graph,
-            dense_fsa_vec=dense_fsa_vec,
-            output_beam=beam_size,
-            reduction="none",
-            use_double_scores=use_double_scores,
-        )
-
-        return (simple_loss, pruned_loss, ctc_loss)
-
-    def forward(
-        self,
-        x: torch.Tensor,
-        x_lens: torch.Tensor,
-        y: k2.RaggedTensor,
-        prune_range: int = 5,
-        am_scale: float = 0.0,
-        lm_scale: float = 0.0,
-        reduction: str = "sum",
-        beam_size: int = 10,
-        use_double_scores: bool = False,
-        subsampling_factor: int = 1,
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        """
-        Args:
-          x:
-            A 3-D tensor of shape (N, T, C).
-          x_lens:
-            A 1-D tensor of shape (N,). It contains the number of frames in `x`
-            before padding.
-          y:
-            A ragged tensor of shape (N*num_channels, S). It contains the labels
-            of the N utterances. The labels are in the range [0, vocab_size). All
-            the channels are concatenated together one after another.
-          prune_range:
-            The prune range for rnnt loss, it means how many symbols(context)
-            we are considering for each frame to compute the loss.
-          am_scale:
-            The scale to smooth the loss with am (output of encoder network)
-            part
-          lm_scale:
-            The scale to smooth the loss with lm (output of predictor network)
-            part
-          reduction:
-            "sum" to sum the losses over all utterances in the batch.
-            "none" to return the loss in a 1-D tensor for each utterance
-            in the batch. It applies to the two transducer losses only.
-          beam_size:
-            Passed as `output_beam` to k2.ctc_loss.
-          use_double_scores:
-            Passed as `use_double_scores` to k2.ctc_loss.
-          subsampling_factor:
-            The subsampling factor of the model. `subsampling_factor - 1` is
-            used as `allow_truncate` when building the k2.DenseFsaVec for the
-            CTC loss.
-        Returns:
-          Return a tuple (simple_loss, pruned_loss, ctc_loss). Each loss is
-          split into `num_channels` chunks along the batch axis and the chunks
-          are stacked along a new leading axis.
-
-        Note:
-           Regarding am_scale & lm_scale, it will make the loss-function one of
-           the form:
-              lm_scale * lm_probs + am_scale * am_probs +
-              (1-lm_scale-am_scale) * combined_probs
-        """
-        assert x.ndim == 3, x.shape
-        assert x_lens.ndim == 1, x_lens.shape
-        assert y.num_axes == 2, y.num_axes
-
-        assert x.size(0) == x_lens.size(0), (x.size(), x_lens.size())
-
-        # Apply the mask encoder
-        B, T, F = x.shape
-        processed = self.mask_encoder(x)  # B,T,F*num_channels
-        # One (B, T, F) mask per channel, applied element-wise to the input.
-        masks = processed.view(B, T, F, self.num_channels).unbind(dim=-1)
-        x_masked = [x * m for m in masks]
-
-        # Recognition
-        # Stack the inputs along the batch axis
-        h = torch.cat(x_masked, dim=0)
-        h_lens = torch.cat([x_lens for _ in range(self.num_channels)], dim=0)
-
-        simple_loss, pruned_loss, ctc_loss = self.forward_helper(
-            h,
-            h_lens,
-            y,
-            prune_range,
-            am_scale,
-            lm_scale,
-            reduction=reduction,
-            beam_size=beam_size,
-            use_double_scores=use_double_scores,
-            subsampling_factor=subsampling_factor,
-        )
-
-        # Chunk the outputs into num_channels parts along the batch axis and
-        # then stack them along a new axis.
-        # NOTE(review): chunking along dim 0 presumes per-utterance losses,
-        # i.e. reduction="none" -- confirm callers do not rely on the default
-        # reduction="sum" here.
-        simple_loss = torch.stack(
-            torch.chunk(simple_loss, self.num_channels, dim=0), dim=0
-        )
-        pruned_loss = torch.stack(
-            torch.chunk(pruned_loss, self.num_channels, dim=0), dim=0
-        )
-        ctc_loss = torch.stack(torch.chunk(ctc_loss, self.num_channels, dim=0), dim=0)
-
-        return (simple_loss, pruned_loss, ctc_loss)
--- a/egs/libricss/SURT/dprnn_pruned_transducer_stateless7/optim.py
+++ b/egs/libricss/SURT/dprnn_pruned_transducer_stateless7/optim.py
--- a/egs/libricss/SURT/dprnn_pruned_transducer_stateless7/pretrained.py
+++ b/egs/libricss/SURT/dprnn_pruned_transducer_stateless7/pretrained.py
@ -1,355 +0,0 @@
-#!/usr/bin/env python3
-# Copyright      2021  Xiaomi Corp.        (authors: Fangjun Kuang)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-This script loads a checkpoint and uses it to decode waves.
-You can generate the checkpoint with the following command:
-
-./pruned_transducer_stateless7_streaming/export.py \
-  --exp-dir ./pruned_transducer_stateless7_streaming/exp \
-  --bpe-model data/lang_bpe_500/bpe.model \
-  --epoch 20 \
-  --avg 10
-
-Usage of this script:
-
-(1) greedy search
-./pruned_transducer_stateless7_streaming/pretrained.py \
-    --checkpoint ./pruned_transducer_stateless7_streaming/exp/pretrained.pt \
-    --bpe-model ./data/lang_bpe_500/bpe.model \
-    --method greedy_search \
-    /path/to/foo.wav \
-    /path/to/bar.wav
-
-(2) beam search
-./pruned_transducer_stateless7_streaming/pretrained.py \
-    --checkpoint ./pruned_transducer_stateless7_streaming/exp/pretrained.pt \
-    --bpe-model ./data/lang_bpe_500/bpe.model \
-    --method beam_search \
-    --beam-size 4 \
-    /path/to/foo.wav \
-    /path/to/bar.wav
-
-(3) modified beam search
-./pruned_transducer_stateless7_streaming/pretrained.py \
-    --checkpoint ./pruned_transducer_stateless7_streaming/exp/pretrained.pt \
-    --bpe-model ./data/lang_bpe_500/bpe.model \
-    --method modified_beam_search \
-    --beam-size 4 \
-    /path/to/foo.wav \
-    /path/to/bar.wav
-
-(4) fast beam search
-./pruned_transducer_stateless7_streaming/pretrained.py \
-    --checkpoint ./pruned_transducer_stateless7_streaming/exp/pretrained.pt \
-    --bpe-model ./data/lang_bpe_500/bpe.model \
-    --method fast_beam_search \
-    --beam-size 4 \
-    /path/to/foo.wav \
-    /path/to/bar.wav
-
-You can also use `./pruned_transducer_stateless7_streaming/exp/epoch-xx.pt`.
-
-Note: ./pruned_transducer_stateless7_streaming/exp/pretrained.pt is generated by
-./pruned_transducer_stateless7_streaming/export.py
-"""
-
-
-import argparse
-import logging
-import math
-from typing import List
-
-import k2
-import kaldifeat
-import sentencepiece as spm
-import torch
-import torchaudio
-from beam_search import (
-    beam_search,
-    fast_beam_search_one_best,
-    greedy_search,
-    greedy_search_batch,
-    modified_beam_search,
-)
-from torch.nn.utils.rnn import pad_sequence
-from train import add_model_arguments, get_params, get_transducer_model
-
-from icefall.utils import str2bool
-
-
-def get_parser():
-    """Build the command-line parser for this script.
-
-    Besides the options declared here, the model options registered by
-    `add_model_arguments` are added to the parser.
-
-    Returns:
-      Return the configured argparse.ArgumentParser.
-    """
-    parser = argparse.ArgumentParser(
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter
-    )
-
-    parser.add_argument(
-        "--checkpoint",
-        type=str,
-        required=True,
-        help="Path to the checkpoint. "
-        "The checkpoint is assumed to be saved by "
-        "icefall.checkpoint.save_checkpoint().",
-    )
-
-    # main() always loads the BPE model, so fail at argument parsing time
-    # instead of later with an obscure error on a missing path.
-    parser.add_argument(
-        "--bpe-model",
-        type=str,
-        required=True,
-        help="""Path to bpe.model.""",
-    )
-
-    parser.add_argument(
-        "--method",
-        type=str,
-        default="greedy_search",
-        help="""Possible values are:
-          - greedy_search
-          - beam_search
-          - modified_beam_search
-          - fast_beam_search
-        """,
-    )
-
-    parser.add_argument(
-        "sound_files",
-        type=str,
-        nargs="+",
-        help="The input sound file(s) to transcribe. "
-        "Supported formats are those supported by torchaudio.load(). "
-        "For example, wav and flac are supported. "
-        "The sample rate has to match --sample-rate.",
-    )
-
-    parser.add_argument(
-        "--sample-rate",
-        type=int,
-        default=16000,
-        help="The sample rate of the input sound file",
-    )
-
-    parser.add_argument(
-        "--beam-size",
-        type=int,
-        default=4,
-        help="""An integer indicating how many candidates we will keep for each
-        frame. Used only when --method is beam_search or
-        modified_beam_search.""",
-    )
-
-    parser.add_argument(
-        "--beam",
-        type=float,
-        default=4,
-        help="""A floating point value to calculate the cutoff score during beam
-        search (i.e., `cutoff = max-score - beam`), which is the same as the
-        `beam` in Kaldi.
-        Used only when --method is fast_beam_search""",
-    )
-
-    parser.add_argument(
-        "--max-contexts",
-        type=int,
-        default=4,
-        help="""Used only when --method is fast_beam_search""",
-    )
-
-    parser.add_argument(
-        "--max-states",
-        type=int,
-        default=8,
-        help="""Used only when --method is fast_beam_search""",
-    )
-
-    parser.add_argument(
-        "--context-size",
-        type=int,
-        default=2,
-        help="The context size in the decoder. 1 means bigram; 2 means tri-gram",
-    )
-    parser.add_argument(
-        "--max-sym-per-frame",
-        type=int,
-        default=1,
-        help="""Maximum number of symbols per frame. Used only when
-        --method is greedy_search.
-        """,
-    )
-
-    # Model-architecture options are shared with the training script.
-    add_model_arguments(parser)
-
-    return parser
-
-
-def read_sound_files(
-    filenames: List[str], expected_sample_rate: float
-) -> List[torch.Tensor]:
-    """Load sound files and keep the first channel of each one.
-
-    Args:
-      filenames:
-        A list of sound filenames.
-      expected_sample_rate:
-        The sample rate every file must have; a mismatch triggers an
-        assertion failure.
-    Returns:
-      Return a list with one 1-D tensor per input file, in the same order
-      as `filenames`.
-    """
-
-    def _first_channel(path: str) -> torch.Tensor:
-        # Load one file, check its sample rate and drop all but channel 0.
-        samples, sample_rate = torchaudio.load(path)
-        assert (
-            sample_rate == expected_sample_rate
-        ), f"expected sample rate: {expected_sample_rate}. Given: {sample_rate}"
-        return samples[0]
-
-    return [_first_channel(name) for name in filenames]
-
-
-@torch.no_grad()
-def main():
-    """Transcribe the sound files given on the command line and log the results.
-
-    Steps: parse arguments, load the BPE model and the checkpoint, compute
-    fbank features, run the encoder, decode with the method selected by
-    --method, and log one transcript per input file.
-    """
-    parser = get_parser()
-    args = parser.parse_args()
-
-    params = get_params()
-
-    # Command-line values override the defaults from get_params().
-    params.update(vars(args))
-
-    sp = spm.SentencePieceProcessor()
-    sp.load(params.bpe_model)
-
-    # <blk> is defined in local/train_bpe_model.py
-    params.blank_id = sp.piece_to_id("<blk>")
-    params.unk_id = sp.piece_to_id("<unk>")
-    params.vocab_size = sp.get_piece_size()
-
-    logging.info(f"{params}")
-
-    # Use the first GPU when one is available, otherwise the CPU.
-    device = torch.device("cpu")
-    if torch.cuda.is_available():
-        device = torch.device("cuda", 0)
-
-    logging.info(f"device: {device}")
-
-    logging.info("Creating model")
-    model = get_transducer_model(params)
-
-    num_param = sum([p.numel() for p in model.parameters()])
-    logging.info(f"Number of model parameters: {num_param}")
-
-    checkpoint = torch.load(args.checkpoint, map_location="cpu")
-    # NOTE(review): strict=False means missing or unexpected keys in the
-    # checkpoint are not reported as errors -- confirm this is intended.
-    model.load_state_dict(checkpoint["model"], strict=False)
-    model.to(device)
-    model.eval()
-    model.device = device
-
-    logging.info("Constructing Fbank computer")
-    opts = kaldifeat.FbankOptions()
-    opts.device = device
-    opts.frame_opts.dither = 0
-    opts.frame_opts.snip_edges = False
-    opts.frame_opts.samp_freq = params.sample_rate
-    opts.mel_opts.num_bins = params.feature_dim
-
-    fbank = kaldifeat.Fbank(opts)
-
-    logging.info(f"Reading sound files: {params.sound_files}")
-    waves = read_sound_files(
-        filenames=params.sound_files, expected_sample_rate=params.sample_rate
-    )
-    waves = [w.to(device) for w in waves]
-
-    logging.info("Decoding started")
-    features = fbank(waves)
-    # Remember the unpadded number of frames of every utterance.
-    feature_lengths = [f.size(0) for f in features]
-
-    # Pad to a common length with log(1e-10).
-    features = pad_sequence(features, batch_first=True, padding_value=math.log(1e-10))
-
-    feature_lengths = torch.tensor(feature_lengths, device=device)
-
-    # NOTE(review): only model.encoder is invoked here; whether the model
-    # returned by get_transducer_model needs further front-end processing
-    # before the encoder cannot be told from this function -- confirm.
-    encoder_out, encoder_out_lens = model.encoder(x=features, x_lens=feature_lengths)
-
-    num_waves = encoder_out.size(0)
-    hyps = []
-    msg = f"Using {params.method}"
-    # The beam size is only mentioned in the log for plain beam_search.
-    if params.method == "beam_search":
-        msg += f" with beam size {params.beam_size}"
-    logging.info(msg)
-
-    # The first three branches decode the whole batch at once; the final
-    # branch decodes one utterance at a time.
-    if params.method == "fast_beam_search":
-        decoding_graph = k2.trivial_graph(params.vocab_size - 1, device=device)
-        hyp_tokens = fast_beam_search_one_best(
-            model=model,
-            decoding_graph=decoding_graph,
-            encoder_out=encoder_out,
-            encoder_out_lens=encoder_out_lens,
-            beam=params.beam,
-            max_contexts=params.max_contexts,
-            max_states=params.max_states,
-        )
-        for hyp in sp.decode(hyp_tokens):
-            hyps.append(hyp.split())
-    elif params.method == "modified_beam_search":
-        hyp_tokens = modified_beam_search(
-            model=model,
-            encoder_out=encoder_out,
-            encoder_out_lens=encoder_out_lens,
-            beam=params.beam_size,
-        )
-
-        for hyp in sp.decode(hyp_tokens):
-            hyps.append(hyp.split())
-    elif params.method == "greedy_search" and params.max_sym_per_frame == 1:
-        hyp_tokens = greedy_search_batch(
-            model=model,
-            encoder_out=encoder_out,
-            encoder_out_lens=encoder_out_lens,
-        )
-        for hyp in sp.decode(hyp_tokens):
-            hyps.append(hyp.split())
-    else:
-        for i in range(num_waves):
-            # Keep the batch axis (size 1) and drop the padded frames.
-            # fmt: off
-            encoder_out_i = encoder_out[i:i+1, :encoder_out_lens[i]]
-            # fmt: on
-            if params.method == "greedy_search":
-                hyp = greedy_search(
-                    model=model,
-                    encoder_out=encoder_out_i,
-                    max_sym_per_frame=params.max_sym_per_frame,
-                )
-            elif params.method == "beam_search":
-                hyp = beam_search(
-                    model=model,
-                    encoder_out=encoder_out_i,
-                    beam=params.beam_size,
-                )
-            else:
-                raise ValueError(f"Unsupported method: {params.method}")
-
-            hyps.append(sp.decode(hyp).split())
-
-    # Build a single log message: one "filename:" line followed by its words.
-    s = "\n"
-    for filename, hyp in zip(params.sound_files, hyps):
-        words = " ".join(hyp)
-        s += f"{filename}:\n{words}\n\n"
-    logging.info(s)
-
-    logging.info("Decoding Done")
-
-
-if __name__ == "__main__":
-    # Timestamped log lines that also name the emitting file and line number.
-    logging.basicConfig(
-        format="%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s",
-        level=logging.INFO,
-    )
-    main()
--- a/egs/libricss/SURT/dprnn_pruned_transducer_stateless7/scaling.py
+++ b/egs/libricss/SURT/dprnn_pruned_transducer_stateless7/scaling.py
--- a/egs/libricss/SURT/dprnn_pruned_transducer_stateless7/scaling_converter.py
+++ b/egs/libricss/SURT/dprnn_pruned_transducer_stateless7/scaling_converter.py
@ -1,114 +0,0 @@
-# Copyright    2022  Xiaomi Corp.        (authors: Fangjun Kuang)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""
-This file replaces various modules in a model.
-Specifically, ActivationBalancer is replaced with an identity operator;
-Whiten is also replaced with an identity operator;
-BasicNorm is replaced by a module with `exp` removed.
-"""
-
-import copy
-from typing import List
-
-import torch
-import torch.nn as nn
-from scaling import ActivationBalancer, BasicNorm, Whiten
-
-
-class NonScaledNorm(nn.Module):
-    """Normalize the input by its root-mean-square along `channel_dim`.
-
-    The scale is ``(mean(x ** 2) + eps_exp) ** -0.5`` where the mean is taken
-    over `channel_dim`. See BasicNorm for the full documentation.
-    """
-
-    def __init__(
-        self,
-        num_channels: int,
-        eps_exp: float,
-        channel_dim: int = -1,  # CAUTION: see documentation.
-    ):
-        super().__init__()
-        self.num_channels = num_channels
-        self.channel_dim = channel_dim
-        self.eps_exp = eps_exp
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        # The shape check is skipped while tracing.
-        if not torch.jit.is_tracing():
-            assert x.shape[self.channel_dim] == self.num_channels
-        mean_square = torch.mean(x * x, dim=self.channel_dim, keepdim=True)
-        inv_rms = (mean_square + self.eps_exp).pow(-0.5)
-        return x * inv_rms
-
-
-def convert_basic_norm(basic_norm: BasicNorm) -> NonScaledNorm:
-    """Build a NonScaledNorm equivalent to the given BasicNorm.
-
-    Args:
-      basic_norm:
-        The module to convert. Its learned `eps` is exponentiated once here
-        and stored as a plain float in the returned module.
-    Returns:
-      Return a NonScaledNorm with the same `num_channels` and `channel_dim`.
-    """
-    # Report the type of the offending object, not the type of the class.
-    assert isinstance(basic_norm, BasicNorm), type(basic_norm)
-    norm = NonScaledNorm(
-        num_channels=basic_norm.num_channels,
-        eps_exp=basic_norm.eps.data.exp().item(),
-        channel_dim=basic_norm.channel_dim,
-    )
-    return norm
-
-
-# Copied from https://pytorch.org/docs/1.9.0/_modules/torch/nn/modules/module.html#Module.get_submodule  # noqa
-# get_submodule was added to nn.Module at v1.9.0
-def get_submodule(model, target):
-    """Return the submodule of `model` addressed by the dotted path `target`.
-
-    An empty `target` returns `model` itself. An AttributeError is raised
-    if a path component does not exist or is not an nn.Module.
-    """
-    if target == "":
-        return model
-
-    current: torch.nn.Module = model
-    path: List[str] = target.split(".")
-    for attr in path:
-        if not hasattr(current, attr):
-            raise AttributeError(
-                current._get_name() + " has no " "attribute `" + attr + "`"
-            )
-        current = getattr(current, attr)
-        if not isinstance(current, torch.nn.Module):
-            raise AttributeError("`" + attr + "` is not " "an nn.Module")
-    return current
-
-
-def convert_scaled_to_non_scaled(
-    model: nn.Module,
-    inplace: bool = False,
-):
-    """Replace the scaled layers of a model.
-
-    Every BasicNorm is replaced by the result of `convert_basic_norm`; every
-    ActivationBalancer and Whiten is replaced by nn.Identity.
-
-    Args:
-      model:
-        The model to be converted.
-      inplace:
-        If True, the input model is modified inplace.
-        If False, a deep copy is made and the copy is modified instead.
-    Return:
-      Return the converted model.
-    """
-    converted = model if inplace else copy.deepcopy(model)
-
-    # First pass: decide on the replacements without touching the module
-    # tree while it is being iterated.
-    replacements = {}
-    for module_name, module in converted.named_modules():
-        if isinstance(module, BasicNorm):
-            replacements[module_name] = convert_basic_norm(module)
-        elif isinstance(module, (ActivationBalancer, Whiten)):
-            replacements[module_name] = nn.Identity()
-
-    # Second pass: install each replacement on its owning module.
-    for module_name, replacement in replacements.items():
-        parent_name, _, leaf_name = module_name.rpartition(".")
-        if parent_name:
-            owner = get_submodule(converted, parent_name)
-        else:
-            owner = converted
-        setattr(owner, leaf_name, replacement)
-
-    return converted
--- a/egs/libricss/SURT/dprnn_pruned_transducer_stateless7/streaming_beam_search.py
+++ b/egs/libricss/SURT/dprnn_pruned_transducer_stateless7/streaming_beam_search.py
@ -1,282 +0,0 @@
-# Copyright    2022  Xiaomi Corp.        (authors: Wei Kang)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import warnings
-from typing import List
-
-import k2
-import torch
-import torch.nn as nn
-from beam_search import Hypothesis, HypothesisList, get_hyps_shape
-from decode_stream import DecodeStream
-
-from icefall.decode import one_best_decoding
-from icefall.utils import get_texts
-
-
-def greedy_search(
-    model: nn.Module,
-    encoder_out: torch.Tensor,
-    streams: List[DecodeStream],
-) -> None:
-    """Greedy search in batch mode. It hardcodes --max-sym-per-frame=1.
-
-    For every frame the most likely token of each stream is selected; non-blank
-    tokens are appended in place to ``stream.hyp``. Nothing is returned.
-
-    Args:
-      model:
-        The transducer model.
-      encoder_out:
-        Output from the encoder. Its shape is (N, T, C), where N >= 1.
-      streams:
-        A list of Stream objects.
-    """
-    assert len(streams) == encoder_out.size(0)
-    assert encoder_out.ndim == 3
-
-    blank_id = model.decoder.blank_id
-    context_size = model.decoder.context_size
-    # Read the device from the parameters, as modified_beam_search does, so
-    # that this function does not rely on callers attaching a `device`
-    # attribute to the model (nn.Module does not define one).
-    device = next(model.parameters()).device
-    T = encoder_out.size(1)
-
-    def compute_decoder_out() -> torch.Tensor:
-        # Feed the last `context_size` tokens of every hypothesis to the
-        # decoder and project the result for the joiner, which is invoked
-        # below with project_input=False.
-        decoder_input = torch.tensor(
-            [stream.hyp[-context_size:] for stream in streams],
-            device=device,
-            dtype=torch.int64,
-        )
-        # The decoder output is of shape (N, 1, decoder_out_dim)
-        decoder_out = model.decoder(decoder_input, need_pad=False)
-        return model.joiner.decoder_proj(decoder_out)
-
-    decoder_out = compute_decoder_out()
-
-    for t in range(T):
-        # current_encoder_out's shape: (batch_size, 1, encoder_out_dim)
-        current_encoder_out = encoder_out[:, t : t + 1, :]  # noqa
-
-        logits = model.joiner(
-            current_encoder_out.unsqueeze(2),
-            decoder_out.unsqueeze(1),
-            project_input=False,
-        )
-        # logits' shape: (batch_size, vocab_size)
-        logits = logits.squeeze(1).squeeze(1)
-
-        assert logits.ndim == 2, logits.shape
-        emitted = False
-        for stream, token in zip(streams, logits.argmax(dim=1).tolist()):
-            if token != blank_id:
-                stream.hyp.append(token)
-                emitted = True
-        if emitted:
-            # Hypotheses only change when a non-blank token was emitted, so
-            # the decoder output is recomputed only in that case.
-            decoder_out = compute_decoder_out()
-
-
-def modified_beam_search(
-    model: nn.Module,
-    encoder_out: torch.Tensor,
-    streams: List[DecodeStream],
-    num_active_paths: int = 4,
-) -> None:
-    """Beam search in batch mode with --max-sym-per-frame=1 being hardcoded.
-
-    The search starts from ``stream.hyps`` of every stream and writes the
-    surviving hypotheses back to ``stream.hyps``; nothing is returned.
-
-    Args:
-      model:
-        The RNN-T model.
-      encoder_out:
-        A 3-D tensor of shape (N, T, encoder_out_dim) containing the output of
-        the encoder model.
-      streams:
-        A list of stream objects.
-      num_active_paths:
-        Number of active paths during the beam search.
-    """
-    assert encoder_out.ndim == 3, encoder_out.shape
-    assert len(streams) == encoder_out.size(0)
-
-    blank_id = model.decoder.blank_id
-    context_size = model.decoder.context_size
-    device = next(model.parameters()).device
-    batch_size = len(streams)
-    T = encoder_out.size(1)
-
-    # B[i] holds the current hypotheses of the i-th stream.
-    B = [stream.hyps for stream in streams]
-
-    for t in range(T):
-        current_encoder_out = encoder_out[:, t].unsqueeze(1).unsqueeze(1)
-        # current_encoder_out's shape: (batch_size, 1, 1, encoder_out_dim)
-
-        # NOTE(review): presumably a ragged shape describing how many
-        # hypotheses each stream currently has -- confirm in get_hyps_shape.
-        hyps_shape = get_hyps_shape(B).to(device)
-
-        # A is a snapshot of the hypotheses to expand at this frame; B is
-        # reset and refilled with the expanded hypotheses below.
-        A = [list(b) for b in B]
-        B = [HypothesisList() for _ in range(batch_size)]
-
-        ys_log_probs = torch.stack(
-            [hyp.log_prob.reshape(1) for hyps in A for hyp in hyps], dim=0
-        )  # (num_hyps, 1)
-
-        decoder_input = torch.tensor(
-            [hyp.ys[-context_size:] for hyps in A for hyp in hyps],
-            device=device,
-            dtype=torch.int64,
-        )  # (num_hyps, context_size)
-
-        decoder_out = model.decoder(decoder_input, need_pad=False).unsqueeze(1)
-        decoder_out = model.joiner.decoder_proj(decoder_out)
-        # decoder_out is of shape (num_hyps, 1, 1, decoder_output_dim)
-
-        # Note: For torch 1.7.1 and below, it requires a torch.int64 tensor
-        # as index, so we use `to(torch.int64)` below.
-        # Rows of the per-stream encoder output are selected along dim 0 so
-        # that there is one row per hypothesis.
-        current_encoder_out = torch.index_select(
-            current_encoder_out,
-            dim=0,
-            index=hyps_shape.row_ids(1).to(torch.int64),
-        )  # (num_hyps, 1, 1, encoder_out_dim)
-
-        logits = model.joiner(current_encoder_out, decoder_out, project_input=False)
-        # logits is of shape (num_hyps, 1, 1, vocab_size)
-
-        logits = logits.squeeze(1).squeeze(1)
-
-        log_probs = logits.log_softmax(dim=-1)  # (num_hyps, vocab_size)
-
-        # Add the score of each hypothesis to the scores of its extensions.
-        log_probs.add_(ys_log_probs)
-
-        vocab_size = log_probs.size(-1)
-
-        # Flatten, so that entry `hyp_idx * vocab_size + token` is the score
-        # of extending hypothesis `hyp_idx` with `token`.
-        log_probs = log_probs.reshape(-1)
-
-        # Scale the row splits by vocab_size so that each row of the ragged
-        # tensor covers all (hypothesis, token) scores of one stream.
-        row_splits = hyps_shape.row_splits(1) * vocab_size
-        log_probs_shape = k2.ragged.create_ragged_shape2(
-            row_splits=row_splits, cached_tot_size=log_probs.numel()
-        )
-        ragged_log_probs = k2.RaggedTensor(shape=log_probs_shape, value=log_probs)
-
-        for i in range(batch_size):
-            topk_log_probs, topk_indexes = ragged_log_probs[i].topk(num_active_paths)
-
-            # Recover (hypothesis index, token id) from the flattened index.
-            with warnings.catch_warnings():
-                warnings.simplefilter("ignore")
-                topk_hyp_indexes = (topk_indexes // vocab_size).tolist()
-                topk_token_indexes = (topk_indexes % vocab_size).tolist()
-
-            for k in range(len(topk_hyp_indexes)):
-                hyp_idx = topk_hyp_indexes[k]
-                hyp = A[i][hyp_idx]
-
-                # Copy the token list so the parent hypothesis is not modified.
-                new_ys = hyp.ys[:]
-                new_token = topk_token_indexes[k]
-                # A blank leaves the token sequence unchanged.
-                if new_token != blank_id:
-                    new_ys.append(new_token)
-
-                new_log_prob = topk_log_probs[k]
-                new_hyp = Hypothesis(ys=new_ys, log_prob=new_log_prob)
-                B[i].add(new_hyp)
-
-    # Store the surviving hypotheses back into the streams.
-    for i in range(batch_size):
-        streams[i].hyps = B[i]
-
-
-def fast_beam_search_one_best(
-    model: nn.Module,
-    encoder_out: torch.Tensor,
-    processed_lens: torch.Tensor,
-    streams: List[DecodeStream],
-    beam: float,
-    max_states: int,
-    max_contexts: int,
-) -> None:
-    """FSA-based beam search emitting at most one symbol per frame.
-
-    A lattice is generated frame by frame, the best path through it is
-    extracted and its token sequence is stored in ``hyp`` of every stream.
-    Nothing is returned.
-
-    Args:
-      model:
-        An instance of `Transducer`.
-      encoder_out:
-        A tensor of shape (N, T, C) from the encoder.
-      processed_lens:
-        A tensor of shape (N,) containing the number of processed frames
-        in `encoder_out` before padding.
-      streams:
-        A list of stream objects.
-      beam:
-        Beam value, similar to the beam used in Kaldi.
-      max_states:
-        Max states per stream per frame.
-      max_contexts:
-        Max contexts per stream per frame.
-    """
-    assert encoder_out.ndim == 3
-    batch_size, num_frames, _ = encoder_out.shape
-    assert batch_size == len(streams)
-
-    config = k2.RnntDecodingConfig(
-        vocab_size=model.decoder.vocab_size,
-        decoder_history_len=model.decoder.context_size,
-        beam=beam,
-        max_contexts=max_contexts,
-        max_states=max_states,
-    )
-    decoding_streams = k2.RnntDecodingStreams(
-        [streams[i].rnnt_decoding_stream for i in range(batch_size)],
-        config,
-    )
-
-    for t in range(num_frames):
-        # shape is a RaggedShape of shape (B, context)
-        # contexts is a Tensor of shape (shape.NumElements(), context_size)
-        shape, contexts = decoding_streams.get_contexts()
-        # `nn.Embedding()` in torch below v1.7.1 supports only torch.int64
-        decoder_input = contexts.to(torch.int64)
-        # The projected decoder output is of shape
-        # (shape.NumElements(), 1, decoder_out_dim)
-        decoder_out = model.joiner.decoder_proj(
-            model.decoder(decoder_input, need_pad=False)
-        )
-        # One encoder frame per context; frame_encoder_out is of shape
-        # (shape.NumElements(), 1, joiner_dim)
-        context_to_stream = shape.row_ids(1).to(torch.int64)
-        # fmt: off
-        frame_encoder_out = torch.index_select(
-            encoder_out[:, t:t + 1, :], 0, context_to_stream
-        )
-        # fmt: on
-        logits = model.joiner(
-            frame_encoder_out.unsqueeze(2),
-            decoder_out.unsqueeze(1),
-            project_input=False,
-        )
-        log_probs = logits.squeeze(1).squeeze(1).log_softmax(dim=-1)
-        decoding_streams.advance(log_probs)
-
-    decoding_streams.terminate_and_flush_to_streams()
-
-    lattice = decoding_streams.format_output(processed_lens.tolist())
-    hyp_tokens = get_texts(one_best_decoding(lattice))
-
-    for i in range(batch_size):
-        streams[i].hyp = hyp_tokens[i]
--- a/egs/libricss/SURT/dprnn_pruned_transducer_stateless7/streaming_decode.py
+++ b/egs/libricss/SURT/dprnn_pruned_transducer_stateless7/streaming_decode.py
@ -1,615 +0,0 @@
-#!/usr/bin/env python3
-# Copyright 2022 Xiaomi Corporation (Authors: Wei Kang, Fangjun Kuang)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""
-Usage:
-./pruned_transducer_stateless7_streaming/streaming_decode.py \
-  --epoch 28 \
-  --avg 15 \
-  --decode-chunk-len 32 \
-  --exp-dir ./pruned_transducer_stateless7_streaming/exp \
-  --decoding-method greedy_search \
-  --num-decode-streams 2000
-"""
-
-import argparse
-import logging
-import math
-from pathlib import Path
-from typing import Dict, List, Optional, Tuple
-
-import k2
-import numpy as np
-import sentencepiece as spm
-import torch
-import torch.nn as nn
-from asr_datamodule import LibriSpeechAsrDataModule
-from decode_stream import DecodeStream
-from kaldifeat import Fbank, FbankOptions
-from lhotse import CutSet
-from streaming_beam_search import (
-    fast_beam_search_one_best,
-    greedy_search,
-    modified_beam_search,
-)
-from torch.nn.utils.rnn import pad_sequence
-from train import add_model_arguments, get_params, get_transducer_model
-from zipformer import stack_states, unstack_states
-
-from icefall.checkpoint import (
-    average_checkpoints,
-    average_checkpoints_with_averaged_model,
-    find_checkpoints,
-    load_checkpoint,
-)
-from icefall.utils import (
-    AttributeDict,
-    setup_logger,
-    store_transcripts,
-    str2bool,
-    write_error_stats,
-)
-
-# Log of a tiny value; used in this file as the padding value for feature
-# frames.
-LOG_EPS = math.log(1e-10)
-
-
-def get_parser():
-    """Build the command-line parser of the streaming decoding script.
-
-    Besides the options defined here, the options registered by
-    ``add_model_arguments`` are added.
-
-    Returns:
-      The configured ``argparse.ArgumentParser``.
-    """
-    parser = argparse.ArgumentParser(
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter
-    )
-
-    parser.add_argument(
-        "--epoch",
-        type=int,
-        default=28,
-        help="""It specifies the checkpoint to use for decoding.
-        Note: Epoch counts from 0.
-        You can specify --avg to use more checkpoints for model averaging.""",
-    )
-
-    parser.add_argument(
-        "--iter",
-        type=int,
-        default=0,
-        help="""If positive, --epoch is ignored and it
-        will use the checkpoint exp_dir/checkpoint-iter.pt.
-        You can specify --avg to use more checkpoints for model averaging.
-        """,
-    )
-
-    parser.add_argument(
-        "--avg",
-        type=int,
-        default=15,
-        help="Number of checkpoints to average. Automatically select "
-        "consecutive checkpoints before the checkpoint specified by "
-        "'--epoch' and '--iter'",
-    )
-
-    parser.add_argument(
-        "--use-averaged-model",
-        type=str2bool,
-        default=True,
-        help="Whether to load averaged model. Currently it only supports "
-        "using --epoch. If True, it would decode with the averaged model "
-        "over the epoch range from `epoch-avg` (excluded) to `epoch`. "
-        "Actually only the models with epoch number of `epoch-avg` and "
-        "`epoch` are loaded for averaging. ",
-    )
-
-    parser.add_argument(
-        "--exp-dir",
-        type=str,
-        default="pruned_transducer_stateless2/exp",
-        help="The experiment dir",
-    )
-
-    parser.add_argument(
-        "--bpe-model",
-        type=str,
-        default="data/lang_bpe_500/bpe.model",
-        help="Path to the BPE model",
-    )
-
-    parser.add_argument(
-        "--decoding-method",
-        type=str,
-        default="greedy_search",
-        help="""Supported decoding methods are:
-        greedy_search
-        modified_beam_search
-        fast_beam_search
-        """,
-    )
-
-    # The dashed spelling matches the other options; the original underscore
-    # spelling is kept as an alias so existing command lines keep working.
-    parser.add_argument(
-        "--num-active-paths",
-        "--num_active_paths",
-        dest="num_active_paths",
-        type=int,
-        default=4,
-        help="""An integer indicating how many candidates we will keep for each
-        frame. Used only when --decoding-method is modified_beam_search.""",
-    )
-
-    parser.add_argument(
-        "--beam",
-        type=float,
-        default=4,
-        help="""A floating point value to calculate the cutoff score during beam
-        search (i.e., `cutoff = max-score - beam`), which is the same as the
-        `beam` in Kaldi.
-        Used only when --decoding-method is fast_beam_search""",
-    )
-
-    parser.add_argument(
-        "--max-contexts",
-        type=int,
-        default=4,
-        help="""Used only when --decoding-method is
-        fast_beam_search""",
-    )
-
-    parser.add_argument(
-        "--max-states",
-        type=int,
-        default=32,
-        help="""Used only when --decoding-method is
-        fast_beam_search""",
-    )
-
-    parser.add_argument(
-        "--context-size",
-        type=int,
-        default=2,
-        help="The context size in the decoder. 1 means bigram; 2 means tri-gram",
-    )
-
-    parser.add_argument(
-        "--num-decode-streams",
-        type=int,
-        default=2000,
-        help="The number of streams that can be decoded parallel.",
-    )
-
-    add_model_arguments(parser)
-
-    return parser
-
-
-def decode_one_chunk(
-    params: AttributeDict,
-    model: nn.Module,
-    decode_streams: List[DecodeStream],
-) -> List[int]:
-    """Run one chunk of features of every stream through the model.
-
-    The encoder states and the processed-frame counter of every stream are
-    updated, and the decoding method selected by ``params.decoding_method``
-    updates the hypotheses of the streams.
-
-    Args:
-      params:
-        It's the return value of :func:`get_params`.
-      model:
-        The neural model.
-      decode_streams:
-        A List of DecodeStream, each belonging to a utterance.
-    Returns:
-      The indexes (into ``decode_streams``) of the streams that are finished.
-    Raises:
-      ValueError: If ``params.decoding_method`` is not supported.
-    """
-    device = model.device
-
-    chunk_feats = []
-    chunk_lens = []
-    stream_states = []
-    frames_done = []
-    for stream in decode_streams:
-        feat, feat_len = stream.get_feature_frames(params.decode_chunk_len)
-        chunk_feats.append(feat)
-        chunk_lens.append(feat_len)
-        stream_states.append(stream.states)
-        frames_done.append(stream.done_frames)
-
-    feature_lens = torch.tensor(chunk_lens, device=device)
-    features = pad_sequence(chunk_feats, batch_first=True, padding_value=LOG_EPS)
-
-    # We subsample features with ((x_len - 7) // 2 + 1) // 2 and the max downsampling
-    # factor in encoders is 8.
-    # After feature embedding (x_len - 7) // 2, we have (23 - 7) // 2 = 8.
-    tail_length = 23
-    shortfall = tail_length - features.size(1)
-    if shortfall > 0:
-        # Extend too-short chunks with LOG_EPS frames up to tail_length.
-        feature_lens += shortfall
-        features = torch.nn.functional.pad(
-            features,
-            (0, 0, 0, shortfall),
-            mode="constant",
-            value=LOG_EPS,
-        )
-
-    stacked_states = stack_states(stream_states)
-    processed_lens = torch.tensor(frames_done, device=device)
-
-    encoder_out, encoder_out_lens, new_states = model.encoder.streaming_forward(
-        x=features,
-        x_lens=feature_lens,
-        states=stacked_states,
-    )
-
-    # The search functions call the joiner with project_input=False, so the
-    # encoder output is projected here.
-    encoder_out = model.joiner.encoder_proj(encoder_out)
-
-    method = params.decoding_method
-    if method == "greedy_search":
-        greedy_search(model=model, encoder_out=encoder_out, streams=decode_streams)
-    elif method == "fast_beam_search":
-        fast_beam_search_one_best(
-            model=model,
-            encoder_out=encoder_out,
-            processed_lens=processed_lens + encoder_out_lens,
-            streams=decode_streams,
-            beam=params.beam,
-            max_states=params.max_states,
-            max_contexts=params.max_contexts,
-        )
-    elif method == "modified_beam_search":
-        modified_beam_search(
-            model=model,
-            streams=decode_streams,
-            encoder_out=encoder_out,
-            num_active_paths=params.num_active_paths,
-        )
-    else:
-        raise ValueError(f"Unsupported decoding method: {params.decoding_method}")
-
-    per_stream_states = unstack_states(new_states)
-
-    finished_streams = []
-    for idx, stream in enumerate(decode_streams):
-        stream.states = per_stream_states[idx]
-        stream.done_frames += encoder_out_lens[idx]
-        if stream.done:
-            finished_streams.append(idx)
-
-    return finished_streams
-
-
-def _pop_finished_results(
-    decode_streams: List[DecodeStream],
-    finished_streams: List[int],
-    sp: spm.SentencePieceProcessor,
-) -> List[Tuple[str, List[str], List[str]]]:
-    """Remove the finished streams and return their (id, ref, hyp) results."""
-    results = []
-    # Delete from the highest index down so the remaining indexes stay valid.
-    for i in sorted(finished_streams, reverse=True):
-        stream = decode_streams[i]
-        results.append(
-            (
-                stream.id,
-                stream.ground_truth.split(),
-                sp.decode(stream.decoding_result()).split(),
-            )
-        )
-        del decode_streams[i]
-    return results
-
-
-def decode_dataset(
-    cuts: CutSet,
-    params: AttributeDict,
-    model: nn.Module,
-    sp: spm.SentencePieceProcessor,
-    decoding_graph: Optional[k2.Fsa] = None,
-) -> Dict[str, List[Tuple[str, List[str], List[str]]]]:
-    """Decode dataset.
-
-    Every cut gets its own DecodeStream. Whenever at least
-    ``params.num_decode_streams`` streams are active, they are decoded chunk
-    by chunk until fewer remain; the remaining streams are decoded to the end
-    after the last cut.
-
-    Args:
-      cuts:
-        Lhotse Cutset containing the dataset to decode.
-      params:
-        It is returned by :func:`get_params`.
-      model:
-        The neural model.
-      sp:
-        The BPE model.
-      decoding_graph:
-        The decoding graph. Can be either a `k2.trivial_graph` or HLG, Used
-        only when --decoding-method is fast_beam_search.
-    Returns:
-      Return a dict with a single key describing the decoding setup:
-      "greedy_search", "beam_..._max_contexts_..._max_states_..." for
-      fast_beam_search, or "num_active_paths_..." for modified_beam_search.
-      Its value is a list of tuples. Each tuple contains three elements:
-      the id of the decode stream, the reference transcript as a list of
-      words, and the predicted result as a list of words.
-    Raises:
-      ValueError: If ``params.decoding_method`` is not supported.
-    """
-    device = model.device
-
-    opts = FbankOptions()
-    opts.device = device
-    opts.frame_opts.dither = 0
-    opts.frame_opts.snip_edges = False
-    opts.frame_opts.samp_freq = 16000
-    opts.mel_opts.num_bins = 80
-    # The feature extractor depends only on `opts`, so it is built once
-    # instead of once per cut.
-    fbank = Fbank(opts)
-
-    log_interval = 50
-
-    decode_results = []
-    # Contain decode streams currently running.
-    decode_streams = []
-    for num, cut in enumerate(cuts):
-        # each utterance has a DecodeStream.
-        initial_states = model.encoder.get_init_state(device=device)
-        decode_stream = DecodeStream(
-            params=params,
-            cut_id=cut.id,
-            initial_states=initial_states,
-            decoding_graph=decoding_graph,
-            device=device,
-        )
-
-        audio: np.ndarray = cut.load_audio()
-        # audio.shape: (1, num_samples)
-        assert len(audio.shape) == 2
-        assert audio.shape[0] == 1, "Should be single channel"
-        assert audio.dtype == np.float32, audio.dtype
-
-        # The trained model is using normalized samples
-        assert audio.max() <= 1, "Should be normalized to [-1, 1])"
-
-        samples = torch.from_numpy(audio).squeeze(0)
-
-        feature = fbank(samples.to(device))
-        decode_stream.set_features(feature, tail_pad_len=params.decode_chunk_len)
-        decode_stream.ground_truth = cut.supervisions[0].text
-
-        decode_streams.append(decode_stream)
-
-        # Decode while the pool of active streams is full.
-        while len(decode_streams) >= params.num_decode_streams:
-            finished_streams = decode_one_chunk(
-                params=params, model=model, decode_streams=decode_streams
-            )
-            decode_results.extend(
-                _pop_finished_results(decode_streams, finished_streams, sp)
-            )
-
-        if num % log_interval == 0:
-            logging.info(f"Cuts processed until now is {num}.")
-
-    # decode final chunks of last sequences
-    while len(decode_streams):
-        finished_streams = decode_one_chunk(
-            params=params, model=model, decode_streams=decode_streams
-        )
-        decode_results.extend(
-            _pop_finished_results(decode_streams, finished_streams, sp)
-        )
-
-    if params.decoding_method == "greedy_search":
-        key = "greedy_search"
-    elif params.decoding_method == "fast_beam_search":
-        key = (
-            f"beam_{params.beam}_"
-            f"max_contexts_{params.max_contexts}_"
-            f"max_states_{params.max_states}"
-        )
-    elif params.decoding_method == "modified_beam_search":
-        key = f"num_active_paths_{params.num_active_paths}"
-    else:
-        raise ValueError(f"Unsupported decoding method: {params.decoding_method}")
-    return {key: decode_results}
-
-
-def save_results(
-    params: AttributeDict,
-    test_set_name: str,
-    results_dict: Dict[str, List[Tuple[str, List[str], List[str]]]],
-):
-    """Write transcripts, error statistics and a WER summary for one test set.
-
-    For every key of ``results_dict`` a ``recogs-*`` and an ``errs-*`` file are
-    written to ``params.res_dir``; afterwards one ``wer-summary-*`` file lists
-    the WER of every key, and the same summary is logged.
-
-    Args:
-      params:
-        Provides ``res_dir`` (output directory) and ``suffix`` (part of the
-        output file names).
-      test_set_name:
-        Name of the test set; used in file names and log messages.
-      results_dict:
-        Maps a decoding-setup key to its list of result tuples, as returned
-        by :func:`decode_dataset`.
-    """
-    if not results_dict:
-        # There is nothing to write. Previously this case failed with a
-        # NameError when the summary file name was built.
-        logging.warning(f"No results to save for {test_set_name}")
-        return
-
-    test_set_wers = dict()
-    for key, results in results_dict.items():
-        recog_path = (
-            params.res_dir / f"recogs-{test_set_name}-{key}-{params.suffix}.txt"
-        )
-        results = sorted(results)
-        store_transcripts(filename=recog_path, texts=results)
-        logging.info(f"The transcripts are stored in {recog_path}")
-
-        # The following prints out WERs, per-word error statistics and aligned
-        # ref/hyp pairs.
-        errs_filename = (
-            params.res_dir / f"errs-{test_set_name}-{key}-{params.suffix}.txt"
-        )
-        with open(errs_filename, "w") as f:
-            wer = write_error_stats(
-                f, f"{test_set_name}-{key}", results, enable_log=True
-            )
-            test_set_wers[key] = wer
-
-        logging.info("Wrote detailed error stats to {}".format(errs_filename))
-
-    # The summary file name carries the last key of results_dict. It is taken
-    # explicitly here rather than from the leaked loop variable, which keeps
-    # the file name unchanged.
-    summary_key = list(results_dict)[-1]
-    test_set_wers = sorted(test_set_wers.items(), key=lambda x: x[1])
-    errs_info = (
-        params.res_dir
-        / f"wer-summary-{test_set_name}-{summary_key}-{params.suffix}.txt"
-    )
-    with open(errs_info, "w") as f:
-        print("settings\tWER", file=f)
-        for key, val in test_set_wers:
-            print("{}\t{}".format(key, val), file=f)
-
-    # Only the first (lowest-WER) line is marked as the best one.
-    s = "\nFor {}, WER of different settings are:\n".format(test_set_name)
-    note = "\tbest for {}".format(test_set_name)
-    for key, val in test_set_wers:
-        s += "{}\t{}{}\n".format(key, val, note)
-        note = ""
-    logging.info(s)
-
-
-@torch.no_grad()
-def main():
-    """Entry point of the streaming decoding script.
-
-    Parses the command line, builds the model, loads either a single
-    checkpoint, an average of checkpoints or the averaged model (depending on
-    --use-averaged-model, --iter, --epoch and --avg), then decodes test-clean
-    and test-other and saves the results under
-    ``exp_dir/streaming/<decoding_method>``.
-
-    Raises:
-      ValueError: If fewer checkpoints than required are found for --iter.
-    """
-    parser = get_parser()
-    LibriSpeechAsrDataModule.add_arguments(parser)
-    args = parser.parse_args()
-    args.exp_dir = Path(args.exp_dir)
-
-    params = get_params()
-    params.update(vars(args))
-
-    params.res_dir = params.exp_dir / "streaming" / params.decoding_method
-
-    if params.iter > 0:
-        params.suffix = f"iter-{params.iter}-avg-{params.avg}"
-    else:
-        params.suffix = f"epoch-{params.epoch}-avg-{params.avg}"
-
-    # for streaming
-    params.suffix += f"-streaming-chunk-size-{params.decode_chunk_len}"
-
-    # for fast_beam_search
-    if params.decoding_method == "fast_beam_search":
-        params.suffix += f"-beam-{params.beam}"
-        params.suffix += f"-max-contexts-{params.max_contexts}"
-        params.suffix += f"-max-states-{params.max_states}"
-
-    if params.use_averaged_model:
-        params.suffix += "-use-averaged-model"
-
-    setup_logger(f"{params.res_dir}/log-decode-{params.suffix}")
-    logging.info("Decoding started")
-
-    device = torch.device("cpu")
-    if torch.cuda.is_available():
-        device = torch.device("cuda", 0)
-
-    logging.info(f"Device: {device}")
-
-    sp = spm.SentencePieceProcessor()
-    sp.load(params.bpe_model)
-
-    # <blk> and <unk> is defined in local/train_bpe_model.py
-    params.blank_id = sp.piece_to_id("<blk>")
-    params.unk_id = sp.piece_to_id("<unk>")
-    params.vocab_size = sp.get_piece_size()
-
-    logging.info(params)
-
-    logging.info("About to create model")
-    model = get_transducer_model(params)
-
-    if not params.use_averaged_model:
-        if params.iter > 0:
-            filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
-                : params.avg
-            ]
-            if len(filenames) == 0:
-                raise ValueError(
-                    f"No checkpoints found for"
-                    f" --iter {params.iter}, --avg {params.avg}"
-                )
-            elif len(filenames) < params.avg:
-                raise ValueError(
-                    f"Not enough checkpoints ({len(filenames)}) found for"
-                    f" --iter {params.iter}, --avg {params.avg}"
-                )
-            logging.info(f"averaging {filenames}")
-            model.to(device)
-            model.load_state_dict(average_checkpoints(filenames, device=device))
-        elif params.avg == 1:
-            load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
-        else:
-            start = params.epoch - params.avg + 1
-            # Skip negative epoch numbers. The check used to test `start`
-            # instead of the loop index, which left the list empty whenever
-            # `start` was negative.
-            filenames = [
-                f"{params.exp_dir}/epoch-{i}.pt"
-                for i in range(start, params.epoch + 1)
-                if i >= 0
-            ]
-            logging.info(f"averaging {filenames}")
-            model.to(device)
-            model.load_state_dict(average_checkpoints(filenames, device=device))
-    else:
-        if params.iter > 0:
-            # One more checkpoint than --avg is needed: the oldest one is the
-            # excluded start of the averaging range.
-            filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
-                : params.avg + 1
-            ]
-            if len(filenames) == 0:
-                raise ValueError(
-                    f"No checkpoints found for"
-                    f" --iter {params.iter}, --avg {params.avg}"
-                )
-            elif len(filenames) < params.avg + 1:
-                raise ValueError(
-                    f"Not enough checkpoints ({len(filenames)}) found for"
-                    f" --iter {params.iter}, --avg {params.avg}"
-                )
-            filename_start = filenames[-1]
-            filename_end = filenames[0]
-            logging.info(
-                "Calculating the averaged model over iteration checkpoints"
-                f" from {filename_start} (excluded) to {filename_end}"
-            )
-            model.to(device)
-            model.load_state_dict(
-                average_checkpoints_with_averaged_model(
-                    filename_start=filename_start,
-                    filename_end=filename_end,
-                    device=device,
-                )
-            )
-        else:
-            assert params.avg > 0, params.avg
-            start = params.epoch - params.avg
-            assert start >= 1, start
-            filename_start = f"{params.exp_dir}/epoch-{start}.pt"
-            filename_end = f"{params.exp_dir}/epoch-{params.epoch}.pt"
-            logging.info(
-                f"Calculating the averaged model over epoch range from "
-                f"{start} (excluded) to {params.epoch}"
-            )
-            model.to(device)
-            model.load_state_dict(
-                average_checkpoints_with_averaged_model(
-                    filename_start=filename_start,
-                    filename_end=filename_end,
-                    device=device,
-                )
-            )
-
-    model.to(device)
-    model.eval()
-    # decode_one_chunk and decode_dataset read the device from this attribute.
-    model.device = device
-
-    decoding_graph = None
-    if params.decoding_method == "fast_beam_search":
-        decoding_graph = k2.trivial_graph(params.vocab_size - 1, device=device)
-
-    num_param = sum([p.numel() for p in model.parameters()])
-    logging.info(f"Number of model parameters: {num_param}")
-
-    librispeech = LibriSpeechAsrDataModule(args)
-
-    test_clean_cuts = librispeech.test_clean_cuts()
-    test_other_cuts = librispeech.test_other_cuts()
-
-    test_sets = ["test-clean", "test-other"]
-    test_cuts = [test_clean_cuts, test_other_cuts]
-
-    for test_set, test_cut in zip(test_sets, test_cuts):
-        results_dict = decode_dataset(
-            cuts=test_cut,
-            params=params,
-            model=model,
-            sp=sp,
-            decoding_graph=decoding_graph,
-        )
-
-        save_results(
-            params=params,
-            test_set_name=test_set,
-            results_dict=results_dict,
-        )
-
-    logging.info("Done!")
-
-# Run the decoding only when this file is executed as a script.
-if __name__ == "__main__":
-    main()
--- a/egs/libricss/SURT/dprnn_pruned_transducer_stateless7/test_model.py
+++ b/egs/libricss/SURT/dprnn_pruned_transducer_stateless7/test_model.py
@ -1,150 +0,0 @@
-#!/usr/bin/env python3
-# Copyright    2022  Xiaomi Corp.        (authors: Fangjun Kuang)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-"""
-To run this file, do:
-
-    cd icefall/egs/librispeech/ASR
-    python ./pruned_transducer_stateless7_streaming/test_model.py
-"""
-
-import torch
-from scaling_converter import convert_scaled_to_non_scaled
-from train import get_params, get_transducer_model
-
-
-def test_model():
-    params = get_params()
-    params.vocab_size = 500
-    params.blank_id = 0
-    params.context_size = 2
-    params.num_encoder_layers = "2,4,3,2,4"
-    params.feedforward_dims = "1024,1024,2048,2048,1024"
-    params.nhead = "8,8,8,8,8"
-    params.encoder_dims = "384,384,384,384,384"
-    params.attention_dims = "192,192,192,192,192"
-    params.encoder_unmasked_dims = "256,256,256,256,256"
-    params.zipformer_downsampling_factors = "1,2,4,8,2"
-    params.cnn_module_kernels = "31,31,31,31,31"
-    params.decoder_dim = 512
-    params.joiner_dim = 512
-    params.num_left_chunks = 4
-    params.short_chunk_size = 50
-    params.decode_chunk_len = 32
-    model = get_transducer_model(params)
-
-    num_param = sum([p.numel() for p in model.parameters()])
-    print(f"Number of model parameters: {num_param}")
-
-    # Test jit script
-    convert_scaled_to_non_scaled(model, inplace=True)
-    # We won't use the forward() method of the model in C++, so just ignore
-    # it here.
-    # Otherwise, one of its arguments is a ragged tensor and is not
-    # torch scriptabe.
-    model.__class__.forward = torch.jit.ignore(model.__class__.forward)
-    print("Using torch.jit.script")
-    model = torch.jit.script(model)
-
-
-def test_model_jit_trace():
-    params = get_params()
-    params.vocab_size = 500
-    params.blank_id = 0
-    params.context_size = 2
-    params.num_encoder_layers = "2,4,3,2,4"
-    params.feedforward_dims = "1024,1024,2048,2048,1024"
-    params.nhead = "8,8,8,8,8"
-    params.encoder_dims = "384,384,384,384,384"
-    params.attention_dims = "192,192,192,192,192"
-    params.encoder_unmasked_dims = "256,256,256,256,256"
-    params.zipformer_downsampling_factors = "1,2,4,8,2"
-    params.cnn_module_kernels = "31,31,31,31,31"
-    params.decoder_dim = 512
-    params.joiner_dim = 512
-    params.num_left_chunks = 4
-    params.short_chunk_size = 50
-    params.decode_chunk_len = 32
-    model = get_transducer_model(params)
-    model.eval()
-
-    num_param = sum([p.numel() for p in model.parameters()])
-    print(f"Number of model parameters: {num_param}")
-
-    convert_scaled_to_non_scaled(model, inplace=True)
-
-    # Test encoder
-    def _test_encoder():
-        encoder = model.encoder
-        assert encoder.decode_chunk_size == params.decode_chunk_len // 2, (
-            encoder.decode_chunk_size,
-            params.decode_chunk_len,
-        )
-        T = params.decode_chunk_len + 7
-
-        x = torch.zeros(1, T, 80, dtype=torch.float32)
-        x_lens = torch.full((1,), T, dtype=torch.int32)
-        states = encoder.get_init_state(device=x.device)
-        encoder.__class__.forward = encoder.__class__.streaming_forward
-        traced_encoder = torch.jit.trace(encoder, (x, x_lens, states))
-
-        states1 = encoder.get_init_state(device=x.device)
-        states2 = traced_encoder.get_init_state(device=x.device)
-        for i in range(5):
-            x = torch.randn(1, T, 80, dtype=torch.float32)
-            x_lens = torch.full((1,), T, dtype=torch.int32)
-            y1, _, states1 = encoder.streaming_forward(x, x_lens, states1)
-            y2, _, states2 = traced_encoder(x, x_lens, states2)
-            assert torch.allclose(y1, y2, atol=1e-6), (i, (y1 - y2).abs().mean())
-
-    # Test decoder
-    def _test_decoder():
-        decoder = model.decoder
-        y = torch.zeros(10, decoder.context_size, dtype=torch.int64)
-        need_pad = torch.tensor([False])
-
-        traced_decoder = torch.jit.trace(decoder, (y, need_pad))
-        d1 = decoder(y, need_pad)
-        d2 = traced_decoder(y, need_pad)
-        assert torch.equal(d1, d2), (d1 - d2).abs().mean()
-
-    # Test joiner
-    def _test_joiner():
-        joiner = model.joiner
-        encoder_out_dim = joiner.encoder_proj.weight.shape[1]
-        decoder_out_dim = joiner.decoder_proj.weight.shape[1]
-        encoder_out = torch.rand(1, encoder_out_dim, dtype=torch.float32)
-        decoder_out = torch.rand(1, decoder_out_dim, dtype=torch.float32)
-
-        traced_joiner = torch.jit.trace(joiner, (encoder_out, decoder_out))
-        j1 = joiner(encoder_out, decoder_out)
-        j2 = traced_joiner(encoder_out, decoder_out)
-        assert torch.equal(j1, j2), (j1 - j2).abs().mean()
-
-    _test_encoder()
-    _test_decoder()
-    _test_joiner()
-
-
-def main():
-    test_model()
-    test_model_jit_trace()
-
-
-if __name__ == "__main__":
-    main()
--- a/egs/libricss/SURT/dprnn_pruned_transducer_stateless7/train.py
+++ b/egs/libricss/SURT/dprnn_pruned_transducer_stateless7/train.py
--- a/egs/libricss/SURT/dprnn_pruned_transducer_stateless7/zipformer.py
+++ b/egs/libricss/SURT/dprnn_pruned_transducer_stateless7/zipformer.py
--- a/egs/libricss/SURT/local/compute_fbank_libricss.py
+++ b/egs/libricss/SURT/local/compute_fbank_libricss.py
@ -25,6 +25,7 @@ The generated fbank features are saved in data/fbank.
 import logging
 from pathlib import Path

+import pyloudnorm as pyln
 import torch
 import torch.multiprocessing
 from lhotse import LilcomChunkyWriter, load_manifest_lazy
@ -69,6 +70,11 @@ def compute_fbank_libricss():
        dev_cuts = cuts.filter(lambda c: "session0" in c.id)
        test_cuts = cuts.filter(lambda c: "session0" not in c.id)

+        # If SDM cuts, apply loudness normalization
+        if name == "sdm":
+            dev_cuts = dev_cuts.normalize_loudness(target=-23.0)
+            test_cuts = test_cuts.normalize_loudness(target=-23.0)
+
        logging.info(f"Extracting fbank features for {name} dev cuts")
        _ = dev_cuts.compute_and_store_features_batch(
            extractor=extractor,
--- a/egs/libricss/SURT/local/compute_fbank_librimix.py
+++ b/egs/libricss/SURT/local/compute_fbank_librimix.py
@ -1,115 +0,0 @@
-#!/usr/bin/env python3
-# Copyright    2022  Johns Hopkins University        (authors: Desh Raj)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-"""
-This file computes fbank features of the synthetically mixed LibriSpeech
-train and dev sets.
-It looks for manifests in the directory data/manifests.
-
-The generated fbank features are saved in data/fbank.
-"""
-import logging
-from pathlib import Path
-
-import torch
-import torch.multiprocessing
-from lhotse import LilcomChunkyWriter
-from lhotse.features.kaldifeat import (
-    KaldifeatFbank,
-    KaldifeatFbankConfig,
-    KaldifeatFrameOptions,
-    KaldifeatMelOptions,
-)
-from lhotse.recipes.utils import read_manifests_if_cached
-
-# Torch's multithreaded behavior needs to be disabled or
-# it wastes a lot of CPU and slow things down.
-# Do this outside of main() in case it needs to take effect
-# even when we are not invoking the main (e.g. when spawning subprocesses).
-torch.set_num_threads(1)
-torch.set_num_interop_threads(1)
-torch.multiprocessing.set_sharing_strategy("file_system")
-
-
-def compute_fbank_librimix():
-    src_dir = Path("data/manifests")
-    output_dir = Path("data/fbank")
-
-    sampling_rate = 16000
-    num_mel_bins = 80
-
-    extractor = KaldifeatFbank(
-        KaldifeatFbankConfig(
-            frame_opts=KaldifeatFrameOptions(sampling_rate=sampling_rate),
-            mel_opts=KaldifeatMelOptions(num_bins=num_mel_bins),
-            device="cuda",
-        )
-    )
-
-    logging.info("Reading manifests")
-    manifests = read_manifests_if_cached(
-        dataset_parts=["train_norvb_v1", "dev_norvb_v1"],
-        types=["cuts"],
-        output_dir=src_dir,
-        prefix="libri-mix",
-        suffix="jsonl.gz",
-        lazy=True,
-    )
-
-    train_cuts = manifests["train_norvb_v1"]["cuts"]
-    dev_cuts = manifests["dev_norvb_v1"]["cuts"]
-    # train_2spk_cuts = manifests["train_2spk_norvb"]["cuts"]
-
-    logging.info("Extracting fbank features for training cuts")
-    _ = train_cuts.compute_and_store_features_batch(
-        extractor=extractor,
-        storage_path=output_dir / "librimix_feats_train_norvb_v1",
-        manifest_path=src_dir / "cuts_train_norvb_v1.jsonl.gz",
-        batch_duration=5000,
-        num_workers=4,
-        storage_type=LilcomChunkyWriter,
-        overwrite=True,
-    )
-
-    logging.info("Extracting fbank features for dev cuts")
-    _ = dev_cuts.compute_and_store_features_batch(
-        extractor=extractor,
-        storage_path=output_dir / "librimix_feats_dev_norvb_v1",
-        manifest_path=src_dir / "cuts_dev_norvb_v1.jsonl.gz",
-        batch_duration=5000,
-        num_workers=4,
-        storage_type=LilcomChunkyWriter,
-        overwrite=True,
-    )
-
-    # logging.info("Extracting fbank features for 2-spk train cuts")
-    # _ = train_2spk_cuts.compute_and_store_features_batch(
-    #     extractor=extractor,
-    #     storage_path=output_dir / "librimix_feats_train_2spk_norvb",
-    #     manifest_path=src_dir / "cuts_train_2spk_norvb.jsonl.gz",
-    #     batch_duration=5000,
-    #     num_workers=4,
-    #     storage_type=LilcomChunkyWriter,
-    #     overwrite=True,
-    # )
-
-
-if __name__ == "__main__":
-    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
-    logging.basicConfig(format=formatter, level=logging.INFO)
-    compute_fbank_librimix()
--- a/egs/libricss/SURT/local/compute_fbank_librispeech.py
+++ b/egs/libricss/SURT/local/compute_fbank_librispeech.py
@ -25,7 +25,6 @@ The generated fbank features are saved in data/fbank.

 import logging
 from pathlib import Path
-from typing import Optional

 import torch
 from lhotse import CutSet, LilcomChunkyWriter
@ -43,17 +42,17 @@ from lhotse.recipes.utils import read_manifests_if_cached
 # even when we are not invoking the main (e.g. when spawning subprocesses).
 torch.set_num_threads(1)
 torch.set_num_interop_threads(1)
+torch.multiprocessing.set_sharing_strategy("file_system")


-def compute_fbank_librispeech(bpe_model: Optional[str] = None):
+def compute_fbank_librispeech():
    src_dir = Path("data/manifests")
    output_dir = Path("data/fbank")
    num_mel_bins = 80

    dataset_parts = (
-        # "dev-clean",
-        # "train-clean-100",
-        # "train-clean-360",
+        "train-clean-100",
+        "train-clean-360",
        "train-other-500",
    )
    prefix = "librispeech"
@ -92,8 +91,7 @@ def compute_fbank_librispeech(bpe_model: Optional[str] = None):
            supervisions=m["supervisions"],
        )

-        if "train" in partition:
-            cut_set = cut_set + cut_set.perturb_speed(0.9) + cut_set.perturb_speed(1.1)
+        cut_set = cut_set + cut_set.perturb_speed(0.9) + cut_set.perturb_speed(1.1)

        cut_set = cut_set.compute_and_store_features_batch(
            extractor=extractor,
--- a/egs/libricss/SURT/local/compute_fbank_lsmix.py
+++ b/egs/libricss/SURT/local/compute_fbank_lsmix.py
@ -0,0 +1,188 @@
+#!/usr/bin/env python3
+# Copyright    2022  Johns Hopkins University        (authors: Desh Raj)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+"""
+This file computes fbank features of the synthetically mixed LibriSpeech
+train and dev sets.
+It looks for manifests in the directory data/manifests.
+
+The generated fbank features are saved in data/fbank.
+"""
+import logging
+import random
+import warnings
+from pathlib import Path
+
+import torch
+import torch.multiprocessing
+from lhotse import LilcomChunkyWriter, load_manifest
+from lhotse.cut import MixedCut, MixTrack, MultiCut
+from lhotse.features.kaldifeat import (
+    KaldifeatFbank,
+    KaldifeatFbankConfig,
+    KaldifeatFrameOptions,
+    KaldifeatMelOptions,
+)
+from lhotse.recipes.utils import read_manifests_if_cached
+from lhotse.utils import fix_random_seed, uuid4
+
+# Torch's multithreaded behavior needs to be disabled or
+# it wastes a lot of CPU and slow things down.
+# Do this outside of main() in case it needs to take effect
+# even when we are not invoking the main (e.g. when spawning subprocesses).
+torch.set_num_threads(1)
+torch.set_num_interop_threads(1)
+torch.multiprocessing.set_sharing_strategy("file_system")
+
+
+def compute_fbank_lsmix():
+    """
+    Extract and store fbank features for the simulated LibriSpeech mixtures.
+
+    Two lazily-read cut sets ("train_clean_full" and "train_clean_ov40", manifest
+    prefix "lsmix") are taken from data/manifests. For each of them an augmented
+    "rvb" variant is derived through `augment`, using only the RIR and noise
+    recordings whose id contains "RVB2014". Features for all four sets are
+    computed with a CUDA Kaldifeat fbank extractor and written under data/fbank,
+    and the resulting cut manifests are written to data/manifests.
+    """
+    manifest_dir = Path("data/manifests")
+    feats_dir = Path("data/fbank")
+
+    # 80-bin fbank on 16 kHz audio, computed on the GPU.
+    fbank_config = KaldifeatFbankConfig(
+        frame_opts=KaldifeatFrameOptions(sampling_rate=16000),
+        mel_opts=KaldifeatMelOptions(num_bins=80),
+        device="cuda",
+    )
+    extractor = KaldifeatFbank(fbank_config)
+
+    logging.info("Reading manifests")
+    manifests = read_manifests_if_cached(
+        dataset_parts=["train_clean_full", "train_clean_ov40"],
+        types=["cuts"],
+        output_dir=manifest_dir,
+        prefix="lsmix",
+        suffix="jsonl.gz",
+        lazy=True,
+    )
+    clean_full = manifests["train_clean_full"]["cuts"]
+    clean_ov40 = manifests["train_clean_ov40"]["cuts"]
+
+    def _is_reverb_challenge(recording):
+        # Keep only the material that comes from the REVERB challenge.
+        return "RVB2014" in recording.id
+
+    real_rirs = load_manifest(manifest_dir / "real-rir_recordings_all.jsonl.gz").filter(
+        _is_reverb_challenge
+    )
+    noises = load_manifest(manifest_dir / "iso-noise_recordings_all.jsonl.gz").filter(
+        _is_reverb_challenge
+    )
+
+    def _perturb(c):
+        # Same augmentation recipe for every reverberant training set.
+        return augment(
+            c, perturb_snr=True, rirs=real_rirs, noises=noises, perturb_loudness=True
+        )
+
+    logging.info("Applying perturbation to the training cuts")
+    cs = {
+        "clean_full": clean_full,
+        "clean_ov40": clean_ov40,
+        "rvb_full": clean_full.map(_perturb),
+        "rvb_ov40": clean_ov40.map(_perturb),
+    }
+
+    for type_affix in ["full", "ov40"]:
+        for rvb_affix in ["clean", "rvb"]:
+            logging.info(
+                f"Extracting fbank features for {type_affix} {rvb_affix} training cuts"
+            )
+            storage_path = output_dir_for = (
+                feats_dir / f"lsmix_feats_train_{rvb_affix}_{type_affix}"
+            )
+            manifest_path = (
+                manifest_dir / f"cuts_train_{rvb_affix}_{type_affix}.jsonl.gz"
+            )
+            # Warnings raised during extraction are deliberately silenced.
+            with warnings.catch_warnings():
+                warnings.simplefilter("ignore")
+                cs[f"{rvb_affix}_{type_affix}"].compute_and_store_features_batch(
+                    extractor=extractor,
+                    storage_path=storage_path,
+                    manifest_path=manifest_path,
+                    batch_duration=5000,
+                    num_workers=4,
+                    storage_type=LilcomChunkyWriter,
+                    overwrite=True,
+                )
+
+
+def augment(cut, perturb_snr=False, rirs=None, noises=None, perturb_loudness=False):
+    """
+    Return an augmented version of a mixed cut, with its features dropped.
+
+    The stages below are all optional and always run in this order:
+    - SNR perturbation (``perturb_snr``): every track except the first gets an
+      SNR drawn uniformly from [-5, 5] dB
+    - Reverberation (``rirs`` is not None): one randomly chosen channel of a
+      randomly chosen RIR is applied
+    - Additive noise (``noises`` is not None): a randomly chosen noise recording
+      is repeated to the cut's duration and mixed in at an SNR drawn uniformly
+      from [10, 30] dB
+    - Loudness perturbation (``perturb_loudness``): the mix is normalized to a
+      target loudness drawn uniformly from [-25, -20]
+
+    All random values come from the global ``random`` module state.
+    """
+    augmented = cut.drop_features()
+
+    # Perturb the SNRs (optional)
+    if perturb_snr:
+        # One value is drawn per track; the first one is never applied because
+        # the first track is the reference.
+        sampled_snrs = [random.uniform(-5, 5) for _ in cut.tracks]
+        for track, snr in list(zip(augmented.tracks, sampled_snrs))[1:]:
+            track.snr = snr
+
+    # Reverberate the cut (optional)
+    if rirs is not None:
+        rir = random.choice(rirs)
+        channel = random.choice(range(rir.num_channels))
+        augmented = augmented.reverb_rir(rir_recording=rir, rir_channels=[channel])
+
+    # Add noise (optional)
+    if noises is not None:
+        noise_cut = random.choice(noises).to_cut()
+        if isinstance(noise_cut, MultiCut):
+            # Use only the first channel of a multi-channel noise recording.
+            noise_cut = noise_cut.to_mono()[0]
+        noise_snr = random.uniform(10, 30)
+        # The noise must span the whole cut before it is mixed in.
+        noise_cut = repeat_cut(noise_cut, augmented.duration)
+        tracks = [
+            MixTrack(cut=augmented, type="MixedCut"),
+            MixTrack(cut=noise_cut, type="DataCut", snr=noise_snr),
+        ]
+        augmented = MixedCut(id=augmented.id, tracks=tracks)
+
+    # Perturb the loudness (optional)
+    if not perturb_loudness:
+        return augmented
+    return augmented.normalize_loudness(random.uniform(-20, -25), mix_first=True)
+
+
+def repeat_cut(cut, duration):
+    """
+    Extend ``cut`` by repetition until it lasts at least ``duration``, then
+    truncate the result to exactly ``duration``.
+
+    On every pass the current cut is mixed with a copy of itself that starts
+    where the current cut ends. A cut that is already long enough is only
+    truncated.
+    """
+    repeated = cut
+    while repeated.duration < duration:
+        repeated = repeated.mix(repeated, offset_other_by=repeated.duration)
+    return repeated.truncate(duration=duration)
+
+
+if __name__ == "__main__":
+    # Log lines carry a timestamp, the level, and the emitting file and line.
+    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
+    logging.basicConfig(format=formatter, level=logging.INFO)
+
+    # Seed the RNGs before any augmentation parameters are sampled.
+    # NOTE(review): the cut sets are read lazily and features are extracted with
+    # num_workers=4, so confirm this seed also governs the draws made in `augment`.
+    fix_random_seed(42)
+    compute_fbank_lsmix()
--- a/egs/libricss/SURT/prepare.sh
+++ b/egs/libricss/SURT/prepare.sh
@ -4,7 +4,6 @@ set -eou pipefail

 stage=-1
 stop_stage=100
-use_gss=true  # Use GSS-based enhancement with MDM setting

 # We assume dl_dir (download dir) contains the following
 # directories and files. If not, they will be downloaded
@ -24,8 +23,10 @@ use_gss=true  # Use GSS-based enhancement with MDM setting
 #     - noise
 #     - speech
 #
+#  - $dl_dir/rirs_noises
+#      This directory contains the RIRS_NOISES corpus downloaded from https://openslr.org/28/.
+#
 dl_dir=$PWD/download
-cmd="queue-freegpu.pl --config conf/gpu.conf --gpu 1 --mem 4G"

 . shared/parse_options.sh || exit 1

@ -71,6 +72,15 @@ if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
  if [ ! -d $dl_dir/musan ]; then
    lhotse download musan $dl_dir
  fi
+
+  # If you have pre-downloaded it to /path/to/rirs_noises,
+  # you can create a symlink
+  #
+  #   ln -sfv /path/to/rirs_noises $dl_dir/
+  #
+  if [ ! -d $dl_dir/rirs_noises ]; then
+    lhotse download rirs_noises $dl_dir
+  fi
 fi

 if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
@ -94,123 +104,101 @@ if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
 fi

 if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
-  log "Stage 3: Prepare musan manifest"
+  log "Stage 3: Prepare musan manifest and RIRs"
  # We assume that you have downloaded the musan corpus
  # to $dl_dir/musan
  mkdir -p data/manifests
  lhotse prepare musan $dl_dir/musan data/manifests
+
+  # We assume that you have downloaded the RIRS_NOISES corpus
+  # to $dl_dir/rirs_noises
+  lhotse prepare rir-noise -p real_rir -p iso_noise $dl_dir/rirs_noises data/manifests
 fi

 if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
  log "Stage 4: Extract features for LibriSpeech, trim to alignments, and shuffle the cuts"
-  $cmd exp/extract_libri_fbank.log python local/compute_fbank_librispeech.py
+  python local/compute_fbank_librispeech.py
  lhotse combine data/manifests/librispeech_cuts_train* - |\
    lhotse cut trim-to-alignments --type word --max-pause 0.2 - - |\
    shuf | gzip -c > data/manifests/librispeech_cuts_train_trimmed.jsonl.gz
-  lhotse cut trim-to-alignments --type word --max-pause 0.2 data/manifests/librispeech_cuts_dev-clean.jsonl.gz - |\
-    shuf | gzip -c > data/manifests/librispeech_cuts_dev_trimmed.jsonl.gz
 fi

 if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
  log "Stage 5: Create simulated mixtures from LibriSpeech (train and dev). This may take a while."
-  # We create a 2-speaker set which will be used during the model warmup phase, and a
-  # full training set (2,3,4 speakers) that will be used for the subsequent training.
-  # We create anechoic and reverberant versions of both sets. For the full set, we compute
-  # silence and overlap distributions based on LibriCSS sessions (no 0L).
-
-  sim_cmd="queue.pl --mem 16G -l 'num_proc=4,h_rt=600:00:00'"
+  # We create a high overlap set which will be used during the model warmup phase, and a
+  # full training set that will be used for the subsequent training.

  gunzip -c data/manifests/libricss-sdm_supervisions_all.jsonl.gz |\
    grep -v "0L" | grep -v "OV10" |\
    gzip -c > data/manifests/libricss-sdm_supervisions_all_v1.jsonl.gz

-  # 2-speaker anechoic
-  # log "Generating 2-speaker anechoic training set"
-  # $sim_cmd exp/sim_train_2spk.log lhotse workflows simulate-meetings \
-  #   --method conversational \
-  #   --prob-diff-spk-overlap 1.0 \
-  #   --num-meetings 50000 \
-  #   --num-speakers-per-meeting 2 \
-  #   --max-duration-per-speaker 20.0 \
-  #   --max-utterances-per-speaker 1 \
-  #   --seed 1234 \
-  #   --num-jobs 4 \
-  #   data/manifests/librispeech_cuts_train_trimmed.jsonl.gz \
-  #   data/manifests/libri-mix_cuts_train_2spk_norvb.jsonl.gz
+  gunzip -c data/manifests/libricss-sdm_supervisions_all.jsonl.gz |\
+    grep "OV40" |\
+    gzip -c > data/manifests/libricss-sdm_supervisions_ov40.jsonl.gz

-  # 2-speaker reverberant
-  # log "Generating 2-speaker reverberant training set"
-  # lhotse workflows simulate-meetings \
-  #   --method conversational \
-  #   --prob-diff-spk-overlap 1.0 \
-  #   --num-meetings 50000 \
-  #   --num-speakers-per-meeting 2 \
-  #   --max-duration-per-speaker 20.0 \
-  #   --max-utterances-per-speaker 1 \
-  #   --seed 1234 \
-  #   --reverberate \
-  #   --num-jobs 4 \
-  #   data/manifests/librispeech_cuts_train_trimmed.jsonl.gz \
-  #   data/manifests/libri-mix_cuts_train_2spk_rvb.jsonl.gz
+  # Warmup mixtures (100k) based on high overlap (OV40)
+  log "Generating 100k anechoic train mixtures for warmup"
+  lhotse workflows simulate-meetings \
+    --method conversational \
+    --fit-to-supervisions data/manifests/libricss-sdm_supervisions_ov40.jsonl.gz \
+    --num-meetings 100000 \
+    --num-speakers-per-meeting 2,3 \
+    --max-duration-per-speaker 15.0 \
+    --max-utterances-per-speaker 3 \
+    --seed 1234 \
+    --num-jobs 4 \
+    data/manifests/librispeech_cuts_train_trimmed.jsonl.gz \
+    data/manifests/lsmix_cuts_train_clean_ov40.jsonl.gz

  # Full training set (2,3 speakers) anechoic
-  for part in dev train; do
-    if [ $part == "dev" ]; then
-      num_jobs=1
-    else
-      num_jobs=4
-    fi
-    log "Generating anechoic ${part} set (full)"
-    $sim_cmd exp/sim_${part}.log lhotse workflows simulate-meetings \
-      --method conversational \
-      --fit-to-supervisions data/manifests/libricss-sdm_supervisions_all_v1.jsonl.gz \
-      --num-repeats 1 \
-      --num-speakers-per-meeting 2,3 \
-      --max-duration-per-speaker 15.0 \
-      --max-utterances-per-speaker 3 \
-      --seed 1234 \
-      --num-jobs ${num_jobs} \
-      data/manifests/librispeech_cuts_${part}_trimmed.jsonl.gz \
-      data/manifests/libri-mix_cuts_${part}_norvb_v1.jsonl.gz
-  done
-
-  # Full training set (2,3,4 speakers) reverberant
-  # for part in dev train; do
-  #   log "Generating reverberant ${part} set (full)" ``
-  #   lhotse workflows simulate-meetings \
-  #     --method conversational \
-  #     --num-repeats 1 \
-  #     --num-speakers-per-meeting 2,3,4 \
-  #     --max-duration-per-speaker 20.0 \
-  #     --max-utterances-per-speaker 5 \
-  #     --seed 1234 \
-  #     --reverberate \
-  #     data/manifests/librispeech_cuts_${part}_trimmed.jsonl.gz \
-  #     data/manifests/libri-mix_cuts_${part}_rvb.jsonl.gz
-  # done
+  # Only the train split is simulated here, so the split name is spelled out
+  # rather than taken from a loop variable.
+  log "Generating anechoic train set (full)"
+  lhotse workflows simulate-meetings \
+    --method conversational \
+    --fit-to-supervisions data/manifests/libricss-sdm_supervisions_all_v1.jsonl.gz \
+    --num-repeats 1 \
+    --num-speakers-per-meeting 2,3 \
+    --max-duration-per-speaker 15.0 \
+    --max-utterances-per-speaker 3 \
+    --seed 1234 \
+    --num-jobs 4 \
+    data/manifests/librispeech_cuts_train_trimmed.jsonl.gz \
+    data/manifests/lsmix_cuts_train_clean_full.jsonl.gz
 fi

 if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
  log "Stage 6: Compute fbank features for musan"
  mkdir -p data/fbank
-  $cmd exp/feats_musan.log python local/compute_fbank_musan.py
+  python local/compute_fbank_musan.py
 fi

 if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
  log "Stage 7: Compute fbank features for simulated Libri-mix"
  mkdir -p data/fbank
-  $cmd exp/feats_librimix_norvb_v1.log python local/compute_fbank_librimix.py
+  python local/compute_fbank_lsmix.py
 fi

 if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then
-  log "Stage 8: Compute fbank features for LibriCSS"
-  mkdir -p data/fbank
-  $cmd exp/feats_libricss.log python local/compute_fbank_libricss.py
+  log "Stage 8: Add source feats to mixtures (useful for auxiliary tasks)"
+  python local/add_source_feats.py
+
+  log "Combining lsmix-clean and lsmix-rvb"
+  for type in full ov40; do
+    cat <(gunzip -c data/manifests/cuts_train_clean_${type}_sources.jsonl.gz) \
+      <(gunzip -c data/manifests/cuts_train_rvb_${type}_sources.jsonl.gz) |\
+      shuf | gzip -c > data/manifests/cuts_train_${type}_sources.jsonl.gz
+  done
 fi

 if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then
-  log "Stage 9: Download LibriSpeech BPE model from HuggingFace."
-  mkdir -p data/lang_bpe_500 && pushd data/lang_bpe_500
+  log "Stage 9: Compute fbank features for LibriCSS"
+  mkdir -p data/fbank
+  python local/compute_fbank_libricss.py
+fi
+
+if [ $stage -le 10 ] && [ $stop_stage -ge 10 ]; then
+  log "Stage 10: Download LibriSpeech BPE model from HuggingFace."
+  mkdir -p data/lang_bpe_500
+  pushd data/lang_bpe_500
  wget https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/resolve/main/data/lang_bpe_500/bpe.model
  popd
 fi