Merge aeba8b505cd923181410c9ffcac6836f3d469e89 into 34fc1fdf0d8ff520e2bb18267d046ca207c78ef9

Kinan Martin 2025-08-05 04:08:02 +00:00 committed by GitHub
commit fa84791095
33 changed files with 5601 additions and 0 deletions

View File

@ -0,0 +1,19 @@
# Introduction
**Multilingual LibriSpeech (MLS)** is a large multilingual corpus suitable for speech research. The dataset is derived from read audiobooks from LibriVox and covers eight languages: English, German, Dutch, Spanish, French, Italian, Portuguese, and Polish. It includes about 44.5k hours of English and a combined total of about 6k hours across the other seven languages. This icefall training recipe targets the restructured version of the English split of the dataset, available on Hugging Face.
For more details, please visit:
- Dataset: https://huggingface.co/datasets/parler-tts/mls_eng
- Original MLS dataset link: https://www.openslr.org/94
## On-the-fly feature computation
This recipe currently supports only on-the-fly feature extraction: `lhotse` manifests and filterbank features are not pre-computed. In principle, this means the dataset can be streamed directly from Hugging Face, but we have not tested streaming yet. We may add a variant that pre-computes features to better match existing recipes.
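For reference, here is a minimal sketch of how such an on-the-fly pipeline can be wired up with `lhotse` (the dataset name and `num_mel_bins=80` follow this recipe's defaults; streaming mode remains untested):

```python
from datasets import load_dataset
from lhotse import CutSet, Fbank, FbankConfig
from lhotse.dataset import DynamicBucketingSampler, K2SpeechRecognitionDataset
from lhotse.dataset.input_strategies import OnTheFlyFeatures
from torch.utils.data import DataLoader

# Load the restructured MLS English dataset from Hugging Face.
dataset = load_dataset("parler-tts/mls_eng")

# Wrap a split as lhotse cuts; fbank features are computed inside the
# dataset on the fly, so no manifests or feature files are written.
cuts = CutSet.from_huggingface_dataset(dataset["dev"], text_key="transcript")
asr_dataset = K2SpeechRecognitionDataset(
    input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80))),
)
sampler = DynamicBucketingSampler(cuts, max_duration=200.0, shuffle=False)
dataloader = DataLoader(asr_dataset, sampler=sampler, batch_size=None)
```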
<!-- [./RESULTS.md](./RESULTS.md) contains the latest results. -->

View File

@ -0,0 +1,114 @@
#!/usr/bin/env python3
# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang)
# Copyright 2024 Xiaomi Corp. (authors: Xiaoyu Yang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# You can install sentencepiece via:
#
# pip install sentencepiece
#
# Due to an issue reported in
# https://github.com/google/sentencepiece/pull/642#issuecomment-857972030
#
# Please install a version >=0.1.96
import argparse
import shutil
from pathlib import Path
import sentencepiece as spm
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument(
"--lang-dir",
type=str,
help="""Input and output directory.
The generated bpe.model is saved to this directory.
""",
)
parser.add_argument(
"--byte-fallback",
action="store_true",
help="""Whether to enable byte_fallback when training bpe.""",
)
parser.add_argument(
"--character-coverage",
type=float,
default=1.0,
help="Character coverage in vocabulary.",
)
parser.add_argument(
"--transcript",
type=str,
help="Training transcript.",
)
parser.add_argument(
"--vocab-size",
type=int,
help="Vocabulary size for BPE training",
)
return parser.parse_args()
def main():
args = get_args()
vocab_size = args.vocab_size
lang_dir = Path(args.lang_dir)
model_type = "bpe"
model_prefix = f"{lang_dir}/{model_type}_{vocab_size}"
train_text = args.transcript
input_sentence_size = 100000000
user_defined_symbols = ["<blk>", "<sos/eos>"]
unk_id = len(user_defined_symbols)
# Note: unk_id is fixed to 2.
# If you change it, you should also change other
# places that are using it.
model_file = Path(model_prefix + ".model")
if not model_file.is_file():
spm.SentencePieceTrainer.train(
input=train_text,
vocab_size=vocab_size,
model_type=model_type,
model_prefix=model_prefix,
input_sentence_size=input_sentence_size,
character_coverage=args.character_coverage,
user_defined_symbols=user_defined_symbols,
byte_fallback=args.byte_fallback,
unk_id=unk_id,
bos_id=-1,
eos_id=-1,
)
else:
print(f"{model_file} exists - skipping")
return
shutil.copyfile(model_file, f"{lang_dir}/bpe.model")
if __name__ == "__main__":
main()
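A quick sanity check of the token layout this script produces (a sketch, assuming the default vocab_sizes=(2000) from prepare.sh):

import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.load("data/lang/bpe_2000/bpe.model")
# user_defined_symbols occupy ids 0 and 1; unk_id is fixed to 2 (see the note above).
assert sp.piece_to_id("<blk>") == 0
assert sp.piece_to_id("<sos/eos>") == 1
assert sp.unk_id() == 2
print(sp.encode("hello world", out_type=str))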

View File

@ -0,0 +1,253 @@
# Copyright 2021 Piotr Żelasko
# Copyright 2022 Xiaomi Corporation (Author: Mingshuang Luo)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import inspect
import logging
from functools import lru_cache
from pathlib import Path
from typing import Any, Dict, List, Optional, Union
from lhotse import CutSet, Fbank, FbankConfig
from lhotse.dataset import (
CutConcatenate,
DynamicBucketingSampler,
K2SpeechRecognitionDataset,
SimpleCutSampler,
SpecAugment,
)
from lhotse.dataset.input_strategies import OnTheFlyFeatures
from torch.utils.data import DataLoader
from icefall.utils import str2bool
class MLSEnglishHFAsrDataModule:
"""
DataModule for MLS English ASR experiments using HuggingFace dataset.
Handles dataset loading and provides train/valid/test dataloaders with
on-the-fly feature extraction.
"""
def __init__(self, args: argparse.Namespace):
self.args = args
self.dataset = None
@classmethod
def add_arguments(cls, parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
group = parser.add_argument_group(
title="ASR data related options",
description="Options for data loading and processing",
)
# Dataset configuration
group.add_argument(
"--dataset-path",
type=str,
default="parler-tts/mls_eng",
help="Path to HuggingFace MLS English dataset (name or local path)",
)
# Sampling and batching
group.add_argument(
"--max-duration",
type=float,
default=200.0,
help="Maximum batch duration in seconds",
)
group.add_argument(
"--bucketing-sampler",
type=str2bool,
default=True,
help="Whether to use bucketing sampler",
)
group.add_argument(
"--num-buckets",
type=int,
default=30,
help="Number of buckets for DynamicBucketingSampler",
)
# Data augmentation
group.add_argument(
"--enable-spec-aug",
type=str2bool,
default=True,
help="Whether to enable SpecAugment",
)
group.add_argument(
"--spec-aug-time-warp-factor",
type=int,
default=80,
help="Time warp factor for SpecAugment",
)
# Dataloader configuration
group.add_argument(
"--num-workers",
type=int,
default=2,
help="Number of workers for data loading",
)
group.add_argument(
"--return-cuts",
type=str2bool,
default=False,
help="Whether to return cuts in batch",
)
group.add_argument(
"--drop-last",
type=str2bool,
default=True,
help="Whether to drop last incomplete batch",
)
return parser
def load_dataset(self, dataset_path: Optional[str] = None) -> None:
"""Load the HuggingFace dataset."""
dataset_path = dataset_path or self.args.dataset_path
logging.info(f"Loading MLS English dataset from: {dataset_path}")
try:
from datasets import load_dataset
self.dataset = load_dataset(dataset_path)
logging.info("Dataset loaded successfully")
except ImportError:
raise ImportError("Please install datasets package: pip install datasets")
except Exception as e:
raise RuntimeError(f"Failed to load dataset: {e}")
def _create_dataset(
self, cuts: CutSet, is_train: bool = False
) -> K2SpeechRecognitionDataset:
"""Create appropriate dataset with transforms."""
transforms = []
input_transforms = []
if is_train and self.args.enable_spec_aug:
input_transforms.append(self._create_spec_augment())
return K2SpeechRecognitionDataset(
cut_transforms=transforms,
input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80))),
input_transforms=input_transforms,
return_cuts=self.args.return_cuts,
)
def _create_spec_augment(self) -> SpecAugment:
"""Create SpecAugment transform based on config."""
num_frame_masks = 10
num_frame_masks_parameter = inspect.signature(SpecAugment.__init__).parameters[
"num_frame_masks"
]
if num_frame_masks_parameter.default == 1:
num_frame_masks = 2
return SpecAugment(
time_warp_factor=self.args.spec_aug_time_warp_factor,
num_frame_masks=num_frame_masks,
features_mask_size=27,
num_feature_masks=2,
frames_mask_size=100,
)
def _create_sampler(
self, cuts: CutSet, shuffle: bool
) -> Union[DynamicBucketingSampler, SimpleCutSampler]:
"""Create appropriate sampler based on config."""
if self.args.bucketing_sampler:
return DynamicBucketingSampler(
cuts,
max_duration=self.args.max_duration,
shuffle=shuffle,
num_buckets=self.args.num_buckets,
drop_last=self.args.drop_last,
)
return SimpleCutSampler(
cuts,
max_duration=self.args.max_duration,
shuffle=shuffle,
)
def train_dataloader(
self, sampler_state_dict: Optional[Dict[str, Any]] = None
) -> DataLoader:
"""Create train dataloader."""
cuts = self.train_cuts()
dataset = self._create_dataset(cuts, is_train=True)
sampler = self._create_sampler(cuts, shuffle=True)
if sampler_state_dict:
sampler.load_state_dict(sampler_state_dict)
return DataLoader(
dataset,
sampler=sampler,
batch_size=None,
num_workers=self.args.num_workers,
persistent_workers=False,
)
def valid_dataloader(self) -> DataLoader:
"""Create validation dataloader."""
cuts = self.valid_cuts()
return DataLoader(
self._create_dataset(cuts),
sampler=self._create_sampler(cuts, shuffle=False),
batch_size=None,
num_workers=self.args.num_workers,
persistent_workers=False,
)
def test_dataloader(self) -> DataLoader:
"""Create test dataloader."""
cuts = self.test_cuts()
return DataLoader(
self._create_dataset(cuts),
sampler=self._create_sampler(cuts, shuffle=False),
batch_size=None,
num_workers=self.args.num_workers,
)
@lru_cache()
def train_cuts(self) -> CutSet:
return CutSet.from_huggingface_dataset(
self.dataset["train"], text_key="transcript"
)
@lru_cache()
def valid_cuts(self) -> CutSet:
return CutSet.from_huggingface_dataset(
self.dataset["dev"], text_key="transcript"
)
@lru_cache()
def test_cuts(self) -> CutSet:
return CutSet.from_huggingface_dataset(
self.dataset["test"], text_key="transcript"
)

View File

@ -0,0 +1,136 @@
#!/usr/bin/env python3
# Copyright 2023 The University of Electro-Communications (Author: Teo Wen Shen) # noqa
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import logging
import os
from pathlib import Path
from typing import List, Tuple
import torch
# fmt: off
from lhotse import ( # See the following for why LilcomChunkyWriter is preferred; https://github.com/k2-fsa/icefall/pull/404; https://github.com/lhotse-speech/lhotse/pull/527
CutSet,
Fbank,
FbankConfig,
LilcomChunkyWriter,
RecordingSet,
SupervisionSet,
)
from lhotse.utils import is_module_available
# fmt: on
# Torch's multithreaded behavior needs to be disabled or
# it wastes a lot of CPU and slow things down.
# Do this outside of main() in case it needs to take effect
# even when we are not invoking the main (e.g. when spawning subprocesses).
torch.set_num_threads(1)
torch.set_num_interop_threads(1)
RNG_SEED = 42
concat_params = {"gap": 1.0, "maxlen": 10.0}
def make_cutset_blueprints(
mls_eng_hf_dataset_path: str = "parler-tts/mls_eng",
) -> List[Tuple[str, CutSet]]:
cut_sets = []
if not is_module_available("datasets"):
raise ImportError(
"To process the MLS English HF corpus, please install optional dependency: pip install datasets"
)
from datasets import load_dataset
dataset = load_dataset(mls_eng_hf_dataset_path)
# Create test dataset
logging.info("Creating test cuts.")
cut_sets.append(
(
"test",
CutSet.from_huggingface_dataset(dataset["test"], text_key="transcript"),
)
)
# Create dev dataset
logging.info("Creating dev cuts.")
cut_sets.append(
("dev", CutSet.from_huggingface_dataset(dataset["dev"], text_key="transcript"))
)
# Create train dataset
logging.info("Creating train cuts.")
cut_sets.append(
(
"train",
CutSet.from_huggingface_dataset(dataset["train"], text_key="transcript"),
)
)
return cut_sets
def get_args():
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
parser.add_argument("-m", "--manifest-dir", type=Path)
parser.add_argument("-a", "--audio-dir", type=Path)
return parser.parse_args()
def main():
args = get_args()
extractor = Fbank(FbankConfig(num_mel_bins=80))
num_jobs = min(16, os.cpu_count())
formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
logging.basicConfig(format=formatter, level=logging.INFO)
if (args.manifest_dir / ".mls-eng-fbank.done").exists():
logging.info(
"Previous fbank computed for MLS English found. "
f"Delete {args.manifest_dir / '.mls-eng-fbank.done'} to allow recomputing fbank."
)
return
else:
cut_sets = make_cutset_blueprints(args.dataset_path)
for part, cut_set in cut_sets:
logging.info(f"Processing {part}")
cut_set = cut_set.compute_and_store_features(
extractor=extractor,
num_jobs=num_jobs,
storage_path=(args.manifest_dir / f"feats_{part}").as_posix(),
storage_type=LilcomChunkyWriter,
)
# Optionally export raw audio: cut_set.save_audios(args.audio_dir)
cut_set.to_file(args.manifest_dir / f"mls_eng_cuts_{part}.jsonl.gz")
logging.info("All fbank computed for MLS English.")
(args.manifest_dir / ".mls-eng-fbank.done").touch()
if __name__ == "__main__":
main()
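Once the features and cut manifests are written, they can be read back for inspection with lhotse; a sketch (the manifest path follows the to_file call above, with a hypothetical --manifest-dir of data/manifests):

from lhotse import CutSet

cuts = CutSet.from_file("data/manifests/mls_eng_cuts_dev.jsonl.gz")
cut = next(iter(cuts))
print(cut.duration, cut.load_features().shape)  # load_features returns a (num_frames, 80) array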

View File

@ -0,0 +1,91 @@
#!/usr/bin/env python3
# Copyright 2022 The University of Electro-Communications (Author: Teo Wen Shen) # noqa
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import logging
from pathlib import Path
from lhotse import CutSet
from tqdm import tqdm
def get_args():
parser = argparse.ArgumentParser(
description="Generate transcripts for BPE training from MLS English dataset",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
parser.add_argument(
"--dataset-path",
type=str,
default="parler-tts/mls_eng",
help="Path to HuggingFace MLS English dataset (name or local path)",
)
parser.add_argument(
"--lang-dir",
type=Path,
default=Path("data/lang"),
help="Directory to store output transcripts",
)
parser.add_argument(
"--split",
type=str,
default="train",
help="Dataset split to use for generating transcripts (train/dev/test)",
)
return parser.parse_args()
def generate_transcript_from_cuts(cuts: CutSet, output_file: Path) -> None:
"""Generate transcript text file from Lhotse CutSet."""
with open(output_file, "w") as f:
for cut in tqdm(cuts, desc="Processing cuts"):
for sup in cut.supervisions:
f.write(f"{sup.text}\n")
def main():
args = get_args()
logging.basicConfig(
format="%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s",
level=logging.INFO,
)
args.lang_dir.mkdir(parents=True, exist_ok=True)
output_file = args.lang_dir / "transcript.txt"
logging.info(f"Loading {args.split} split from dataset: {args.dataset_path}")
try:
from datasets import load_dataset

dataset = load_dataset(args.dataset_path, split=args.split)
cuts = CutSet.from_huggingface_dataset(dataset, text_key="transcript")
except Exception as e:
logging.error(f"Failed to load dataset: {e}")
raise
logging.info(f"Generating transcript to {output_file}")
generate_transcript_from_cuts(cuts, output_file)
logging.info("Transcript generation completed")
if __name__ == "__main__":
main()
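The output is a plain text file with one transcript per line, which ./local/train_bpe_model.py consumes directly via --transcript. A quick check of the generated file (a sketch, assuming the default output path):

# Print the first few transcript lines.
with open("data/lang/transcript.txt") as f:
    for _ in range(3):
        print(f.readline().rstrip())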

View File

@ -0,0 +1,72 @@
#!/usr/bin/env bash
# Prepare script for MLS English ASR recipe in icefall
# This recipe uses on-the-fly feature extraction, so it skips manifest
# and feature generation steps used in other recipes.
# fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
set -eou pipefail
nj=15
stage=-1
stop_stage=100
# Configuration for BPE tokenizer
vocab_sizes=(2000) # You can add more sizes like (500 1000 2000) for comparison
# Directory where dataset will be downloaded
dl_dir=$PWD/download
. shared/parse_options.sh || exit 1
# All files generated by this script are saved in "data".
mkdir -p data
log() {
local fname=${BASH_SOURCE[1]##*/}
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}
log "Starting MLS English data preparation"
if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
log "Stage 0: Download MLS English dataset"
if [ ! -d $dl_dir/mls_english ]; then
if ! git clone https://huggingface.co/datasets/parler-tts/mls_eng $dl_dir/mls_english; then
log "Failed to download MLS English dataset"
exit 1
fi
fi
fi
mkdir -p data/lang
lang_dir=data/lang
if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
log "Stage 1: Prepare transcript for BPE training"
if [ ! -f $lang_dir/transcript.txt ]; then
log "Generating transcripts for BPE training"
./local/utils/generate_transcript.py --lang-dir $lang_dir
fi
fi
if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
log "Stage 2: Prepare BPE tokenizer"
for vocab_size in "${vocab_sizes[@]}"; do
log "Training BPE model with vocab_size=${vocab_size}"
bpe_dir=data/lang/bpe_${vocab_size}
mkdir -p $bpe_dir
if [ ! -f $bpe_dir/bpe.model ]; then
./local/train_bpe_model.py \
--lang-dir $bpe_dir \
--vocab-size $vocab_size \
--transcript $lang_dir/transcript.txt
fi
done
fi
log "MLS English data preparation completed successfully"

View File

@ -0,0 +1 @@
../local/utils/asr_datamodule.py

View File

@ -0,0 +1 @@
../../../librispeech/ASR/zipformer/beam_search.py

View File

@ -0,0 +1 @@
../../../librispeech/ASR/zipformer/ctc_decode.py

File diff suppressed because it is too large.

View File

@ -0,0 +1 @@
../../../librispeech/ASR/zipformer/decode_stream.py

View File

@ -0,0 +1 @@
../../../librispeech/ASR/zipformer/decoder.py

File diff suppressed because it is too large.

View File

@ -0,0 +1 @@
../../../librispeech/ASR/zipformer/encoder_interface.py

View File

@ -0,0 +1 @@
../../../librispeech/ASR/zipformer/export-onnx.py

View File

@ -0,0 +1 @@
../../../librispeech/ASR/zipformer/export.py

View File

@ -0,0 +1 @@
../../../librispeech/ASR/zipformer/generate_averaged_model.py

View File

@ -0,0 +1 @@
../../../librispeech/ASR/zipformer/joiner.py

View File

@ -0,0 +1 @@
../../../librispeech/ASR/zipformer/model.py

View File

@ -0,0 +1 @@
../../../librispeech/ASR/zipformer/my_profile.py

View File

@ -0,0 +1 @@
../../../librispeech/ASR/zipformer/onnx_pretrained.py

View File

@ -0,0 +1 @@
../../../librispeech/ASR/zipformer/optim.py

View File

@ -0,0 +1 @@
../../../librispeech/ASR/zipformer/pretrained.py

View File

@ -0,0 +1 @@
../../../librispeech/ASR/zipformer/scaling.py

View File

@ -0,0 +1 @@
../../../librispeech/ASR/zipformer/scaling_converter.py

View File

@ -0,0 +1 @@
../../../librispeech/ASR/zipformer/streaming_beam_search.py

View File

@ -0,0 +1,900 @@
#!/usr/bin/env python3
# Copyright 2022-2023 Xiaomi Corporation (Authors: Wei Kang,
# Fangjun Kuang,
# Zengwei Yao)
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Usage:
./zipformer/streaming_decode.py \
--epoch 28 \
--avg 15 \
--causal 1 \
--chunk-size 32 \
--left-context-frames 256 \
--exp-dir ./zipformer/exp-large \
--lang data/lang_char \
--num-encoder-layers 2,2,4,5,4,2 \
--feedforward-dim 512,768,1536,2048,1536,768 \
--encoder-dim 192,256,512,768,512,256 \
--encoder-unmasked-dim 192,192,256,320,256,192
"""
import argparse
import logging
import math
import os
from pathlib import Path
from typing import Dict, List, Optional, Tuple
import k2
import numpy as np
import torch
from asr_datamodule import MLSEnglishHFAsrDataModule
from decode_stream import DecodeStream
from kaldifeat import Fbank, FbankOptions
from lhotse import CutSet
from streaming_beam_search import (
fast_beam_search_one_best,
greedy_search,
modified_beam_search,
)
from tokenizer import Tokenizer
from torch import Tensor, nn
from torch.nn.utils.rnn import pad_sequence
from train import add_model_arguments, get_model, get_params
from icefall.checkpoint import (
average_checkpoints,
average_checkpoints_with_averaged_model,
find_checkpoints,
load_checkpoint,
)
from icefall.utils import (
AttributeDict,
make_pad_mask,
setup_logger,
store_transcripts,
str2bool,
write_error_stats,
)
LOG_EPS = math.log(1e-10)
def get_parser():
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument(
"--epoch",
type=int,
default=28,
help="""It specifies the checkpoint to use for decoding.
Note: Epoch counts from 1.
You can specify --avg to use more checkpoints for model averaging.""",
)
parser.add_argument(
"--iter",
type=int,
default=0,
help="""If positive, --epoch is ignored and it
will use the checkpoint exp_dir/checkpoint-iter.pt.
You can specify --avg to use more checkpoints for model averaging.
""",
)
parser.add_argument(
"--avg",
type=int,
default=15,
help="Number of checkpoints to average. Automatically select "
"consecutive checkpoints before the checkpoint specified by "
"'--epoch' and '--iter'",
)
parser.add_argument(
"--use-averaged-model",
type=str2bool,
default=True,
help="Whether to load averaged model. Currently it only supports "
"using --epoch. If True, it would decode with the averaged model "
"over the epoch range from `epoch-avg` (excluded) to `epoch`."
"Actually only the models with epoch number of `epoch-avg` and "
"`epoch` are loaded for averaging. ",
)
parser.add_argument(
"--exp-dir",
type=str,
default="zipformer/exp",
help="The experiment dir",
)
parser.add_argument(
"--bpe-model",
type=str,
default="data/lang_bpe_500/bpe.model",
help="Path to the BPE model",
)
parser.add_argument(
"--lang-dir",
type=Path,
default="data/lang_char",
help="The lang dir containing word table and LG graph",
)
parser.add_argument(
"--decoding-method",
type=str,
default="greedy_search",
help="""Supported decoding methods are:
greedy_search
modified_beam_search
fast_beam_search
""",
)
parser.add_argument(
"--num-active-paths",
type=int,
default=4,
help="""An integer indicating how many candidates we will keep for each
frame. Used only when --decoding-method is modified_beam_search.""",
)
parser.add_argument(
"--beam",
type=float,
default=4,
help="""A floating point value to calculate the cutoff score during beam
search (i.e., `cutoff = max-score - beam`), which is the same as the
`beam` in Kaldi.
Used only when --decoding-method is fast_beam_search""",
)
parser.add_argument(
"--max-contexts",
type=int,
default=4,
help="""Used only when --decoding-method is
fast_beam_search""",
)
parser.add_argument(
"--max-states",
type=int,
default=32,
help="""Used only when --decoding-method is
fast_beam_search""",
)
parser.add_argument(
"--context-size",
type=int,
default=2,
help="The context size in the decoder. 1 means bigram; 2 means tri-gram",
)
parser.add_argument(
"--num-decode-streams",
type=int,
default=2000,
help="The number of streams that can be decoded parallel.",
)
add_model_arguments(parser)
return parser
def get_init_states(
model: nn.Module,
batch_size: int = 1,
device: torch.device = torch.device("cpu"),
) -> List[torch.Tensor]:
"""
Returns a list of cached tensors of all encoder layers. For layer-i, states[i*6:(i+1)*6]
is (cached_key, cached_nonlin_attn, cached_val1, cached_val2, cached_conv1, cached_conv2).
states[-2] is the cached left padding for ConvNeXt module,
of shape (batch_size, num_channels, left_pad, num_freqs)
states[-1] is processed_lens of shape (batch,), which records the number
of processed frames (at 50hz frame rate, after encoder_embed) for each sample in batch.
"""
states = model.encoder.get_init_states(batch_size, device)
embed_states = model.encoder_embed.get_init_states(batch_size, device)
states.append(embed_states)
processed_lens = torch.zeros(batch_size, dtype=torch.int32, device=device)
states.append(processed_lens)
return states
def stack_states(state_list: List[List[torch.Tensor]]) -> List[torch.Tensor]:
"""Stack list of zipformer states that correspond to separate utterances
into a single emformer state, so that it can be used as an input for
zipformer when those utterances are formed into a batch.
Args:
state_list:
Each element in state_list corresponding to the internal state
of the zipformer model for a single utterance. For element-n,
state_list[n] is a list of cached tensors of all encoder layers. For layer-i,
state_list[n][i*6:(i+1)*6] is (cached_key, cached_nonlin_attn, cached_val1,
cached_val2, cached_conv1, cached_conv2).
state_list[n][-2] is the cached left padding for ConvNeXt module,
of shape (batch_size, num_channels, left_pad, num_freqs)
state_list[n][-1] is processed_lens of shape (batch,), which records the number
of processed frames (at 50hz frame rate, after encoder_embed) for each sample in batch.
Note:
It is the inverse of :func:`unstack_states`.
"""
batch_size = len(state_list)
assert (len(state_list[0]) - 2) % 6 == 0, len(state_list[0])
tot_num_layers = (len(state_list[0]) - 2) // 6
batch_states = []
for layer in range(tot_num_layers):
layer_offset = layer * 6
# cached_key: (left_context_len, batch_size, key_dim)
cached_key = torch.cat(
[state_list[i][layer_offset] for i in range(batch_size)], dim=1
)
# cached_nonlin_attn: (num_heads, batch_size, left_context_len, head_dim)
cached_nonlin_attn = torch.cat(
[state_list[i][layer_offset + 1] for i in range(batch_size)], dim=1
)
# cached_val1: (left_context_len, batch_size, value_dim)
cached_val1 = torch.cat(
[state_list[i][layer_offset + 2] for i in range(batch_size)], dim=1
)
# cached_val2: (left_context_len, batch_size, value_dim)
cached_val2 = torch.cat(
[state_list[i][layer_offset + 3] for i in range(batch_size)], dim=1
)
# cached_conv1: (#batch, channels, left_pad)
cached_conv1 = torch.cat(
[state_list[i][layer_offset + 4] for i in range(batch_size)], dim=0
)
# cached_conv2: (#batch, channels, left_pad)
cached_conv2 = torch.cat(
[state_list[i][layer_offset + 5] for i in range(batch_size)], dim=0
)
batch_states += [
cached_key,
cached_nonlin_attn,
cached_val1,
cached_val2,
cached_conv1,
cached_conv2,
]
cached_embed_left_pad = torch.cat(
[state_list[i][-2] for i in range(batch_size)], dim=0
)
batch_states.append(cached_embed_left_pad)
processed_lens = torch.cat([state_list[i][-1] for i in range(batch_size)], dim=0)
batch_states.append(processed_lens)
return batch_states
def unstack_states(batch_states: List[Tensor]) -> List[List[Tensor]]:
"""Unstack the zipformer state corresponding to a batch of utterances
into a list of states, where the i-th entry is the state from the i-th
utterance in the batch.
Note:
It is the inverse of :func:`stack_states`.
Args:
batch_states: A list of cached tensors of all encoder layers. For layer-i,
batch_states[i*6:(i+1)*6] is (cached_key, cached_nonlin_attn, cached_val1, cached_val2,
cached_conv1, cached_conv2).
batch_states[-2] is the cached left padding for the ConvNeXt module,
of shape (batch_size, num_channels, left_pad, num_freqs).
batch_states[-1] is processed_lens of shape (batch,), which records the number
of processed frames (at 50hz frame rate, after encoder_embed) for each sample in batch.
Returns:
state_list: A list of lists. Each element in state_list corresponds to the internal
state of the zipformer model for a single utterance.
"""
assert (len(batch_states) - 2) % 6 == 0, len(batch_states)
tot_num_layers = (len(batch_states) - 2) // 6
processed_lens = batch_states[-1]
batch_size = processed_lens.shape[0]
state_list = [[] for _ in range(batch_size)]
for layer in range(tot_num_layers):
layer_offset = layer * 6
# cached_key: (left_context_len, batch_size, key_dim)
cached_key_list = batch_states[layer_offset].chunk(chunks=batch_size, dim=1)
# cached_nonlin_attn: (num_heads, batch_size, left_context_len, head_dim)
cached_nonlin_attn_list = batch_states[layer_offset + 1].chunk(
chunks=batch_size, dim=1
)
# cached_val1: (left_context_len, batch_size, value_dim)
cached_val1_list = batch_states[layer_offset + 2].chunk(
chunks=batch_size, dim=1
)
# cached_val2: (left_context_len, batch_size, value_dim)
cached_val2_list = batch_states[layer_offset + 3].chunk(
chunks=batch_size, dim=1
)
# cached_conv1: (#batch, channels, left_pad)
cached_conv1_list = batch_states[layer_offset + 4].chunk(
chunks=batch_size, dim=0
)
# cached_conv2: (#batch, channels, left_pad)
cached_conv2_list = batch_states[layer_offset + 5].chunk(
chunks=batch_size, dim=0
)
for i in range(batch_size):
state_list[i] += [
cached_key_list[i],
cached_nonlin_attn_list[i],
cached_val1_list[i],
cached_val2_list[i],
cached_conv1_list[i],
cached_conv2_list[i],
]
cached_embed_left_pad_list = batch_states[-2].chunk(chunks=batch_size, dim=0)
for i in range(batch_size):
state_list[i].append(cached_embed_left_pad_list[i])
processed_lens_list = batch_states[-1].chunk(chunks=batch_size, dim=0)
for i in range(batch_size):
state_list[i].append(processed_lens_list[i])
return state_list
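# Sanity-check sketch: stack_states and unstack_states should round-trip.
# The toy shapes below are illustrative placeholders only, not the real
# model dimensions.
def _check_stack_unstack_roundtrip():
    def toy_state():
        return [
            torch.zeros(64, 1, 192),  # cached_key: (left_context_len, batch, key_dim)
            torch.zeros(4, 1, 64, 48),  # cached_nonlin_attn: (num_heads, batch, left_context_len, head_dim)
            torch.zeros(64, 1, 96),  # cached_val1: (left_context_len, batch, value_dim)
            torch.zeros(64, 1, 96),  # cached_val2: (left_context_len, batch, value_dim)
            torch.zeros(1, 256, 15),  # cached_conv1: (batch, channels, left_pad)
            torch.zeros(1, 256, 15),  # cached_conv2: (batch, channels, left_pad)
            torch.zeros(1, 128, 3, 19),  # cached left padding of encoder_embed
            torch.zeros(1, dtype=torch.int32),  # processed_lens: (batch,)
        ]

    batch_states = stack_states([toy_state(), toy_state()])
    state_list = unstack_states(batch_states)
    assert len(state_list) == 2
    assert all(len(states) == 8 for states in state_list)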
def streaming_forward(
features: Tensor,
feature_lens: Tensor,
model: nn.Module,
states: List[Tensor],
chunk_size: int,
left_context_len: int,
) -> Tuple[Tensor, Tensor, List[Tensor]]:
"""
Returns encoder outputs, output lengths, and updated states.
"""
cached_embed_left_pad = states[-2]
(x, x_lens, new_cached_embed_left_pad,) = model.encoder_embed.streaming_forward(
x=features,
x_lens=feature_lens,
cached_left_pad=cached_embed_left_pad,
)
assert x.size(1) == chunk_size, (x.size(1), chunk_size)
src_key_padding_mask = make_pad_mask(x_lens)
# processed_mask is used to mask out initial states
processed_mask = torch.arange(left_context_len, device=x.device).expand(
x.size(0), left_context_len
)
processed_lens = states[-1] # (batch,)
# (batch, left_context_size)
processed_mask = (processed_lens.unsqueeze(1) <= processed_mask).flip(1)
# Update processed lengths
new_processed_lens = processed_lens + x_lens
# (batch, left_context_size + chunk_size)
src_key_padding_mask = torch.cat([processed_mask, src_key_padding_mask], dim=1)
x = x.permute(1, 0, 2) # (N, T, C) -> (T, N, C)
encoder_states = states[:-2]
(
encoder_out,
encoder_out_lens,
new_encoder_states,
) = model.encoder.streaming_forward(
x=x,
x_lens=x_lens,
states=encoder_states,
src_key_padding_mask=src_key_padding_mask,
)
encoder_out = encoder_out.permute(1, 0, 2) # (T, N, C) ->(N, T, C)
new_states = new_encoder_states + [
new_cached_embed_left_pad,
new_processed_lens,
]
return encoder_out, encoder_out_lens, new_states
def decode_one_chunk(
params: AttributeDict,
model: nn.Module,
decode_streams: List[DecodeStream],
) -> List[int]:
"""Decode one chunk frames of features for each decode_streams and
return the indexes of finished streams in a List.
Args:
params:
It's the return value of :func:`get_params`.
model:
The neural model.
decode_streams:
A List of DecodeStream, each belonging to an utterance.
Returns:
Return a List containing which DecodeStreams are finished.
"""
chunk_size = int(params.chunk_size)
left_context_len = int(params.left_context_frames)
features = []
feature_lens = []
states = []
processed_lens = [] # Used in fast-beam-search
for stream in decode_streams:
feat, feat_len = stream.get_feature_frames(chunk_size * 2)
features.append(feat)
feature_lens.append(feat_len)
states.append(stream.states)
processed_lens.append(stream.done_frames)
feature_lens = torch.tensor(feature_lens, device=model.device)
features = pad_sequence(features, batch_first=True, padding_value=LOG_EPS)
# Make sure the length after encoder_embed is at least 1.
# The encoder_embed module subsamples features: T -> (T - 7) // 2.
# The ConvNeXt module needs (7 - 1) // 2 = 3 frames of right padding after
# subsampling, so e.g. with chunk_size=32 the tail must cover
# 32 * 2 + 7 + 2 * 3 = 77 frames.
tail_length = chunk_size * 2 + 7 + 2 * 3
if features.size(1) < tail_length:
pad_length = tail_length - features.size(1)
feature_lens += pad_length
features = torch.nn.functional.pad(
features,
(0, 0, 0, pad_length),
mode="constant",
value=LOG_EPS,
)
states = stack_states(states)
encoder_out, encoder_out_lens, new_states = streaming_forward(
features=features,
feature_lens=feature_lens,
model=model,
states=states,
chunk_size=chunk_size,
left_context_len=left_context_len,
)
encoder_out = model.joiner.encoder_proj(encoder_out)
if params.decoding_method == "greedy_search":
greedy_search(model=model, encoder_out=encoder_out, streams=decode_streams)
elif params.decoding_method == "fast_beam_search":
processed_lens = torch.tensor(processed_lens, device=model.device)
processed_lens = processed_lens + encoder_out_lens
fast_beam_search_one_best(
model=model,
encoder_out=encoder_out,
processed_lens=processed_lens,
streams=decode_streams,
beam=params.beam,
max_states=params.max_states,
max_contexts=params.max_contexts,
)
elif params.decoding_method == "modified_beam_search":
modified_beam_search(
model=model,
streams=decode_streams,
encoder_out=encoder_out,
num_active_paths=params.num_active_paths,
)
else:
raise ValueError(f"Unsupported decoding method: {params.decoding_method}")
states = unstack_states(new_states)
finished_streams = []
for i in range(len(decode_streams)):
decode_streams[i].states = states[i]
decode_streams[i].done_frames += encoder_out_lens[i]
if decode_streams[i].done:
finished_streams.append(i)
return finished_streams
def decode_dataset(
cuts: CutSet,
params: AttributeDict,
model: nn.Module,
tokenizer: Tokenizer,
decoding_graph: Optional[k2.Fsa] = None,
) -> Dict[str, List[Tuple[List[str], List[str]]]]:
"""Decode dataset.
Args:
cuts:
Lhotse Cutset containing the dataset to decode.
params:
It is returned by :func:`get_params`.
model:
The neural model.
tokenizer:
The BPE model.
decoding_graph:
The decoding graph. Can be either a `k2.trivial_graph` or an HLG. Used
only when --decoding-method is fast_beam_search.
Returns:
Return a dict, whose key may be "greedy_search" if greedy search
is used, or it may be "beam_7" if beam size of 7 is used.
Its value is a list of tuples. Each tuple contains two elements:
The first is the reference transcript, and the second is the
predicted result.
"""
device = model.device
opts = FbankOptions()
opts.device = device
opts.frame_opts.dither = 0
opts.frame_opts.snip_edges = False
opts.frame_opts.samp_freq = 16000
opts.mel_opts.num_bins = 80
log_interval = 100
decode_results = []
# Contain decode streams currently running.
decode_streams = []
for num, cut in enumerate(cuts):
# each utterance has a DecodeStream.
initial_states = get_init_states(model=model, batch_size=1, device=device)
decode_stream = DecodeStream(
params=params,
cut_id=cut.id,
initial_states=initial_states,
decoding_graph=decoding_graph,
device=device,
)
audio: np.ndarray = cut.load_audio()
# audio.shape: (1, num_samples)
assert len(audio.shape) == 2
assert audio.shape[0] == 1, "Should be single channel"
assert audio.dtype == np.float32, audio.dtype
# The trained model is using normalized samples
# - this is to avoid sending [-32k,+32k] signal in...
# - some lhotse AudioTransform classes can make the signal
# be out of range [-1, 1], hence the tolerance 10
assert (
np.abs(audio).max() <= 10
), "Should be normalized to [-1, 1], 10 for tolerance..."
samples = torch.from_numpy(audio).squeeze(0)
fbank = Fbank(opts)
feature = fbank(samples.to(device))
decode_stream.set_features(feature, tail_pad_len=30)
decode_stream.ground_truth = cut.supervisions[0].text
decode_streams.append(decode_stream)
while len(decode_streams) >= params.num_decode_streams:
finished_streams = decode_one_chunk(
params=params, model=model, decode_streams=decode_streams
)
for i in sorted(finished_streams, reverse=True):
decode_results.append(
(
decode_streams[i].id,
decode_streams[i].ground_truth.split(),
tokenizer.decode(decode_streams[i].decoding_result()).split(),
)
)
del decode_streams[i]
if num % log_interval == 0:
logging.info(f"Cuts processed until now is {num}.")
# decode final chunks of last sequences
while len(decode_streams):
finished_streams = decode_one_chunk(
params=params, model=model, decode_streams=decode_streams
)
for i in sorted(finished_streams, reverse=True):
decode_results.append(
(
decode_streams[i].id,
decode_streams[i].ground_truth.split(),
tokenizer.decode(decode_streams[i].decoding_result()).split(),
)
)
del decode_streams[i]
if params.decoding_method == "greedy_search":
key = "greedy_search"
elif params.decoding_method == "fast_beam_search":
key = (
f"beam_{params.beam}_"
f"max_contexts_{params.max_contexts}_"
f"max_states_{params.max_states}"
)
elif params.decoding_method == "modified_beam_search":
key = f"num_active_paths_{params.num_active_paths}"
else:
raise ValueError(f"Unsupported decoding method: {params.decoding_method}")
if torch.cuda.is_available():
torch.cuda.synchronize()
return {key: decode_results}
def save_results(
params: AttributeDict,
test_set_name: str,
results_dict: Dict[str, List[Tuple[List[str], List[str]]]],
):
test_set_wers = dict()
for key, results in results_dict.items():
recog_path = (
params.res_dir / f"recogs-{test_set_name}-{key}-{params.suffix}.txt"
)
results = sorted(results)
store_transcripts(filename=recog_path, texts=results)
logging.info(f"The transcripts are stored in {recog_path}")
# The following prints out WERs, per-word error statistics and aligned
# ref/hyp pairs.
errs_filename = (
params.res_dir / f"errs-{test_set_name}-{key}-{params.suffix}.txt"
)
with open(errs_filename, "w") as f:
wer = write_error_stats(
f, f"{test_set_name}-{key}", results, enable_log=True
)
test_set_wers[key] = wer
logging.info("Wrote detailed error stats to {}".format(errs_filename))
test_set_wers = sorted(test_set_wers.items(), key=lambda x: x[1])
errs_info = (
params.res_dir / f"wer-summary-{test_set_name}-{key}-{params.suffix}.txt"
)
with open(errs_info, "w") as f:
print("settings\tWER", file=f)
for key, val in test_set_wers:
print("{}\t{}".format(key, val), file=f)
s = "\nFor {}, WER of different settings are:\n".format(test_set_name)
note = "\tbest for {}".format(test_set_name)
for key, val in test_set_wers:
s += "{}\t{}{}\n".format(key, val, note)
note = ""
logging.info(s)
@torch.no_grad()
def main():
parser = get_parser()
MLSEnglishHFAsrDataModule.add_arguments(parser)
Tokenizer.add_arguments(parser)
args = parser.parse_args()
args.exp_dir = Path(args.exp_dir)
params = get_params()
params.update(vars(args))
params.res_dir = params.exp_dir / "streaming" / params.decoding_method
if params.iter > 0:
params.suffix = f"iter-{params.iter}-avg-{params.avg}"
else:
params.suffix = f"epoch-{params.epoch}-avg-{params.avg}"
assert params.causal, params.causal
assert "," not in params.chunk_size, "chunk_size should be one value in decoding."
assert (
"," not in params.left_context_frames
), "left_context_frames should be one value in decoding."
params.suffix += f"-chunk-{params.chunk_size}"
params.suffix += f"-left-context-{params.left_context_frames}"
# for fast_beam_search
if params.decoding_method == "fast_beam_search":
params.suffix += f"-beam-{params.beam}"
params.suffix += f"-max-contexts-{params.max_contexts}"
params.suffix += f"-max-states-{params.max_states}"
if params.use_averaged_model:
params.suffix += "-use-averaged-model"
setup_logger(f"{params.res_dir}/log-decode-{params.suffix}")
logging.info("Decoding started")
device = torch.device("cpu")
if torch.cuda.is_available():
device = torch.device("cuda", 0)
logging.info(f"Device: {device}")
sp_token = Tokenizer.load(params.lang, params.lang_type)
# <blk> and <unk> are defined in local/train_bpe_model.py
params.blank_id = sp_token.piece_to_id("<blk>")
params.unk_id = sp_token.piece_to_id("<unk>")
params.vocab_size = sp_token.get_piece_size()
logging.info(params)
logging.info("About to create model")
model = get_model(params)
if not params.use_averaged_model:
if params.iter > 0:
filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
: params.avg
]
if len(filenames) == 0:
raise ValueError(
f"No checkpoints found for"
f" --iter {params.iter}, --avg {params.avg}"
)
elif len(filenames) < params.avg:
raise ValueError(
f"Not enough checkpoints ({len(filenames)}) found for"
f" --iter {params.iter}, --avg {params.avg}"
)
logging.info(f"averaging {filenames}")
model.to(device)
model.load_state_dict(average_checkpoints(filenames, device=device))
elif params.avg == 1:
load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
else:
start = params.epoch - params.avg + 1
filenames = []
for i in range(start, params.epoch + 1):
if start >= 0:
filenames.append(f"{params.exp_dir}/epoch-{i}.pt")
logging.info(f"averaging {filenames}")
model.to(device)
model.load_state_dict(average_checkpoints(filenames, device=device))
else:
if params.iter > 0:
filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
: params.avg + 1
]
if len(filenames) == 0:
raise ValueError(
f"No checkpoints found for"
f" --iter {params.iter}, --avg {params.avg}"
)
elif len(filenames) < params.avg + 1:
raise ValueError(
f"Not enough checkpoints ({len(filenames)}) found for"
f" --iter {params.iter}, --avg {params.avg}"
)
filename_start = filenames[-1]
filename_end = filenames[0]
logging.info(
"Calculating the averaged model over iteration checkpoints"
f" from {filename_start} (excluded) to {filename_end}"
)
model.to(device)
model.load_state_dict(
average_checkpoints_with_averaged_model(
filename_start=filename_start,
filename_end=filename_end,
device=device,
)
)
else:
assert params.avg > 0, params.avg
start = params.epoch - params.avg
assert start >= 1, start
filename_start = f"{params.exp_dir}/epoch-{start}.pt"
filename_end = f"{params.exp_dir}/epoch-{params.epoch}.pt"
logging.info(
f"Calculating the averaged model over epoch range from "
f"{start} (excluded) to {params.epoch}"
)
model.to(device)
model.load_state_dict(
average_checkpoints_with_averaged_model(
filename_start=filename_start,
filename_end=filename_end,
device=device,
)
)
model.to(device)
model.eval()
model.device = device
decoding_graph = None
if params.decoding_method == "fast_beam_search":
decoding_graph = k2.trivial_graph(params.vocab_size - 1, device=device)
num_param = sum([p.numel() for p in model.parameters()])
logging.info(f"Number of model parameters: {num_param}")
# we need cut ids to display recognition results.
args.return_cuts = True
mls_english_corpus = MLSEnglishHFAsrDataModule(args)
mls_english_corpus.load_dataset()  # must be called before requesting cuts
valid_cuts = mls_english_corpus.valid_cuts()
test_cuts = mls_english_corpus.test_cuts()
test_sets = ["valid", "test"]
test_cuts = [valid_cuts, test_cuts]
for test_set, test_cut in zip(test_sets, test_cuts):
results_dict = decode_dataset(
cuts=test_cut,
params=params,
model=model,
tokenizer=sp_token,
decoding_graph=decoding_graph,
)
save_results(
params=params,
test_set_name=test_set,
results_dict=results_dict,
)
logging.info("Done!")
if __name__ == "__main__":
main()

View File

@ -0,0 +1 @@
../../../librispeech/ASR/zipformer/subsampling.py

View File

@ -0,0 +1 @@
../../../librispeech/ASR/zipformer/test_scaling.py

View File

@ -0,0 +1 @@
../../../librispeech/ASR/zipformer/test_subsampling.py

View File

@ -0,0 +1,252 @@
import argparse
from pathlib import Path
from typing import Callable, List, Union
import sentencepiece as spm
from k2 import SymbolTable
class Tokenizer:
text2word: Callable[[str], List[str]]
@staticmethod
def add_arguments(parser: argparse.ArgumentParser):
group = parser.add_argument_group(title="Lang related options")
group.add_argument("--lang", type=Path, help="Path to lang directory.")
group.add_argument(
"--lang-type",
type=str,
default=None,
help=(
"Either 'bpe' or 'char'. If not provided, it expects lang_dir/lang_type to exists. "
"Note: 'bpe' directly loads sentencepiece.SentencePieceProcessor"
),
)
@staticmethod
def Load(lang_dir: Path, lang_type="", oov="<unk>"):
if not lang_type:
assert (lang_dir / "lang_type").exists(), "lang_type not specified."
lang_type = (lang_dir / "lang_type").read_text().strip()
tokenizer = None
if lang_type == "bpe":
assert (
lang_dir / "bpe.model"
).exists(), f"No BPE .model could be found in {lang_dir}."
tokenizer = spm.SentencePieceProcessor()
tokenizer.Load(str(lang_dir / "bpe.model"))
elif lang_type == "char":
tokenizer = CharTokenizer(lang_dir, oov=oov)
else:
raise NotImplementedError(f"{lang_type} not supported at the moment.")
return tokenizer
load = Load
def PieceToId(self, piece: str) -> int:
raise NotImplementedError(
"You need to implement this function in the child class."
)
piece_to_id = PieceToId
def IdToPiece(self, id: int) -> str:
raise NotImplementedError(
"You need to implement this function in the child class."
)
id_to_piece = IdToPiece
def GetPieceSize(self) -> int:
raise NotImplementedError(
"You need to implement this function in the child class."
)
get_piece_size = GetPieceSize
def __len__(self) -> int:
return self.get_piece_size()
def EncodeAsIdsBatch(self, input: List[str]) -> List[List[int]]:
raise NotImplementedError(
"You need to implement this function in the child class."
)
def EncodeAsPiecesBatch(self, input: List[str]) -> List[List[str]]:
raise NotImplementedError(
"You need to implement this function in the child class."
)
def EncodeAsIds(self, input: str) -> List[int]:
return self.EncodeAsIdsBatch([input])[0]
def EncodeAsPieces(self, input: str) -> List[str]:
return self.EncodeAsPiecesBatch([input])[0]
def Encode(
self, input: Union[str, List[str]], out_type=int
) -> Union[List, List[List]]:
if not input:
return []
if isinstance(input, list):
if out_type is int:
return self.EncodeAsIdsBatch(input)
if out_type is str:
return self.EncodeAsPiecesBatch(input)
if out_type is int:
return self.EncodeAsIds(input)
if out_type is str:
return self.EncodeAsPieces(input)
encode = Encode
def DecodeIdsBatch(self, input: List[List[int]]) -> List[str]:
raise NotImplementedError(
"You need to implement this function in the child class."
)
def DecodePiecesBatch(self, input: List[List[str]]) -> List[str]:
raise NotImplementedError(
"You need to implement this function in the child class."
)
def DecodeIds(self, input: List[int]) -> str:
return self.DecodeIdsBatch([input])[0]
def DecodePieces(self, input: List[str]) -> str:
return self.DecodePiecesBatch([input])[0]
def Decode(
self,
input: Union[int, List[int], List[str], List[List[int]], List[List[str]]],
) -> Union[List[str], str]:
if not input:
return ""
if isinstance(input, int):
return self.id_to_piece(input)
elif isinstance(input, str):
raise TypeError(
"Unlike spm.SentencePieceProcessor, cannot decode from type str."
)
if isinstance(input[0], list):
if not input[0] or isinstance(input[0][0], int):
return self.DecodeIdsBatch(input)
if isinstance(input[0][0], str):
return self.DecodePiecesBatch(input)
if isinstance(input[0], int):
return self.DecodeIds(input)
if isinstance(input[0], str):
return self.DecodePieces(input)
raise RuntimeError("Unknown input type")
decode = Decode
def SplitBatch(self, input: List[str]) -> List[List[str]]:
raise NotImplementedError(
"You need to implement this function in the child class."
)
def Split(self, input: Union[List[str], str]) -> Union[List[List[str]], List[str]]:
if isinstance(input, list):
return self.SplitBatch(input)
elif isinstance(input, str):
return self.SplitBatch([input])[0]
raise RuntimeError("Unknown input type")
split = Split
class CharTokenizer(Tokenizer):
def __init__(self, lang_dir: Path, oov="<unk>", sep=""):
assert (
lang_dir / "tokens.txt"
).exists(), f"tokens.txt could not be found in {lang_dir}."
token_table = SymbolTable.from_file(lang_dir / "tokens.txt")
assert (
"#0" not in token_table
), "This tokenizer does not support disambig symbols."
self._id2sym = token_table._id2sym
self._sym2id = token_table._sym2id
self.oov = oov
self.oov_id = self._sym2id[oov]
self.sep = sep
if self.sep:
self.text2word = lambda x: x.split(self.sep)
else:
self.text2word = lambda x: list(x.replace(" ", ""))
def piece_to_id(self, piece: str) -> int:
try:
return self._sym2id[piece]
except KeyError:
return self.oov_id
def id_to_piece(self, id: int) -> str:
return self._id2sym[id]
def get_piece_size(self) -> int:
return len(self._sym2id)
def EncodeAsIdsBatch(self, input: List[str]) -> List[List[int]]:
return [[self.piece_to_id(i) for i in self.text2word(text)] for text in input]
def EncodeAsPiecesBatch(self, input: List[str]) -> List[List[str]]:
return [
[i if i in self._sym2id else self.oov for i in self.text2word(text)]
for text in input
]
def DecodeIdsBatch(self, input: List[List[int]]) -> List[str]:
return [self.sep.join(self.id_to_piece(i) for i in text) for text in input]
def DecodePiecesBatch(self, input: List[List[str]]) -> List[str]:
return [self.sep.join(text) for text in input]
def SplitBatch(self, input: List[str]) -> List[List[str]]:
return [self.text2word(text) for text in input]
def test_CharTokenizer():
test_single_string = "こんにちは"
test_multiple_string = [
"今日はいい天気ですよね",
"諏訪湖は綺麗でしょう",
"这在词表外",
"分かち 書き に し た 文章 です",
"",
]
test_empty_string = ""
sp = Tokenizer.load(Path("lang_char"), "char", oov="<unk>")
splitter = sp.split
print(sp.encode(test_single_string, out_type=str))
print(sp.encode(test_single_string, out_type=int))
print(sp.encode(test_multiple_string, out_type=str))
print(sp.encode(test_multiple_string, out_type=int))
print(sp.encode(test_empty_string, out_type=str))
print(sp.encode(test_empty_string, out_type=int))
print(sp.decode(sp.encode(test_single_string, out_type=str)))
print(sp.decode(sp.encode(test_single_string, out_type=int)))
print(sp.decode(sp.encode(test_multiple_string, out_type=str)))
print(sp.decode(sp.encode(test_multiple_string, out_type=int)))
print(sp.decode(sp.encode(test_empty_string, out_type=str)))
print(sp.decode(sp.encode(test_empty_string, out_type=int)))
print(splitter(test_single_string))
print(splitter(test_multiple_string))
print(splitter(test_empty_string))
if __name__ == "__main__":
test_CharTokenizer()

File diff suppressed because it is too large.

View File

@ -0,0 +1 @@
../../../librispeech/ASR/zipformer/zipformer.py