Merge aeba8b505cd923181410c9ffcac6836f3d469e89 into 34fc1fdf0d8ff520e2bb18267d046ca207c78ef9

2025-12-11 06:55:27 +00:00 · 2025-08-05 04:08:02 +00:00 · 2025-08-05 04:08:02 +00:00 · fa84791095
commit fa84791095
parent 34fc1fdf0d aeba8b505c
33 changed files with 5601 additions and 0 deletions
--- a/egs/mls_english/ASR/README.md
+++ b/egs/mls_english/ASR/README.md
@ -0,0 +1,19 @@
 # Introduction
 **Multilingual LibriSpeech (MLS)** is a large multilingual corpus suitable for speech research. The dataset is derived from read audiobooks from LibriVox and consists of 8 languages - English, German, Dutch, Spanish, French, Italian, Portuguese, Polish. It includes about 44.5K hours of English and a total of about 6K hours for other languages. This icefall training recipe was created for the restructured version of the English split of the dataset available on Hugging Face below.
 The dataset is available on Hugging Face. For more details, please visit:
 - Dataset: https://huggingface.co/datasets/parler-tts/mls_eng
 - Original MLS dataset link: https://www.openslr.org/94
 ## On-the-fly feature computation
 This recipe currently only supports on-the-fly feature bank computation, since `lhotse` manifests and feature banks are not pre-calculated in this recipe. This should mean that the dataset can be streamed from Hugging Face, but we have not tested this yet. We may add a version that supports pre-calculating features to better match existing recipes.
 <!-- [./RESULTS.md](./RESULTS.md) contains the latest results. -->
--- a/egs/mls_english/ASR/local/train_bpe_model.py
+++ b/egs/mls_english/ASR/local/train_bpe_model.py
@ -0,0 +1,114 @@
 #!/usr/bin/env python3
 # Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang)
 # Copyright    2024  Xiaomi Corp.        (authors: Xiaoyu Yang)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # You can install sentencepiece via:
 #
 #  pip install sentencepiece
 #
 # Due to an issue reported in
 # https://github.com/google/sentencepiece/pull/642#issuecomment-857972030
 #
 # Please install a version >=0.1.96
 import argparse
 import shutil
 from pathlib import Path
 import sentencepiece as spm
 def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--lang-dir",
        type=str,
        help="""Input and output directory.
        The generated bpe.model is saved to this directory.
        """,
    )
    parser.add_argument(
        "--byte-fallback",
        action="store_true",
        help="""Whether to enable byte_fallback when training bpe.""",
    )
    parser.add_argument(
        "--character-coverage",
        type=float,
        default=1.0,
        help="Character coverage in vocabulary.",
    )
    parser.add_argument(
        "--transcript",
        type=str,
        help="Training transcript.",
    )
    parser.add_argument(
        "--vocab-size",
        type=int,
        help="Vocabulary size for BPE training",
    )
    return parser.parse_args()
 def main():
    args = get_args()
    vocab_size = args.vocab_size
    lang_dir = Path(args.lang_dir)
    model_type = "bpe"
    model_prefix = f"{lang_dir}/{model_type}_{vocab_size}"
    train_text = args.transcript
    input_sentence_size = 100000000
    user_defined_symbols = ["<blk>", "<sos/eos>"]
    unk_id = len(user_defined_symbols)
    # Note: unk_id is fixed to 2.
    # If you change it, you should also change other
    # places that are using it.
    model_file = Path(model_prefix + ".model")
    if not model_file.is_file():
        spm.SentencePieceTrainer.train(
            input=train_text,
            vocab_size=vocab_size,
            model_type=model_type,
            model_prefix=model_prefix,
            input_sentence_size=input_sentence_size,
            character_coverage=args.character_coverage,
            user_defined_symbols=user_defined_symbols,
            byte_fallback=args.byte_fallback,
            unk_id=unk_id,
            bos_id=-1,
            eos_id=-1,
        )
    else:
        print(f"{model_file} exists - skipping")
        return
    shutil.copyfile(model_file, f"{lang_dir}/bpe.model")
 if __name__ == "__main__":
    main()
--- a/egs/mls_english/ASR/local/utils/asr_datamodule.py
+++ b/egs/mls_english/ASR/local/utils/asr_datamodule.py
@ -0,0 +1,253 @@
 # Copyright      2021  Piotr Żelasko
 # Copyright      2022  Xiaomi Corporation     (Author: Mingshuang Luo)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import argparse
 import inspect
 import logging
 from functools import lru_cache
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Union
 from lhotse import CutSet, Fbank, FbankConfig
 from lhotse.dataset import (
    CutConcatenate,
    DynamicBucketingSampler,
    K2SpeechRecognitionDataset,
    SimpleCutSampler,
    SpecAugment,
 )
 from lhotse.dataset.input_strategies import OnTheFlyFeatures
 from torch.utils.data import DataLoader
 from icefall.utils import str2bool
 class MLSEnglishHFAsrDataModule:
    """
    DataModule for MLS English ASR experiments using HuggingFace dataset.
    Handles dataset loading and provides train/valid/test dataloaders with
    on-the-fly feature extraction.
    """
    def __init__(self, args: argparse.Namespace):
        self.args = args
        self.dataset = None
    #     self._validate_args()
    # def _validate_args(self) -> None:
    #     """Validate configuration arguments."""
    #     if self.args.on_the_fly_feats is False:
    #         raise ValueError("This recipe requires on-the-fly feature extraction")
    @classmethod
    def add_arguments(cls, parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
        group = parser.add_argument_group(
            title="ASR data related options",
            description="Options for data loading and processing",
        )
        # Dataset configuration
        group.add_argument(
            "--dataset-path",
            type=str,
            default="parler-tts/mls_eng",
            help="Path to HuggingFace MLS English dataset (name or local path)",
        )
        # Sampling and batching
        group.add_argument(
            "--max-duration",
            type=float,
            default=200.0,
            help="Maximum batch duration in seconds",
        )
        group.add_argument(
            "--bucketing-sampler",
            type=str2bool,
            default=True,
            help="Whether to use bucketing sampler",
        )
        group.add_argument(
            "--num-buckets",
            type=int,
            default=30,
            help="Number of buckets for DynamicBucketingSampler",
        )
        # Data augmentation
        group.add_argument(
            "--enable-spec-aug",
            type=str2bool,
            default=True,
            help="Whether to enable SpecAugment",
        )
        group.add_argument(
            "--spec-aug-time-warp-factor",
            type=int,
            default=80,
            help="Time warp factor for SpecAugment",
        )
        # Dataloader configuration
        group.add_argument(
            "--num-workers",
            type=int,
            default=2,
            help="Number of workers for data loading",
        )
        group.add_argument(
            "--return-cuts",
            type=str2bool,
            default=False,
            help="Whether to return cuts in batch",
        )
        group.add_argument(
            "--drop-last",
            type=str2bool,
            default=True,
            help="Whether to drop last incomplete batch",
        )
        return parser
    def load_dataset(self, dataset_path: Optional[str] = None) -> None:
        """Load the HuggingFace dataset."""
        dataset_path = dataset_path or self.args.dataset_path
        logging.info(f"Loading MLS English dataset from: {dataset_path}")
        try:
            from datasets import load_dataset
            self.dataset = load_dataset(dataset_path)
            logging.info("Dataset loaded successfully")
        except ImportError:
            raise ImportError("Please install datasets package: pip install datasets")
        except Exception as e:
            raise RuntimeError(f"Failed to load dataset: {e}")
    def _create_dataset(
        self, cuts: CutSet, is_train: bool = False
    ) -> K2SpeechRecognitionDataset:
        """Create appropriate dataset with transforms."""
        transforms = []
        input_transforms = []
        if is_train and self.args.enable_spec_aug:
            input_transforms.append(self._create_spec_augment())
        return K2SpeechRecognitionDataset(
            cut_transforms=transforms,
            input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80))),
            input_transforms=input_transforms,
            return_cuts=self.args.return_cuts,
        )
    def _create_spec_augment(self) -> SpecAugment:
        """Create SpecAugment transform based on config."""
        num_frame_masks = 10
        num_frame_masks_parameter = inspect.signature(SpecAugment.__init__).parameters[
            "num_frame_masks"
        ]
        if num_frame_masks_parameter.default == 1:
            num_frame_masks = 2
        return SpecAugment(
            time_warp_factor=self.args.spec_aug_time_warp_factor,
            num_frame_masks=num_frame_masks,
            features_mask_size=27,
            num_feature_masks=2,
            frames_mask_size=100,
        )
    def _create_sampler(
        self, cuts: CutSet, shuffle: bool
    ) -> Union[DynamicBucketingSampler, SimpleCutSampler]:
        """Create appropriate sampler based on config."""
        if self.args.bucketing_sampler:
            return DynamicBucketingSampler(
                cuts,
                max_duration=self.args.max_duration,
                shuffle=shuffle,
                num_buckets=self.args.num_buckets,
                drop_last=self.args.drop_last,
            )
        return SimpleCutSampler(
            cuts,
            max_duration=self.args.max_duration,
            shuffle=shuffle,
        )
    def train_dataloader(
        self, sampler_state_dict: Optional[Dict[str, Any]] = None
    ) -> DataLoader:
        """Create train dataloader."""
        cuts = self.train_cuts()
        dataset = self._create_dataset(cuts, is_train=True)
        sampler = self._create_sampler(cuts, shuffle=True)
        if sampler_state_dict:
            sampler.load_state_dict(sampler_state_dict)
        return DataLoader(
            dataset,
            sampler=sampler,
            batch_size=None,
            num_workers=self.args.num_workers,
            persistent_workers=False,
        )
    def valid_dataloader(self) -> DataLoader:
        """Create validation dataloader."""
        cuts = self.valid_cuts()
        return DataLoader(
            self._create_dataset(cuts),
            sampler=self._create_sampler(cuts, shuffle=False),
            batch_size=None,
            num_workers=2,
            persistent_workers=False,
        )
    def test_dataloader(self) -> DataLoader:
        """Create test dataloader."""
        cuts = self.test_cuts()
        return DataLoader(
            self._create_dataset(cuts),
            sampler=self._create_sampler(cuts, shuffle=False),
            batch_size=None,
            num_workers=self.args.num_workers,
        )
    @lru_cache()
    def train_cuts(self) -> CutSet:
        return CutSet.from_huggingface_dataset(
            self.dataset["train"], text_key="transcript"
        )
    @lru_cache()
    def valid_cuts(self) -> CutSet:
        return CutSet.from_huggingface_dataset(
            self.dataset["dev"], text_key="transcript"
        )
    @lru_cache()
    def test_cuts(self) -> CutSet:
        return CutSet.from_huggingface_dataset(
            self.dataset["test"], text_key="transcript"
        )
--- a/egs/mls_english/ASR/local/utils/compute_fbank_mls_english.py
+++ b/egs/mls_english/ASR/local/utils/compute_fbank_mls_english.py
@ -0,0 +1,136 @@
 #!/usr/bin/env python3
 # Copyright    2023  The University of Electro-Communications  (Author: Teo Wen Shen)  # noqa
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import argparse
 import logging
 import os
 from pathlib import Path
 from typing import List, Tuple
 import torch
 # fmt: off
 from lhotse import (  # See the following for why LilcomChunkyWriter is preferred; https://github.com/k2-fsa/icefall/pull/404; https://github.com/lhotse-speech/lhotse/pull/527
    CutSet,
    Fbank,
    FbankConfig,
    LilcomChunkyWriter,
    RecordingSet,
    SupervisionSet,
 )
 from lhotse.utils import is_module_available
 # fmt: on
 # Torch's multithreaded behavior needs to be disabled or
 # it wastes a lot of CPU and slow things down.
 # Do this outside of main() in case it needs to take effect
 # even when we are not invoking the main (e.g. when spawning subprocesses).
 torch.set_num_threads(1)
 torch.set_num_interop_threads(1)
 RNG_SEED = 42
 concat_params = {"gap": 1.0, "maxlen": 10.0}
 def make_cutset_blueprints(
    mls_eng_hf_dataset_path: str = "parler-tts/mls_eng",
 ) -> List[Tuple[str, CutSet]]:
    cut_sets = []
    if not is_module_available("datasets"):
        raise ImportError(
            "To process the MLS English HF corpus, please install optional dependency: pip install datasets"
        )
    from datasets import load_dataset
    dataset = load_dataset(mls_eng_hf_dataset_path)
    # Create test dataset
    logging.info("Creating test cuts.")
    cut_sets.append(
        (
            "test",
            CutSet.from_huggingface_dataset(dataset["test"], text_key="transcript"),
        )
    )
    # Create dev dataset
    logging.info("Creating dev cuts.")
    cut_sets.append(
        ("dev", CutSet.from_huggingface_dataset(dataset["dev"], text_key="transcript"))
    )
    # Create train dataset
    logging.info("Creating train cuts.")
    cut_sets.append(
        (
            "train",
            CutSet.from_huggingface_dataset(dataset["train"], text_key="transcript"),
        )
    )
    return cut_sets
 def get_args():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument("-m", "--manifest-dir", type=Path)
    parser.add_argument("-a", "--audio-dir", type=Path)
    return parser.parse_args()
 def main():
    args = get_args()
    extractor = Fbank(FbankConfig(num_mel_bins=80))
    num_jobs = min(16, os.cpu_count())
    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    logging.basicConfig(format=formatter, level=logging.INFO)
    if (args.manifest_dir / ".mls-eng-fbank.done").exists():
        logging.info(
            "Previous fbank computed for MLS English found. "
            f"Delete {args.manifest_dir / '.mls-eng-fbank.done'} to allow recomputing fbank."
        )
        return
    else:
        mls_eng_hf_dataset_path = "/root/datasets/parler-tts--mls_eng"
        cut_sets = make_cutset_blueprints(mls_eng_hf_dataset_path)
        for part, cut_set in cut_sets:
            logging.info(f"Processing {part}")
            cut_set = cut_set.compute_and_store_features(
                extractor=extractor,
                num_jobs=num_jobs,
                storage_path=(args.manifest_dir / f"feats_{part}").as_posix(),
                storage_type=LilcomChunkyWriter,
            )
            # cut_set.save_audios(args.audio_dir)
            # cut_set.to_file(args.manifest_dir / f"mls_eng_cuts_{part}.jsonl.gz")
        logging.info("All fbank computed for MLS English.")
        (args.manifest_dir / ".mls-eng-fbank.done").touch()
 if __name__ == "__main__":
    main()
--- a/egs/mls_english/ASR/local/utils/generate_transcript.py
+++ b/egs/mls_english/ASR/local/utils/generate_transcript.py
@ -0,0 +1,91 @@
 #!/usr/bin/env python3
 # Copyright    2022  The University of Electro-Communications  (Author: Teo Wen Shen)  # noqa
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import argparse
 import logging
 from pathlib import Path
 from typing import Optional
 from lhotse import CutSet
 from tqdm import tqdm
 def get_args():
    parser = argparse.ArgumentParser(
        description="Generate transcripts for BPE training from MLS English dataset",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "--dataset-path",
        type=str,
        default="parler-tts/mls_eng",
        help="Path to HuggingFace MLS English dataset (name or local path)",
    )
    parser.add_argument(
        "--lang-dir",
        type=Path,
        default=Path("data/lang"),
        help="Directory to store output transcripts",
    )
    parser.add_argument(
        "--split",
        type=str,
        default="train",
        help="Dataset split to use for generating transcripts (train/dev/test)",
    )
    return parser.parse_args()
 def generate_transcript_from_cuts(cuts: CutSet, output_file: Path) -> None:
    """Generate transcript text file from Lhotse CutSet."""
    with open(output_file, "w") as f:
        for cut in tqdm(cuts, desc="Processing cuts"):
            for sup in cut.supervisions:
                f.write(f"{sup.text}\n")
 def main():
    args = get_args()
    logging.basicConfig(
        format="%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s",
        level=logging.INFO,
    )
    args.lang_dir.mkdir(parents=True, exist_ok=True)
    output_file = args.lang_dir / "transcript.txt"
    logging.info(f"Loading {args.split} split from dataset: {args.dataset_path}")
    try:
        cuts = CutSet.from_huggingface_dataset(
            args.dataset_path, split=args.split, text_key="transcript"
        )
    except Exception as e:
        logging.error(f"Failed to load dataset: {e}")
        raise
    logging.info(f"Generating transcript to {output_file}")
    generate_transcript_from_cuts(cuts, output_file)
    logging.info("Transcript generation completed")
 if __name__ == "__main__":
    main()
--- a/egs/mls_english/ASR/prepare.sh
+++ b/egs/mls_english/ASR/prepare.sh
@ -0,0 +1,72 @@
 #!/usr/bin/env bash
 # Prepare script for MLS English ASR recipe in icefall
 # This recipe uses on-the-fly feature extraction, so it skips manifest
 # and feature generation steps used in other recipes.
 # fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
 export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
 set -eou pipefail
 nj=15
 stage=-1
 stop_stage=100
 # Configuration for BPE tokenizer
 vocab_sizes=(2000)  # You can add more sizes like (500 1000 2000) for comparison
 # Directory where dataset will be downloaded
 dl_dir=$PWD/download
 . shared/parse_options.sh || exit 1
 # All files generated by this script are saved in "data".
 mkdir -p data
 log() {
    local fname=${BASH_SOURCE[1]##*/}
    echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
 }
 log "Starting MLS English data preparation"
 if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
  log "Stage 0: Download MLS English dataset"
  if [ ! -d $dl_dir/mls_english ]; then
    if ! git clone https://huggingface.co/datasets/parler-tts/mls_eng $dl_dir/mls_english; then
      log "Failed to download MLS English dataset"
      exit 1
    fi
  fi
 fi
 mkdir -p data/lang
 lang_dir=data/lang
 if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
  log "Stage 1: Prepare transcript for BPE training"
  if [ ! -f $lang_dir/transcript.txt ]; then
    log "Generating transcripts for BPE training"
    ./local/utils/generate_transcript.py --lang-dir $lang_dir
  fi
 fi
 if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
  log "Stage 2: Prepare BPE tokenizer"
  for vocab_size in ${vocab_sizes[@]}; do
    log "Training BPE model with vocab_size=${vocab_size}"
    bpe_dir=data/lang/bpe_${vocab_size}
    mkdir -p $bpe_dir
    if [ ! -f $bpe_dir/bpe.model ]; then
      ./local/train_bpe_model.py \
        --lang-dir $bpe_dir \
        --vocab-size $vocab_size \
        --transcript $lang_dir/transcript.txt
    fi
  done
 fi
 log "MLS English data preparation completed successfully"
--- a/egs/mls_english/ASR/zipformer/asr_datamodule.py
+++ b/egs/mls_english/ASR/zipformer/asr_datamodule.py
@ -0,0 +1 @@
 ../local/utils/asr_datamodule.py
--- a/egs/mls_english/ASR/zipformer/beam_search.py
+++ b/egs/mls_english/ASR/zipformer/beam_search.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/zipformer/beam_search.py
--- a/egs/mls_english/ASR/zipformer/ctc_decode.py
+++ b/egs/mls_english/ASR/zipformer/ctc_decode.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/zipformer/ctc_decode.py
--- a/egs/mls_english/ASR/zipformer/decode.py
+++ b/egs/mls_english/ASR/zipformer/decode.py
--- a/egs/mls_english/ASR/zipformer/decode_stream.py
+++ b/egs/mls_english/ASR/zipformer/decode_stream.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/zipformer/decode_stream.py
--- a/egs/mls_english/ASR/zipformer/decoder.py
+++ b/egs/mls_english/ASR/zipformer/decoder.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/zipformer/decoder.py
--- a/egs/mls_english/ASR/zipformer/do_not_use_it_directly.py
+++ b/egs/mls_english/ASR/zipformer/do_not_use_it_directly.py
--- a/egs/mls_english/ASR/zipformer/encoder_interface.py
+++ b/egs/mls_english/ASR/zipformer/encoder_interface.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/zipformer/encoder_interface.py
--- a/egs/mls_english/ASR/zipformer/export-onnx.py
+++ b/egs/mls_english/ASR/zipformer/export-onnx.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/zipformer/export-onnx.py
--- a/egs/mls_english/ASR/zipformer/export.py
+++ b/egs/mls_english/ASR/zipformer/export.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/zipformer/export.py
--- a/egs/mls_english/ASR/zipformer/generate_averaged_model.py
+++ b/egs/mls_english/ASR/zipformer/generate_averaged_model.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/zipformer/generate_averaged_model.py
--- a/egs/mls_english/ASR/zipformer/joiner.py
+++ b/egs/mls_english/ASR/zipformer/joiner.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/zipformer/joiner.py
--- a/egs/mls_english/ASR/zipformer/model.py
+++ b/egs/mls_english/ASR/zipformer/model.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/zipformer/model.py
--- a/egs/mls_english/ASR/zipformer/my_profile.py
+++ b/egs/mls_english/ASR/zipformer/my_profile.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/zipformer/my_profile.py
--- a/egs/mls_english/ASR/zipformer/onnx_pretrained.py
+++ b/egs/mls_english/ASR/zipformer/onnx_pretrained.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/zipformer/onnx_pretrained.py
--- a/egs/mls_english/ASR/zipformer/optim.py
+++ b/egs/mls_english/ASR/zipformer/optim.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/zipformer/optim.py
--- a/egs/mls_english/ASR/zipformer/pretrained.py
+++ b/egs/mls_english/ASR/zipformer/pretrained.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/zipformer/pretrained.py
--- a/egs/mls_english/ASR/zipformer/scaling.py
+++ b/egs/mls_english/ASR/zipformer/scaling.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/zipformer/scaling.py
--- a/egs/mls_english/ASR/zipformer/scaling_converter.py
+++ b/egs/mls_english/ASR/zipformer/scaling_converter.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/zipformer/scaling_converter.py
--- a/egs/mls_english/ASR/zipformer/streaming_beam_search.py
+++ b/egs/mls_english/ASR/zipformer/streaming_beam_search.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/zipformer/streaming_beam_search.py
--- a/egs/mls_english/ASR/zipformer/streaming_decode.py
+++ b/egs/mls_english/ASR/zipformer/streaming_decode.py
@ -0,0 +1,900 @@
 #!/usr/bin/env python3
 # Copyright 2022-2023 Xiaomi Corporation (Authors: Wei Kang,
 #                                                  Fangjun Kuang,
 #                                                  Zengwei Yao)
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
 Usage:
 ./zipformer/streaming_decode.py--epoch 28   --avg 15   --causal 1   --chunk-size 32   --left-context-frames 256   --exp-dir ./zipformer/exp-large --lang data/lang_char --num-encoder-layers 2,2,4,5,4,2 --feedforward-dim 512,768,1536,2048,1536,768 --encoder-dim 192,256,512,768,512,256 --encoder-unmasked-dim 192,192,256,320,256,192
 """
 import argparse
 import logging
 import math
 import os
 import pdb
 import subprocess as sp
 from pathlib import Path
 from typing import Dict, List, Optional, Tuple
 import k2
 import numpy as np
 import torch
 from asr_datamodule import ReazonSpeechAsrDataModule
 from decode_stream import DecodeStream
 from kaldifeat import Fbank, FbankOptions
 from lhotse import CutSet
 from streaming_beam_search import (
    fast_beam_search_one_best,
    greedy_search,
    modified_beam_search,
 )
 from tokenizer import Tokenizer
 from torch import Tensor, nn
 from torch.nn.utils.rnn import pad_sequence
 from train import add_model_arguments, get_model, get_params
 from icefall.checkpoint import (
    average_checkpoints,
    average_checkpoints_with_averaged_model,
    find_checkpoints,
    load_checkpoint,
 )
 from icefall.utils import (
    AttributeDict,
    make_pad_mask,
    setup_logger,
    store_transcripts,
    str2bool,
    write_error_stats,
 )
 LOG_EPS = math.log(1e-10)
 def get_parser():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        "--epoch",
        type=int,
        default=28,
        help="""It specifies the checkpoint to use for decoding.
        Note: Epoch counts from 1.
        You can specify --avg to use more checkpoints for model averaging.""",
    )
    parser.add_argument(
        "--iter",
        type=int,
        default=0,
        help="""If positive, --epoch is ignored and it
        will use the checkpoint exp_dir/checkpoint-iter.pt.
        You can specify --avg to use more checkpoints for model averaging.
        """,
    )
    parser.add_argument(
        "--avg",
        type=int,
        default=15,
        help="Number of checkpoints to average. Automatically select "
        "consecutive checkpoints before the checkpoint specified by "
        "'--epoch' and '--iter'",
    )
    parser.add_argument(
        "--use-averaged-model",
        type=str2bool,
        default=True,
        help="Whether to load averaged model. Currently it only supports "
        "using --epoch. If True, it would decode with the averaged model "
        "over the epoch range from `epoch-avg` (excluded) to `epoch`."
        "Actually only the models with epoch number of `epoch-avg` and "
        "`epoch` are loaded for averaging. ",
    )
    parser.add_argument(
        "--exp-dir",
        type=str,
        default="zipformer/exp",
        help="The experiment dir",
    )
    parser.add_argument(
        "--bpe-model",
        type=str,
        default="data/lang_bpe_500/bpe.model",
        help="Path to the BPE model",
    )
    parser.add_argument(
        "--lang-dir",
        type=Path,
        default="data/lang_char",
        help="The lang dir containing word table and LG graph",
    )
    parser.add_argument(
        "--decoding-method",
        type=str,
        default="greedy_search",
        help="""Supported decoding methods are:
        greedy_search
        modified_beam_search
        fast_beam_search
        """,
    )
    parser.add_argument(
        "--num_active_paths",
        type=int,
        default=4,
        help="""An interger indicating how many candidates we will keep for each
        frame. Used only when --decoding-method is modified_beam_search.""",
    )
    parser.add_argument(
        "--beam",
        type=float,
        default=4,
        help="""A floating point value to calculate the cutoff score during beam
        search (i.e., `cutoff = max-score - beam`), which is the same as the
        `beam` in Kaldi.
        Used only when --decoding-method is fast_beam_search""",
    )
    parser.add_argument(
        "--max-contexts",
        type=int,
        default=4,
        help="""Used only when --decoding-method is
        fast_beam_search""",
    )
    parser.add_argument(
        "--max-states",
        type=int,
        default=32,
        help="""Used only when --decoding-method is
        fast_beam_search""",
    )
    parser.add_argument(
        "--context-size",
        type=int,
        default=2,
        help="The context size in the decoder. 1 means bigram; 2 means tri-gram",
    )
    parser.add_argument(
        "--num-decode-streams",
        type=int,
        default=2000,
        help="The number of streams that can be decoded parallel.",
    )
    add_model_arguments(parser)
    return parser
 def get_init_states(
    model: nn.Module,
    batch_size: int = 1,
    device: torch.device = torch.device("cpu"),
 ) -> List[torch.Tensor]:
    """
    Returns a list of cached tensors of all encoder layers. For layer-i, states[i*6:(i+1)*6]
    is (cached_key, cached_nonlin_attn, cached_val1, cached_val2, cached_conv1, cached_conv2).
    states[-2] is the cached left padding for ConvNeXt module,
    of shape (batch_size, num_channels, left_pad, num_freqs)
    states[-1] is processed_lens of shape (batch,), which records the number
    of processed frames (at 50hz frame rate, after encoder_embed) for each sample in batch.
    """
    states = model.encoder.get_init_states(batch_size, device)
    embed_states = model.encoder_embed.get_init_states(batch_size, device)
    states.append(embed_states)
    processed_lens = torch.zeros(batch_size, dtype=torch.int32, device=device)
    states.append(processed_lens)
    return states
 def stack_states(state_list: List[List[torch.Tensor]]) -> List[torch.Tensor]:
    """Stack list of zipformer states that correspond to separate utterances
    into a single emformer state, so that it can be used as an input for
    zipformer when those utterances are formed into a batch.
    Args:
      state_list:
        Each element in state_list corresponding to the internal state
        of the zipformer model for a single utterance. For element-n,
        state_list[n] is a list of cached tensors of all encoder layers. For layer-i,
        state_list[n][i*6:(i+1)*6] is (cached_key, cached_nonlin_attn, cached_val1,
        cached_val2, cached_conv1, cached_conv2).
        state_list[n][-2] is the cached left padding for ConvNeXt module,
          of shape (batch_size, num_channels, left_pad, num_freqs)
        state_list[n][-1] is processed_lens of shape (batch,), which records the number
        of processed frames (at 50hz frame rate, after encoder_embed) for each sample in batch.
    Note:
      It is the inverse of :func:`unstack_states`.
    """
    batch_size = len(state_list)
    assert (len(state_list[0]) - 2) % 6 == 0, len(state_list[0])
    tot_num_layers = (len(state_list[0]) - 2) // 6
    batch_states = []
    for layer in range(tot_num_layers):
        layer_offset = layer * 6
        # cached_key: (left_context_len, batch_size, key_dim)
        cached_key = torch.cat(
            [state_list[i][layer_offset] for i in range(batch_size)], dim=1
        )
        # cached_nonlin_attn: (num_heads, batch_size, left_context_len, head_dim)
        cached_nonlin_attn = torch.cat(
            [state_list[i][layer_offset + 1] for i in range(batch_size)], dim=1
        )
        # cached_val1: (left_context_len, batch_size, value_dim)
        cached_val1 = torch.cat(
            [state_list[i][layer_offset + 2] for i in range(batch_size)], dim=1
        )
        # cached_val2: (left_context_len, batch_size, value_dim)
        cached_val2 = torch.cat(
            [state_list[i][layer_offset + 3] for i in range(batch_size)], dim=1
        )
        # cached_conv1: (#batch, channels, left_pad)
        cached_conv1 = torch.cat(
            [state_list[i][layer_offset + 4] for i in range(batch_size)], dim=0
        )
        # cached_conv2: (#batch, channels, left_pad)
        cached_conv2 = torch.cat(
            [state_list[i][layer_offset + 5] for i in range(batch_size)], dim=0
        )
        batch_states += [
            cached_key,
            cached_nonlin_attn,
            cached_val1,
            cached_val2,
            cached_conv1,
            cached_conv2,
        ]
    cached_embed_left_pad = torch.cat(
        [state_list[i][-2] for i in range(batch_size)], dim=0
    )
    batch_states.append(cached_embed_left_pad)
    processed_lens = torch.cat([state_list[i][-1] for i in range(batch_size)], dim=0)
    batch_states.append(processed_lens)
    return batch_states
 def unstack_states(batch_states: List[Tensor]) -> List[List[Tensor]]:
    """Unstack the zipformer state corresponding to a batch of utterances
    into a list of states, where the i-th entry is the state from the i-th
    utterance in the batch.
    Note:
      It is the inverse of :func:`stack_states`.
    Args:
        batch_states: A list of cached tensors of all encoder layers. For layer-i,
          states[i*6:(i+1)*6] is (cached_key, cached_nonlin_attn, cached_val1, cached_val2,
          cached_conv1, cached_conv2).
          state_list[-2] is the cached left padding for ConvNeXt module,
          of shape (batch_size, num_channels, left_pad, num_freqs)
          states[-1] is processed_lens of shape (batch,), which records the number
          of processed frames (at 50hz frame rate, after encoder_embed) for each sample in batch.
    Returns:
        state_list: A list of list. Each element in state_list corresponding to the internal state
        of the zipformer model for a single utterance.
    """
    assert (len(batch_states) - 2) % 6 == 0, len(batch_states)
    tot_num_layers = (len(batch_states) - 2) // 6
    processed_lens = batch_states[-1]
    batch_size = processed_lens.shape[0]
    state_list = [[] for _ in range(batch_size)]
    for layer in range(tot_num_layers):
        layer_offset = layer * 6
        # cached_key: (left_context_len, batch_size, key_dim)
        cached_key_list = batch_states[layer_offset].chunk(chunks=batch_size, dim=1)
        # cached_nonlin_attn: (num_heads, batch_size, left_context_len, head_dim)
        cached_nonlin_attn_list = batch_states[layer_offset + 1].chunk(
            chunks=batch_size, dim=1
        )
        # cached_val1: (left_context_len, batch_size, value_dim)
        cached_val1_list = batch_states[layer_offset + 2].chunk(
            chunks=batch_size, dim=1
        )
        # cached_val2: (left_context_len, batch_size, value_dim)
        cached_val2_list = batch_states[layer_offset + 3].chunk(
            chunks=batch_size, dim=1
        )
        # cached_conv1: (#batch, channels, left_pad)
        cached_conv1_list = batch_states[layer_offset + 4].chunk(
            chunks=batch_size, dim=0
        )
        # cached_conv2: (#batch, channels, left_pad)
        cached_conv2_list = batch_states[layer_offset + 5].chunk(
            chunks=batch_size, dim=0
        )
        for i in range(batch_size):
            state_list[i] += [
                cached_key_list[i],
                cached_nonlin_attn_list[i],
                cached_val1_list[i],
                cached_val2_list[i],
                cached_conv1_list[i],
                cached_conv2_list[i],
            ]
    cached_embed_left_pad_list = batch_states[-2].chunk(chunks=batch_size, dim=0)
    for i in range(batch_size):
        state_list[i].append(cached_embed_left_pad_list[i])
    processed_lens_list = batch_states[-1].chunk(chunks=batch_size, dim=0)
    for i in range(batch_size):
        state_list[i].append(processed_lens_list[i])
    return state_list
 def streaming_forward(
    features: Tensor,
    feature_lens: Tensor,
    model: nn.Module,
    states: List[Tensor],
    chunk_size: int,
    left_context_len: int,
 ) -> Tuple[Tensor, Tensor, List[Tensor]]:
    """
    Returns encoder outputs, output lengths, and updated states.
    """
    cached_embed_left_pad = states[-2]
    (x, x_lens, new_cached_embed_left_pad,) = model.encoder_embed.streaming_forward(
        x=features,
        x_lens=feature_lens,
        cached_left_pad=cached_embed_left_pad,
    )
    assert x.size(1) == chunk_size, (x.size(1), chunk_size)
    src_key_padding_mask = make_pad_mask(x_lens)
    # processed_mask is used to mask out initial states
    processed_mask = torch.arange(left_context_len, device=x.device).expand(
        x.size(0), left_context_len
    )
    processed_lens = states[-1]  # (batch,)
    # (batch, left_context_size)
    processed_mask = (processed_lens.unsqueeze(1) <= processed_mask).flip(1)
    # Update processed lengths
    new_processed_lens = processed_lens + x_lens
    # (batch, left_context_size + chunk_size)
    src_key_padding_mask = torch.cat([processed_mask, src_key_padding_mask], dim=1)
    x = x.permute(1, 0, 2)  # (N, T, C) -> (T, N, C)
    encoder_states = states[:-2]
    (
        encoder_out,
        encoder_out_lens,
        new_encoder_states,
    ) = model.encoder.streaming_forward(
        x=x,
        x_lens=x_lens,
        states=encoder_states,
        src_key_padding_mask=src_key_padding_mask,
    )
    encoder_out = encoder_out.permute(1, 0, 2)  # (T, N, C) ->(N, T, C)
    new_states = new_encoder_states + [
        new_cached_embed_left_pad,
        new_processed_lens,
    ]
    return encoder_out, encoder_out_lens, new_states
 def decode_one_chunk(
    params: AttributeDict,
    model: nn.Module,
    decode_streams: List[DecodeStream],
 ) -> List[int]:
    """Decode one chunk frames of features for each decode_streams and
    return the indexes of finished streams in a List.
    Args:
      params:
        It's the return value of :func:`get_params`.
      model:
        The neural model.
      decode_streams:
        A List of DecodeStream, each belonging to a utterance.
    Returns:
      Return a List containing which DecodeStreams are finished.
    """
    # pdb.set_trace()
    # print(model)
    # print(model.device)
    # device = model.device
    chunk_size = int(params.chunk_size)
    left_context_len = int(params.left_context_frames)
    features = []
    feature_lens = []
    states = []
    processed_lens = []  # Used in fast-beam-search
    for stream in decode_streams:
        feat, feat_len = stream.get_feature_frames(chunk_size * 2)
        features.append(feat)
        feature_lens.append(feat_len)
        states.append(stream.states)
        processed_lens.append(stream.done_frames)
    feature_lens = torch.tensor(feature_lens, device=model.device)
    features = pad_sequence(features, batch_first=True, padding_value=LOG_EPS)
    # Make sure the length after encoder_embed is at least 1.
    # The encoder_embed subsample features (T - 7) // 2
    # The ConvNeXt module needs (7 - 1) // 2 = 3 frames of right padding after subsampling
    tail_length = chunk_size * 2 + 7 + 2 * 3
    if features.size(1) < tail_length:
        pad_length = tail_length - features.size(1)
        feature_lens += pad_length
        features = torch.nn.functional.pad(
            features,
            (0, 0, 0, pad_length),
            mode="constant",
            value=LOG_EPS,
        )
    states = stack_states(states)
    encoder_out, encoder_out_lens, new_states = streaming_forward(
        features=features,
        feature_lens=feature_lens,
        model=model,
        states=states,
        chunk_size=chunk_size,
        left_context_len=left_context_len,
    )
    encoder_out = model.joiner.encoder_proj(encoder_out)
    if params.decoding_method == "greedy_search":
        greedy_search(model=model, encoder_out=encoder_out, streams=decode_streams)
    elif params.decoding_method == "fast_beam_search":
        processed_lens = torch.tensor(processed_lens, device=model.device)
        processed_lens = processed_lens + encoder_out_lens
        fast_beam_search_one_best(
            model=model,
            encoder_out=encoder_out,
            processed_lens=processed_lens,
            streams=decode_streams,
            beam=params.beam,
            max_states=params.max_states,
            max_contexts=params.max_contexts,
        )
    elif params.decoding_method == "modified_beam_search":
        modified_beam_search(
            model=model,
            streams=decode_streams,
            encoder_out=encoder_out,
            num_active_paths=params.num_active_paths,
        )
    else:
        raise ValueError(f"Unsupported decoding method: {params.decoding_method}")
    states = unstack_states(new_states)
    finished_streams = []
    for i in range(len(decode_streams)):
        decode_streams[i].states = states[i]
        decode_streams[i].done_frames += encoder_out_lens[i]
        # if decode_streams[i].done:
        # finished_streams.append(i)
        finished_streams.append(i)
    return finished_streams
 def decode_dataset(
    cuts: CutSet,
    params: AttributeDict,
    model: nn.Module,
    tokenizer: Tokenizer,
    decoding_graph: Optional[k2.Fsa] = None,
 ) -> Dict[str, List[Tuple[List[str], List[str]]]]:
    """Decode dataset.
    Args:
      cuts:
        Lhotse Cutset containing the dataset to decode.
      params:
        It is returned by :func:`get_params`.
      model:
        The neural model.
      tokenizer:
        The BPE model.
      decoding_graph:
        The decoding graph. Can be either a `k2.trivial_graph` or HLG, Used
        only when --decoding_method is fast_beam_search.
    Returns:
      Return a dict, whose key may be "greedy_search" if greedy search
      is used, or it may be "beam_7" if beam size of 7 is used.
      Its value is a list of tuples. Each tuple contains two elements:
      The first is the reference transcript, and the second is the
      predicted result.
    """
    device = model.device
    opts = FbankOptions()
    opts.device = device
    opts.frame_opts.dither = 0
    opts.frame_opts.snip_edges = False
    opts.frame_opts.samp_freq = 16000
    opts.mel_opts.num_bins = 80
    log_interval = 100
    decode_results = []
    # Contain decode streams currently running.
    decode_streams = []
    for num, cut in enumerate(cuts):
        # each utterance has a DecodeStream.
        initial_states = get_init_states(model=model, batch_size=1, device=device)
        decode_stream = DecodeStream(
            params=params,
            cut_id=cut.id,
            initial_states=initial_states,
            decoding_graph=decoding_graph,
            device=device,
        )
        audio: np.ndarray = cut.load_audio()
        # audio.shape: (1, num_samples)
        assert len(audio.shape) == 2
        assert audio.shape[0] == 1, "Should be single channel"
        assert audio.dtype == np.float32, audio.dtype
        # The trained model is using normalized samples
        # - this is to avoid sending [-32k,+32k] signal in...
        # - some lhotse AudioTransform classes can make the signal
        #   be out of range [-1, 1], hence the tolerance 10
        assert (
            np.abs(audio).max() <= 10
        ), "Should be normalized to [-1, 1], 10 for tolerance..."
        samples = torch.from_numpy(audio).squeeze(0)
        fbank = Fbank(opts)
        feature = fbank(samples.to(device))
        decode_stream.set_features(feature, tail_pad_len=30)
        decode_stream.ground_truth = cut.supervisions[0].text
        decode_streams.append(decode_stream)
        while len(decode_streams) >= params.num_decode_streams:
            finished_streams = decode_one_chunk(
                params=params, model=model, decode_streams=decode_streams
            )
            for i in sorted(finished_streams, reverse=True):
                decode_results.append(
                    (
                        decode_streams[i].id,
                        decode_streams[i].ground_truth.split(),
                        tokenizer.decode(decode_streams[i].decoding_result()).split(),
                    )
                )
                del decode_streams[i]
        if num % log_interval == 0:
            logging.info(f"Cuts processed until now is {num}.")
    # decode final chunks of last sequences
    while len(decode_streams):
        # print("INSIDE LEN DECODE STREAMS")
        # pdb.set_trace()
        # print(model.device)
        # test_device = model.device
        # print("done")
        finished_streams = decode_one_chunk(
            params=params, model=model, decode_streams=decode_streams
        )
        # print('INSIDE FOR LOOP ')
        # print(finished_streams)
        if not finished_streams:
            print("No finished streams, breaking the loop")
            break
        for i in sorted(finished_streams, reverse=True):
            try:
                decode_results.append(
                    (
                        decode_streams[i].id,
                        decode_streams[i].ground_truth.split(),
                        tokenizer.decode(decode_streams[i].decoding_result()).split(),
                    )
                )
                del decode_streams[i]
            except IndexError as e:
                print(f"IndexError: {e}")
                print(f"decode_streams length: {len(decode_streams)}")
                print(f"finished_streams: {finished_streams}")
                print(f"i: {i}")
                continue
    if params.decoding_method == "greedy_search":
        key = "greedy_search"
    elif params.decoding_method == "fast_beam_search":
        key = (
            f"beam_{params.beam}_"
            f"max_contexts_{params.max_contexts}_"
            f"max_states_{params.max_states}"
        )
    elif params.decoding_method == "modified_beam_search":
        key = f"num_active_paths_{params.num_active_paths}"
    else:
        raise ValueError(f"Unsupported decoding method: {params.decoding_method}")
    torch.cuda.synchronize()
    return {key: decode_results}
 def save_results(
    params: AttributeDict,
    test_set_name: str,
    results_dict: Dict[str, List[Tuple[List[str], List[str]]]],
 ):
    test_set_wers = dict()
    for key, results in results_dict.items():
        recog_path = (
            params.res_dir / f"recogs-{test_set_name}-{key}-{params.suffix}.txt"
        )
        results = sorted(results)
        store_transcripts(filename=recog_path, texts=results)
        logging.info(f"The transcripts are stored in {recog_path}")
        # The following prints out WERs, per-word error statistics and aligned
        # ref/hyp pairs.
        errs_filename = (
            params.res_dir / f"errs-{test_set_name}-{key}-{params.suffix}.txt"
        )
        with open(errs_filename, "w") as f:
            wer = write_error_stats(
                f, f"{test_set_name}-{key}", results, enable_log=True
            )
            test_set_wers[key] = wer
        logging.info("Wrote detailed error stats to {}".format(errs_filename))
    test_set_wers = sorted(test_set_wers.items(), key=lambda x: x[1])
    errs_info = (
        params.res_dir / f"wer-summary-{test_set_name}-{key}-{params.suffix}.txt"
    )
    with open(errs_info, "w") as f:
        print("settings\tWER", file=f)
        for key, val in test_set_wers:
            print("{}\t{}".format(key, val), file=f)
    s = "\nFor {}, WER of different settings are:\n".format(test_set_name)
    note = "\tbest for {}".format(test_set_name)
    for key, val in test_set_wers:
        s += "{}\t{}{}\n".format(key, val, note)
        note = ""
    logging.info(s)
@torch.no_grad()
 def main():
    parser = get_parser()
    ReazonSpeechAsrDataModule.add_arguments(parser)
    Tokenizer.add_arguments(parser)
    args = parser.parse_args()
    args.exp_dir = Path(args.exp_dir)
    params = get_params()
    params.update(vars(args))
    params.res_dir = params.exp_dir / "streaming" / params.decoding_method
    if params.iter > 0:
        params.suffix = f"iter-{params.iter}-avg-{params.avg}"
    else:
        params.suffix = f"epoch-{params.epoch}-avg-{params.avg}"
    assert params.causal, params.causal
    assert "," not in params.chunk_size, "chunk_size should be one value in decoding."
    assert (
        "," not in params.left_context_frames
    ), "left_context_frames should be one value in decoding."
    params.suffix += f"-chunk-{params.chunk_size}"
    params.suffix += f"-left-context-{params.left_context_frames}"
    # for fast_beam_search
    if params.decoding_method == "fast_beam_search":
        params.suffix += f"-beam-{params.beam}"
        params.suffix += f"-max-contexts-{params.max_contexts}"
        params.suffix += f"-max-states-{params.max_states}"
    if params.use_averaged_model:
        params.suffix += "-use-averaged-model"
    setup_logger(f"{params.res_dir}/log-decode-{params.suffix}")
    logging.info("Decoding started")
    device = torch.device("cpu")
    if torch.cuda.is_available():
        device = torch.device("cuda", 0)
    logging.info(f"Device: {device}")
    sp_token = Tokenizer.load(params.lang, params.lang_type)
    # <blk> and <unk> is defined in local/train_bpe_model.py
    params.blank_id = sp_token.piece_to_id("<blk>")
    params.unk_id = sp_token.piece_to_id("<unk>")
    params.vocab_size = sp_token.get_piece_size()
    logging.info(params)
    logging.info("About to create model")
    model = get_model(params)
    if not params.use_averaged_model:
        if params.iter > 0:
            filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
                : params.avg
            ]
            if len(filenames) == 0:
                raise ValueError(
                    f"No checkpoints found for"
                    f" --iter {params.iter}, --avg {params.avg}"
                )
            elif len(filenames) < params.avg:
                raise ValueError(
                    f"Not enough checkpoints ({len(filenames)}) found for"
                    f" --iter {params.iter}, --avg {params.avg}"
                )
            logging.info(f"averaging {filenames}")
            model.to(device)
            model.load_state_dict(average_checkpoints(filenames, device=device))
        elif params.avg == 1:
            load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
        else:
            start = params.epoch - params.avg + 1
            filenames = []
            for i in range(start, params.epoch + 1):
                if start >= 0:
                    filenames.append(f"{params.exp_dir}/epoch-{i}.pt")
            logging.info(f"averaging {filenames}")
            model.to(device)
            model.load_state_dict(average_checkpoints(filenames, device=device))
    else:
        if params.iter > 0:
            filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
                : params.avg + 1
            ]
            if len(filenames) == 0:
                raise ValueError(
                    f"No checkpoints found for"
                    f" --iter {params.iter}, --avg {params.avg}"
                )
            elif len(filenames) < params.avg + 1:
                raise ValueError(
                    f"Not enough checkpoints ({len(filenames)}) found for"
                    f" --iter {params.iter}, --avg {params.avg}"
                )
            filename_start = filenames[-1]
            filename_end = filenames[0]
            logging.info(
                "Calculating the averaged model over iteration checkpoints"
                f" from {filename_start} (excluded) to {filename_end}"
            )
            model.to(device)
            model.load_state_dict(
                average_checkpoints_with_averaged_model(
                    filename_start=filename_start,
                    filename_end=filename_end,
                    device=device,
                )
            )
        else:
            assert params.avg > 0, params.avg
            start = params.epoch - params.avg
            assert start >= 1, start
            filename_start = f"{params.exp_dir}/epoch-{start}.pt"
            filename_end = f"{params.exp_dir}/epoch-{params.epoch}.pt"
            logging.info(
                f"Calculating the averaged model over epoch range from "
                f"{start} (excluded) to {params.epoch}"
            )
            model.to(device)
            model.load_state_dict(
                average_checkpoints_with_averaged_model(
                    filename_start=filename_start,
                    filename_end=filename_end,
                    device=device,
                )
            )
    model.to(device)
    model.eval()
    model.device = device
    decoding_graph = None
    if params.decoding_method == "fast_beam_search":
        decoding_graph = k2.trivial_graph(params.vocab_size - 1, device=device)
    num_param = sum([p.numel() for p in model.parameters()])
    logging.info(f"Number of model parameters: {num_param}")
    # we need cut ids to display recognition results.
    args.return_cuts = True
    reazonspeech_corpus = ReazonSpeechAsrDataModule(args)
    valid_cuts = reazonspeech_corpus.valid_cuts()
    test_cuts = reazonspeech_corpus.test_cuts()
    test_sets = ["valid", "test"]
    test_cuts = [valid_cuts, test_cuts]
    for test_set, test_cut in zip(test_sets, test_cuts):
        results_dict = decode_dataset(
            cuts=test_cut,
            params=params,
            model=model,
            tokenizer=sp_token,
            decoding_graph=decoding_graph,
        )
        save_results(
            params=params,
            test_set_name=test_set,
            results_dict=results_dict,
        )
    # valid_cuts = reazonspeech_corpus.valid_cuts()
    # for valid_cut in valid_cuts:
    #     results_dict = decode_dataset(
    #         cuts=valid_cut,
    #         params=params,
    #         model=model,
    #         sp=sp,
    #         decoding_graph=decoding_graph,
    #     )
    #     save_results(
    #         params=params,
    #         test_set_name="valid",
    #         results_dict=results_dict,
    #     )
    logging.info("Done!")
 if __name__ == "__main__":
    main()
--- a/egs/mls_english/ASR/zipformer/subsampling.py
+++ b/egs/mls_english/ASR/zipformer/subsampling.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/zipformer/subsampling.py
--- a/egs/mls_english/ASR/zipformer/test_scaling.py
+++ b/egs/mls_english/ASR/zipformer/test_scaling.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/zipformer/test_scaling.py
--- a/egs/mls_english/ASR/zipformer/test_subsampling.py
+++ b/egs/mls_english/ASR/zipformer/test_subsampling.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/zipformer/test_subsampling.py
--- a/egs/mls_english/ASR/zipformer/tokenizer.py
+++ b/egs/mls_english/ASR/zipformer/tokenizer.py
@ -0,0 +1,252 @@
 import argparse
 from pathlib import Path
 from typing import Callable, List, Union
 import sentencepiece as spm
 from k2 import SymbolTable
 class Tokenizer:
    text2word: Callable[[str], List[str]]
    @staticmethod
    def add_arguments(parser: argparse.ArgumentParser):
        group = parser.add_argument_group(title="Lang related options")
        group.add_argument("--lang", type=Path, help="Path to lang directory.")
        group.add_argument(
            "--lang-type",
            type=str,
            default=None,
            help=(
                "Either 'bpe' or 'char'. If not provided, it expects lang_dir/lang_type to exists. "
                "Note: 'bpe' directly loads sentencepiece.SentencePieceProcessor"
            ),
        )
    @staticmethod
    def Load(lang_dir: Path, lang_type="", oov="<unk>"):
        if not lang_type:
            assert (lang_dir / "lang_type").exists(), "lang_type not specified."
            lang_type = (lang_dir / "lang_type").read_text().strip()
        tokenizer = None
        if lang_type == "bpe":
            assert (
                lang_dir / "bpe.model"
            ).exists(), f"No BPE .model could be found in {lang_dir}."
            tokenizer = spm.SentencePieceProcessor()
            tokenizer.Load(str(lang_dir / "bpe.model"))
        elif lang_type == "char":
            tokenizer = CharTokenizer(lang_dir, oov=oov)
        else:
            raise NotImplementedError(f"{lang_type} not supported at the moment.")
        return tokenizer
    load = Load
    def PieceToId(self, piece: str) -> int:
        raise NotImplementedError(
            "You need to implement this function in the child class."
        )
    piece_to_id = PieceToId
    def IdToPiece(self, id: int) -> str:
        raise NotImplementedError(
            "You need to implement this function in the child class."
        )
    id_to_piece = IdToPiece
    def GetPieceSize(self) -> int:
        raise NotImplementedError(
            "You need to implement this function in the child class."
        )
    get_piece_size = GetPieceSize
    def __len__(self) -> int:
        return self.get_piece_size()
    def EncodeAsIdsBatch(self, input: List[str]) -> List[List[int]]:
        raise NotImplementedError(
            "You need to implement this function in the child class."
        )
    def EncodeAsPiecesBatch(self, input: List[str]) -> List[List[str]]:
        raise NotImplementedError(
            "You need to implement this function in the child class."
        )
    def EncodeAsIds(self, input: str) -> List[int]:
        return self.EncodeAsIdsBatch([input])[0]
    def EncodeAsPieces(self, input: str) -> List[str]:
        return self.EncodeAsPiecesBatch([input])[0]
    def Encode(
        self, input: Union[str, List[str]], out_type=int
    ) -> Union[List, List[List]]:
        if not input:
            return []
        if isinstance(input, list):
            if out_type is int:
                return self.EncodeAsIdsBatch(input)
            if out_type is str:
                return self.EncodeAsPiecesBatch(input)
        if out_type is int:
            return self.EncodeAsIds(input)
        if out_type is str:
            return self.EncodeAsPieces(input)
    encode = Encode
    def DecodeIdsBatch(self, input: List[List[int]]) -> List[str]:
        raise NotImplementedError(
            "You need to implement this function in the child class."
        )
    def DecodePiecesBatch(self, input: List[List[str]]) -> List[str]:
        raise NotImplementedError(
            "You need to implement this function in the child class."
        )
    def DecodeIds(self, input: List[int]) -> str:
        return self.DecodeIdsBatch([input])[0]
    def DecodePieces(self, input: List[str]) -> str:
        return self.DecodePiecesBatch([input])[0]
    def Decode(
        self,
        input: Union[int, List[int], List[str], List[List[int]], List[List[str]]],
    ) -> Union[List[str], str]:
        if not input:
            return ""
        if isinstance(input, int):
            return self.id_to_piece(input)
        elif isinstance(input, str):
            raise TypeError(
                "Unlike spm.SentencePieceProcessor, cannot decode from type str."
            )
        if isinstance(input[0], list):
            if not input[0] or isinstance(input[0][0], int):
                return self.DecodeIdsBatch(input)
            if isinstance(input[0][0], str):
                return self.DecodePiecesBatch(input)
        if isinstance(input[0], int):
            return self.DecodeIds(input)
        if isinstance(input[0], str):
            return self.DecodePieces(input)
        raise RuntimeError("Unknown input type")
    decode = Decode
    def SplitBatch(self, input: List[str]) -> List[List[str]]:
        raise NotImplementedError(
            "You need to implement this function in the child class."
        )
    def Split(self, input: Union[List[str], str]) -> Union[List[List[str]], List[str]]:
        if isinstance(input, list):
            return self.SplitBatch(input)
        elif isinstance(input, str):
            return self.SplitBatch([input])[0]
        raise RuntimeError("Unknown input type")
    split = Split
 class CharTokenizer(Tokenizer):
    def __init__(self, lang_dir: Path, oov="<unk>", sep=""):
        assert (
            lang_dir / "tokens.txt"
        ).exists(), f"tokens.txt could not be found in {lang_dir}."
        token_table = SymbolTable.from_file(lang_dir / "tokens.txt")
        assert (
            "#0" not in token_table
        ), "This tokenizer does not support disambig symbols."
        self._id2sym = token_table._id2sym
        self._sym2id = token_table._sym2id
        self.oov = oov
        self.oov_id = self._sym2id[oov]
        self.sep = sep
        if self.sep:
            self.text2word = lambda x: x.split(self.sep)
        else:
            self.text2word = lambda x: list(x.replace(" ", ""))
    def piece_to_id(self, piece: str) -> int:
        try:
            return self._sym2id[piece]
        except KeyError:
            return self.oov_id
    def id_to_piece(self, id: int) -> str:
        return self._id2sym[id]
    def get_piece_size(self) -> int:
        return len(self._sym2id)
    def EncodeAsIdsBatch(self, input: List[str]) -> List[List[int]]:
        return [[self.piece_to_id(i) for i in self.text2word(text)] for text in input]
    def EncodeAsPiecesBatch(self, input: List[str]) -> List[List[str]]:
        return [
            [i if i in self._sym2id else self.oov for i in self.text2word(text)]
            for text in input
        ]
    def DecodeIdsBatch(self, input: List[List[int]]) -> List[str]:
        return [self.sep.join(self.id_to_piece(i) for i in text) for text in input]
    def DecodePiecesBatch(self, input: List[List[str]]) -> List[str]:
        return [self.sep.join(text) for text in input]
    def SplitBatch(self, input: List[str]) -> List[List[str]]:
        return [self.text2word(text) for text in input]
 def test_CharTokenizer():
    test_single_string = "こんにちは"
    test_multiple_string = [
        "今日はいい天気ですよね",
        "諏訪湖は綺麗でしょう",
        "这在词表外",
        "分かち 書き に し た 文章 です",
        "",
    ]
    test_empty_string = ""
    sp = Tokenizer.load(Path("lang_char"), "char", oov="<unk>")
    splitter = sp.split
    print(sp.encode(test_single_string, out_type=str))
    print(sp.encode(test_single_string, out_type=int))
    print(sp.encode(test_multiple_string, out_type=str))
    print(sp.encode(test_multiple_string, out_type=int))
    print(sp.encode(test_empty_string, out_type=str))
    print(sp.encode(test_empty_string, out_type=int))
    print(sp.decode(sp.encode(test_single_string, out_type=str)))
    print(sp.decode(sp.encode(test_single_string, out_type=int)))
    print(sp.decode(sp.encode(test_multiple_string, out_type=str)))
    print(sp.decode(sp.encode(test_multiple_string, out_type=int)))
    print(sp.decode(sp.encode(test_empty_string, out_type=str)))
    print(sp.decode(sp.encode(test_empty_string, out_type=int)))
    print(splitter(test_single_string))
    print(splitter(test_multiple_string))
    print(splitter(test_empty_string))
 if __name__ == "__main__":
    test_CharTokenizer()
--- a/egs/mls_english/ASR/zipformer/train.py
+++ b/egs/mls_english/ASR/zipformer/train.py
--- a/egs/mls_english/ASR/zipformer/zipformer.py
+++ b/egs/mls_english/ASR/zipformer/zipformer.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/zipformer/zipformer.py
		`@ -0,0 +1 @@`
							`../../../librispeech/ASR/zipformer/beam_search.py`
		`@ -0,0 +1 @@`
							`../../../librispeech/ASR/zipformer/ctc_decode.py`
		`@ -0,0 +1 @@`
							`../../../librispeech/ASR/zipformer/decode_stream.py`
		`@ -0,0 +1 @@`
							`../../../librispeech/ASR/zipformer/encoder_interface.py`
		`@ -0,0 +1 @@`
							`../../../librispeech/ASR/zipformer/export-onnx.py`
		`@ -0,0 +1 @@`
							`../../../librispeech/ASR/zipformer/generate_averaged_model.py`
		`@ -0,0 +1 @@`
							`../../../librispeech/ASR/zipformer/joiner.py`
		`@ -0,0 +1 @@`
							`../../../librispeech/ASR/zipformer/my_profile.py`
		`@ -0,0 +1 @@`
							`../../../librispeech/ASR/zipformer/onnx_pretrained.py`
		`@ -0,0 +1 @@`
							`../../../librispeech/ASR/zipformer/pretrained.py`