Merge 25f465c8c061a04b74b3a71a24215a691b863c39 into 34fc1fdf0d8ff520e2bb18267d046ca207c78ef9

2025-08-09 01:52:41 +00:00 · 2025-07-24 05:30:05 +02:00 · 2025-07-24 05:30:05 +02:00 · ca404076d6
commit ca404076d6
parent 34fc1fdf0d 25f465c8c0
19 changed files with 3548 additions and 0 deletions
--- a/egs/mls/ASR/local/compute_fbank_mls.py
+++ b/egs/mls/ASR/local/compute_fbank_mls.py
@ -0,0 +1,137 @@
 #!/usr/bin/env python3
 # Copyright    2024  Xiaomi Corp.        (authors: Xiaoyu Yang)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
 This file computes fbank features of the MLS (Multilingual LibriSpeech) dataset.
 It looks for manifests in the directory data/manifests.
 The generated fbank features are saved in data/fbank_mls by default
 (see --fbank-dir).
 """
 import argparse
 import logging
 import os
 from pathlib import Path
 from typing import Optional
 import sentencepiece as spm
 import torch
 from filter_cuts import filter_cuts
 from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
 from lhotse.recipes.utils import read_manifests_if_cached
 from icefall.utils import get_executor, str2bool
 # Torch's multithreaded behavior needs to be disabled, otherwise
 # it wastes a lot of CPU and slows things down.
 # Do this outside of main() in case it needs to take effect
 # even when we are not invoking the main (e.g. when spawning subprocesses).
 torch.set_num_threads(1)
 torch.set_num_interop_threads(1)
 def get_args():
    """Parse the command-line options of this script.

    Returns:
      An ``argparse.Namespace`` with the attributes ``manifest_dir``,
      ``fbank_dir`` and ``part``.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        "--manifest-dir",
        type=str,
        default="data/manifests",
        help="Directory containing the MLS recording/supervision manifests.",
    )
    parser.add_argument(
        "--fbank-dir",
        type=str,
        default="data/fbank_mls",
        help="Directory where the features and cut manifests are written.",
    )
    parser.add_argument(
        "--part",
        type=str,
        # Without a default, running the script with no options yields
        # part=None, which ends up in manifest file names ("mls-None_...").
        default="all",
        help="Which language to prepare. If 'all', prepare every language "
        "enabled in compute_fbank_mls().",
        choices=[
            "english",
            "dutch",
            "german",
            "spanish",
            "french",
            "italian",
            "polish",
            "portuguese",
            "all",
        ],
    )
    return parser.parse_args()
 def compute_fbank_mls(
    manifest_dir: str = "data/manifests",
    fbank_dir: str = "data/fbank_mls",
    part: str = "all",
 ):
    """Compute 80-bin fbank features for the train/test/dev splits of MLS.

    For every selected language and split, the recording and supervision
    manifests ``mls-<language>_{recordings,supervisions}_<split>.jsonl.gz``
    are read from ``manifest_dir``, features are extracted and stored under
    ``fbank_dir``, and the resulting cuts are saved to
    ``<fbank_dir>/mls-<language>_<split>.jsonl.gz``.

    Args:
      manifest_dir:
        Directory containing the input manifests.
      fbank_dir:
        Output directory for the features and cut manifests. It is created
        if it does not exist.
      part:
        A single language name, or "all" to process the built-in list below.
    """
    src_dir = Path(manifest_dir)
    output_dir = Path(fbank_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # os.cpu_count() may return None when the count cannot be determined.
    num_jobs = min(15, os.cpu_count() or 1)
    num_mel_bins = 80

    if part == "all":
        # NOTE(review): "all" currently covers only these four languages,
        # although --part also accepts french, italian, polish and
        # portuguese -- confirm whether the remaining ones should be added.
        dataset_parts = [
            "english",
            "dutch",
            "german",
            "spanish",
        ]
    else:
        dataset_parts = [part]
    splits = ["train", "test", "dev"]

    extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins))

    for language in dataset_parts:
        prefix = f"mls-{language}"
        for split in splits:
            recording_file = src_dir / f"{prefix}_recordings_{split}.jsonl.gz"
            supervision_file = src_dir / f"{prefix}_supervisions_{split}.jsonl.gz"

            recordings = CutSet.from_file(recording_file)
            supervisions = CutSet.from_file(supervision_file)
            cut_set = CutSet.from_manifests(
                recordings=recordings,
                supervisions=supervisions,
            )

            with get_executor() as ex:
                cut_set = cut_set.compute_and_store_features(
                    extractor=extractor,
                    storage_path=f"{output_dir}/{prefix}_feats_{split}",
                    # when an executor is specified, make more partitions
                    num_jobs=num_jobs if ex is None else 80,
                    executor=ex,
                    storage_type=LilcomChunkyWriter,
                )

            cuts_filename = output_dir / f"{prefix}_{split}.jsonl.gz"
            logging.info(f"Saving to {cuts_filename}")
            cut_set.to_file(cuts_filename)
 if __name__ == "__main__":
    # Log at INFO level with a timestamp and the source location.
    logging.basicConfig(
        format="%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s",
        level=logging.INFO,
    )

    cli_args = get_args()
    logging.info(vars(cli_args))

    compute_fbank_mls(
        manifest_dir=cli_args.manifest_dir,
        fbank_dir=cli_args.fbank_dir,
        part=cli_args.part,
    )
--- a/egs/mls/ASR/local/compute_fbank_mls_splits.py
+++ b/egs/mls/ASR/local/compute_fbank_mls_splits.py
@ -0,0 +1,172 @@
 #!/usr/bin/env python3
 # Copyright    2021  Johns Hopkins University (Piotr Żelasko)
 # Copyright    2021  Xiaomi Corp.             (Fangjun Kuang)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import argparse
 import logging
 from datetime import datetime
 from pathlib import Path
 import torch
 from lhotse import CutSet, KaldifeatFbank, KaldifeatFbankConfig
 # Torch's multithreaded behavior needs to be disabled, otherwise
 # it wastes a lot of CPU and slows things down.
 # Do this outside of main() in case it needs to take effect
 # even when we are not invoking the main (e.g. when spawning subprocesses).
 torch.set_num_threads(1)
 torch.set_num_interop_threads(1)
 def get_parser():
    """Build the argument parser of the split-wise fbank computation script.

    Returns:
      An ``argparse.ArgumentParser``; ``--num-splits`` is the only required
      option.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        "--num-workers",
        type=int,
        default=20,
        help="Number of dataloading workers used for reading the audio.",
    )
    parser.add_argument(
        "--batch-duration",
        type=float,
        default=600.0,
        # The trailing space keeps the two concatenated sentences apart.
        help="The maximum number of audio seconds in a batch. "
        "Determines batch size dynamically.",
    )
    parser.add_argument(
        "--language",
        type=str,
        default="english",
        help="MLS language whose training splits are processed.",
    )
    parser.add_argument(
        "--num-splits",
        type=int,
        required=True,
        help="The number of splits of the selected language's train subset",
    )
    parser.add_argument(
        "--start",
        type=int,
        default=0,
        help="Process pieces starting from this number (inclusive).",
    )
    parser.add_argument(
        "--stop",
        type=int,
        default=-1,
        help="Stop processing pieces until this number (exclusive).",
    )
    parser.add_argument(
        "--fbank-dir",
        type=str,
        default="data/fbank_mls",
        help="Directory that contains the <language>_split sub-directory.",
    )
    return parser
 def compute_fbank_mls_splits(args):
    """Extract fbank features for the raw training splits of one language.

    Splits ``args.start`` (inclusive) to ``args.stop`` (exclusive) found in
    ``<args.fbank_dir>/<args.language>_split`` are processed one by one. A
    split whose output manifest already exists is skipped.

    Args:
      args:
        Parsed options providing ``num_splits``, ``fbank_dir``, ``language``,
        ``start``, ``stop``, ``num_workers`` and ``batch_duration``.
    """
    num_splits = args.num_splits
    output_dir = Path(f"{args.fbank_dir}/{args.language}_split")
    assert output_dir.exists(), f"{output_dir} does not exist!"

    num_digits = 8  # num_digits is fixed by lhotse split-lazy

    # A stop value smaller than start (e.g. the default -1) means "until
    # the last split"; the range is also clamped to the number of splits.
    first = args.start
    last = args.stop
    if last < first:
        last = num_splits
    last = min(last, num_splits)

    # Use the first GPU when one is available, the CPU otherwise.
    if torch.cuda.is_available():
        device = torch.device("cuda", 0)
    else:
        device = torch.device("cpu")
    extractor = KaldifeatFbank(KaldifeatFbankConfig(device=device))
    logging.info(f"device: {device}")

    language = args.language
    for split_no in range(first, last):
        idx = str(split_no).zfill(num_digits)
        logging.info(f"Processing {idx}/{num_splits}")

        cuts_path = output_dir / f"mls-{language}_train.{idx}.jsonl.gz"
        if cuts_path.is_file():
            logging.info(f"{cuts_path} exists - skipping")
            continue

        raw_cuts_path = output_dir / f"mls-{language}_train_raw.{idx}.jsonl.gz"
        logging.info(f"Loading {raw_cuts_path}")
        raw_cuts = CutSet.from_file(raw_cuts_path)

        logging.info("Computing features")
        cuts_with_feats = raw_cuts.compute_and_store_features_batch(
            extractor=extractor,
            storage_path=f"{output_dir}/feats_{language}_{idx}",
            num_workers=args.num_workers,
            batch_duration=args.batch_duration,
            overwrite=True,
        )

        logging.info("About to split cuts into smaller chunks.")
        trimmed_cuts = cuts_with_feats.trim_to_supervisions(
            keep_overlapping=False, min_duration=None
        )

        logging.info(f"Saving to {cuts_path}")
        trimmed_cuts.to_file(cuts_path)
        logging.info(f"Saved to {cuts_path}")
 def main():
    """Set up file and console logging, parse the options and run the job."""
    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"

    # The log file name carries a timestamp; filemode "w" truncates it.
    timestamp = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    logging.basicConfig(
        filename=f"log-compute_fbank_mls_splits-{timestamp}",
        format=formatter,
        level=logging.INFO,
        filemode="w",
    )

    # Mirror the log records on the console with the same format.
    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    console.setFormatter(logging.Formatter(formatter))
    logging.getLogger("").addHandler(console)

    args = get_parser().parse_args()
    logging.info(vars(args))
    compute_fbank_mls_splits(args)


 if __name__ == "__main__":
    main()
--- a/egs/mls/ASR/local/train_bpe_model.py
+++ b/egs/mls/ASR/local/train_bpe_model.py
@ -0,0 +1,114 @@
 #!/usr/bin/env python3
 # Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang)
 # Copyright    2024  Xiaomi Corp.        (authors: Xiaoyu Yang)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # You can install sentencepiece via:
 #
 #  pip install sentencepiece
 #
 # Due to an issue reported in
 # https://github.com/google/sentencepiece/pull/642#issuecomment-857972030
 #
 # Please install a version >=0.1.96
 import argparse
 import shutil
 from pathlib import Path
 import sentencepiece as spm
 def get_args():
    """Parse the command-line options for BPE model training.

    ``--lang-dir`` and ``--vocab-size`` are required because the output
    paths are derived from them; omitting them is now reported by argparse
    instead of failing later on a ``None`` value.

    Returns:
      An ``argparse.Namespace`` with ``lang_dir``, ``byte_fallback``,
      ``character_coverage``, ``transcript`` and ``vocab_size``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--lang-dir",
        type=str,
        required=True,
        help="""Input and output directory.
        The generated bpe.model is saved to this directory.
        """,
    )
    parser.add_argument(
        "--byte-fallback",
        action="store_true",
        help="""Whether to enable byte_fallback when training bpe.""",
    )
    parser.add_argument(
        "--character-coverage",
        type=float,
        default=1.0,
        help="Character coverage in vocabulary.",
    )
    # --transcript stays optional: it is not read when the model file
    # already exists and training is skipped.
    parser.add_argument(
        "--transcript",
        type=str,
        help="Training transcript.",
    )
    parser.add_argument(
        "--vocab-size",
        type=int,
        required=True,
        help="Vocabulary size for BPE training",
    )
    return parser.parse_args()
 def main():
    """Train a BPE model with sentencepiece unless it already exists.

    The model is written to ``<lang_dir>/bpe_<vocab_size>.model`` and, after
    a successful training run, copied to ``<lang_dir>/bpe.model``. When the
    model file is already present nothing is trained or copied.
    """
    args = get_args()
    lang_dir = Path(args.lang_dir)
    vocab_size = args.vocab_size

    model_type = "bpe"
    model_prefix = f"{lang_dir}/{model_type}_{vocab_size}"
    model_file = Path(model_prefix + ".model")

    if model_file.is_file():
        print(f"{model_file} exists - skipping")
        return

    user_defined_symbols = ["<blk>", "<sos/eos>"]
    # Note: unk_id is fixed to 2 (it follows the two user-defined symbols).
    # If you change it, you should also change other
    # places that are using it.
    unk_id = len(user_defined_symbols)

    spm.SentencePieceTrainer.train(
        input=args.transcript,
        vocab_size=vocab_size,
        model_type=model_type,
        model_prefix=model_prefix,
        input_sentence_size=100000000,
        character_coverage=args.character_coverage,
        user_defined_symbols=user_defined_symbols,
        byte_fallback=args.byte_fallback,
        unk_id=unk_id,
        bos_id=-1,
        eos_id=-1,
    )

    shutil.copyfile(model_file, f"{lang_dir}/bpe.model")


 if __name__ == "__main__":
    main()
--- a/egs/mls/ASR/local/validate_manifest.py
+++ b/egs/mls/ASR/local/validate_manifest.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/local/validate_manifest.py
--- a/egs/mls/ASR/prepare.sh
+++ b/egs/mls/ASR/prepare.sh
@ -0,0 +1,227 @@
 #!/usr/bin/env bash
 # fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
 export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
 set -eou pipefail
 # Number of parallel jobs used for manifest preparation (stage 1).
 nj=15
 # Run stage 0 to stage 5 by default. Stages 6-8 (manifest validation,
 # musan fbank, BPE lang) only run when --stop-stage is raised accordingly.
 stage=0
 stop_stage=5
 # Note: This script just prepares the minimal requirements needed by a
 # transducer training with bpe units.
 #
 # If you want to use ngram or nnlm, please continue running prepare_lm.sh after
 # you succeed running this script.
 #
 # The stages after stop_stage are not run automatically; you can run a
 # single one of them explicitly, e.g. the BPE stage:
 # bash prepare.sh --stage 8 --stop-stage 8
 # We assume dl_dir (download dir) contains the following
 # directories and files. If not, they will be downloaded
 # by this script automatically.
 #
 #  - $dl_dir/MLS
 #      It is expected to contain one directory per language, e.g.
 #      mls_english, mls_german, ..., each with train/transcripts.txt
 #      (these are the paths read by the BPE stage below).
 #
 #  - $dl_dir/musan
 #      This directory contains the following directories downloaded from
 #       http://www.openslr.org/17/
 #
 #     - music
 #     - noise
 #     - speech
 # Number of cuts per piece when the English training manifest is split (stage 3).
 num_per_split=4000
 fbank_dir=data/fbank_mls
 dl_dir=$PWD/download
 # The variables above can be overridden on the command line.
 . shared/parse_options.sh || exit 1
 # vocab size for sentence piece models.
 # It will generate data/lang_bpe_xxx,
 # data/lang_bpe_yyy if the array contains xxx, yyy
 vocab_sizes=(
  # 5000
  2000
  1000
  # 500
 )
 # All files generated by this script are saved in "data".
 # You can safely remove "data" and rerun this script to regenerate it.
 mkdir -p data
 log() {
  # This function is from espnet.
  # Prints "<timestamp> (<caller file>:<caller line>:<caller function>) <message>".
  local caller_file=${BASH_SOURCE[1]##*/}
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  echo -e "${stamp} (${caller_file}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
 }
 log "Running prepare.sh"
 log "dl_dir: $dl_dir"
 log "fbank_dir: $fbank_dir"
 # MLS languages handled by this recipe (iterated over in stage 6).
 languages=(
  english 
  german
  dutch
  spanish
  italian
  french
  polish
  portuguese
 )
 if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
  log "Stage 0: Download data"
  # If you have pre-downloaded it to /path/to/MLS,
  # you can create a symlink
  #
  #   ln -sfv /path/to/MLS $dl_dir/MLS
  #
  # NOTE(review): the sentinel directory "train-other-500" looks like a
  # leftover from the LibriSpeech recipe. Later stages read
  # $dl_dir/MLS/mls_<language>/..., so this check presumably never succeeds
  # for MLS and the download command is invoked on every run -- confirm,
  # and also confirm that the download target ($dl_dir) matches $dl_dir/MLS.
  if [ ! -d $dl_dir/MLS/train-other-500 ]; then
    lhotse download mls --full $dl_dir
  fi
  # If you have pre-downloaded it to /path/to/musan,
  # you can create a symlink
  #
  #   ln -sfv /path/to/musan $dl_dir/
  #
  if [ ! -d $dl_dir/musan ]; then
    lhotse download musan $dl_dir
  fi
 fi
 if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
  log "Stage 1: Prepare MLS manifest"
  # We assume that you have downloaded the MLS corpus
  # to $dl_dir/MLS
  mkdir -p data/manifests
  # The .mls.done marker makes this stage a no-op on re-runs.
  if [ ! -e data/manifests/.mls.done ]; then
    lhotse prepare mls -j $nj $dl_dir/MLS data/manifests
    touch data/manifests/.mls.done
  fi
 fi
 if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
  log "Stage 2: Prepare musan manifest"
  # We assume that you have downloaded the musan corpus
  # to $dl_dir/musan
  mkdir -p data/manifests
  # The .musan.done marker makes this stage a no-op on re-runs.
  if [ ! -e data/manifests/.musan.done ]; then
    lhotse prepare musan $dl_dir/musan data/manifests
    touch data/manifests/.musan.done
  fi
 fi
 if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
  log "Stage 3: Split english subset into pieces (may take 30 minutes)"
  # Pieces of $num_per_split cuts each are written to $split_dir.
  # NOTE(review): no earlier stage of this script creates
  # ${fbank_dir}/mls-english_train_raw.jsonl.gz (or ${fbank_dir} itself,
  # which is first created in stage 4) -- confirm where this raw cut
  # manifest is expected to come from.
  split_dir=${fbank_dir}/english_split
  if [ ! -f $split_dir/.split_completed ]; then
    lhotse split-lazy ${fbank_dir}/mls-english_train_raw.jsonl.gz $split_dir $num_per_split
    touch $split_dir/.split_completed
  fi
 fi
 if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
  log "Stage 4: Compute fbank for MLS (except English)"
  mkdir -p ${fbank_dir}
  if [ ! -e ${fbank_dir}/.mls.done ]; then
    # compute_fbank_mls.py needs an explicit --part (it has no usable
    # default), and the output directory must follow $fbank_dir.
    for lan in ${languages[@]}; do
      # English is handled separately by the split-based stages 3 and 5.
      # NOTE(review): stage 6 also expects mls-english_{dev,test}.jsonl.gz,
      # which neither this stage nor stage 5 produces -- confirm.
      if [ "$lan" = "english" ]; then
        continue
      fi
      ./local/compute_fbank_mls.py \
        --manifest-dir data/manifests \
        --fbank-dir ${fbank_dir} \
        --part $lan
    done
    touch ${fbank_dir}/.mls.done
  fi
 fi
 if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
  log "Stage 5: Compute fbank for English split of MLS"
  if [ ! -e ${fbank_dir}/.mls-english.done ]; then
    # The number of splits is the number of raw pieces written by stage 3.
    num_splits=$(find ${fbank_dir}/english_split -name "mls-english_train_raw.*.jsonl.gz" | wc -l)
    # No line continuation after the last option: otherwise the following
    # "touch ..." would be passed to the Python script as extra arguments.
    ./local/compute_fbank_mls_splits.py \
      --fbank-dir $fbank_dir \
      --num-workers 20 \
      --language english \
      --num-splits $num_splits
    touch ${fbank_dir}/.mls-english.done
  fi
  # Merge the per-piece cut manifests into a single training manifest.
  if [ ! -e ${fbank_dir}/mls-english_train.jsonl.gz ]; then
    pieces=$(find ${fbank_dir}/english_split -name "mls-english_train.*.jsonl.gz")
    lhotse combine $pieces ${fbank_dir}/mls-english_train.jsonl.gz
  fi
 fi
 if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
  log "Stage 6: Validate the manifest of MLS"
  if [ ! -e ${fbank_dir}/.mls-validated.done ]; then
    log "Validating the fbank features for MLS"
    # For every language, the cut manifests
    # ${fbank_dir}/mls-<language>_<part>.jsonl.gz of these parts are checked.
    parts=(
      train
      dev
      test
    )
    for lan in ${languages[@]}; do
      for part in ${parts[@]}; do
        python3 ./local/validate_manifest.py \
          ${fbank_dir}/mls-${lan}_${part}.jsonl.gz
      done
    done
    touch ${fbank_dir}/.mls-validated.done
  fi
 fi
 if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
  log "Stage 7: Compute fbank for musan"
  mkdir -p ${fbank_dir}
  # NOTE(review): compute_fbank_musan.py is called without arguments, so it
  # uses its own default output directory, while the completion marker is
  # kept in ${fbank_dir} -- confirm both refer to the same location.
  if [ ! -e ${fbank_dir}/.musan.done ]; then
    ./local/compute_fbank_musan.py
    touch ${fbank_dir}/.musan.done
  fi
 fi
 if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then
  log "Stage 8: Prepare BPE based lang"
  # One lang directory (data/lang_bpe_<vocab_size>) per configured vocab size.
  for vocab_size in ${vocab_sizes[@]}; do
    lang_dir=data/lang_bpe_${vocab_size}
    mkdir -p $lang_dir
    if [ ! -f $lang_dir/transcript_words.txt ]; then
      log "Generate data for BPE training"
      files=(
        "$dl_dir/MLS/mls_english/train/transcripts.txt"
        "$dl_dir/MLS/mls_german/train/transcripts.txt"
        "$dl_dir/MLS/mls_dutch/train/transcripts.txt"
        "$dl_dir/MLS/mls_french/train/transcripts.txt"
        "$dl_dir/MLS/mls_spanish/train/transcripts.txt"
        "$dl_dir/MLS/mls_italian/train/transcripts.txt"
        "$dl_dir/MLS/mls_portuguese/train/transcripts.txt"
        "$dl_dir/MLS/mls_polish/train/transcripts.txt"
      )
      # At most the first 1000000 lines of each language are used, and the
      # first space-separated field of every line is dropped.
      # NOTE(review): this assumes the utterance id and the text are
      # separated by a space; if transcripts.txt is tab-separated, the first
      # word of each utterance is dropped together with the id -- confirm.
      for f in ${files[@]}; do
        head -n 1000000 $f | cut -d " " -f 2-
      done > $lang_dir/transcript_words.txt
    fi
    if [ ! -f $lang_dir/bpe.model ]; then
      ./local/train_bpe_model.py \
        --lang-dir $lang_dir \
        --vocab-size $vocab_size \
        --character-coverage 0.999 \
        --transcript $lang_dir/transcript_words.txt \
        --byte-fallback 
    fi
  done
 fi
--- a/egs/mls/ASR/shared
+++ b/egs/mls/ASR/shared
@ -0,0 +1 @@
 ../../../icefall/shared
--- a/egs/mls/ASR/zipformer/asr_datamodule.py
+++ b/egs/mls/ASR/zipformer/asr_datamodule.py
@ -0,0 +1,441 @@
 # Copyright      2021  Piotr Żelasko
 # Copyright      2024  Xiaomi Corporation     (Author: Xiaoyu Yang)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import argparse
 import inspect
 import logging
 from functools import lru_cache
 from pathlib import Path
 from typing import Any, Dict, Optional
 import torch
 from lhotse import CutSet, Fbank, FbankConfig, load_manifest, load_manifest_lazy
 from lhotse.dataset import (  # noqa F401 for PrecomputedFeatures
    CutConcatenate,
    CutMix,
    DynamicBucketingSampler,
    K2SpeechRecognitionDataset,
    PrecomputedFeatures,
    SimpleCutSampler,
    SpecAugment,
 )
 from lhotse.dataset.input_strategies import (  # noqa F401 For AudioSamples
    AudioSamples,
    OnTheFlyFeatures,
 )
 from lhotse.utils import fix_random_seed
 from torch.utils.data import DataLoader
 from icefall.utils import str2bool
 class _SeedWorkers:
    def __init__(self, seed: int):
        self.seed = seed
    def __call__(self, worker_id: int):
        fix_random_seed(self.seed + worker_id)
 class MLSAsrDataModule:
    """
    DataModule for k2 ASR experiments.
    It assumes there is always one train and valid dataloader,
    but there can be multiple test dataloaders (e.g. one per MLS
    language).
    It contains all the common data pipeline modules used in ASR
    experiments, e.g.:
    - dynamic batch size,
    - bucketing samplers,
    - cut concatenation,
    - augmentation,
    - on-the-fly feature extraction
    This class should be derived for specific corpora used in ASR tasks.
    """
    def __init__(self, args: argparse.Namespace):
        """Store the parsed options (see ``add_arguments``) for later use."""
        self.args = args
    @classmethod
    def add_arguments(cls, parser: argparse.ArgumentParser):
        """Register the data-related command-line options on ``parser``.

        All options are added to an argument group titled
        "ASR data related options".

        Args:
          parser:
            The parser to extend in place.
        """
        group = parser.add_argument_group(
            title="ASR data related options",
            description="These options are used for the preparation of "
            "PyTorch DataLoaders from Lhotse CutSet's -- they control the "
            "effective batch sizes, sampling strategies, applied data "
            "augmentations, etc.",
        )
        # No `choices` here: the value may be a comma-separated list of
        # languages, which argparse cannot validate with `choices`.
        group.add_argument(
            "--language",
            type=str,
            default="all",
            help="""A list of languages separated by comma. If all, use all
            the languages""",
        )
        group.add_argument(
            "--manifest-dir",
            type=Path,
            default=Path("data/fbank"),
            help="Path to directory with train/valid/test cuts.",
        )
        # The default is a float, so parse user-supplied values as float too
        # (with type=int a value such as "300.5" was rejected).
        group.add_argument(
            "--max-duration",
            type=float,
            default=200.0,
            help="Maximum pooled recordings duration (seconds) in a "
            "single batch. You can reduce it if it causes CUDA OOM.",
        )
        group.add_argument(
            "--bucketing-sampler",
            type=str2bool,
            default=True,
            help="When enabled, the batches will come from buckets of "
            "similar duration (saves padding frames).",
        )
        group.add_argument(
            "--num-buckets",
            type=int,
            default=30,
            help="The number of buckets for the DynamicBucketingSampler "
            "(you might want to increase it for larger datasets).",
        )
        group.add_argument(
            "--concatenate-cuts",
            type=str2bool,
            default=False,
            help="When enabled, utterances (cuts) will be concatenated "
            "to minimize the amount of padding.",
        )
        group.add_argument(
            "--duration-factor",
            type=float,
            default=1.0,
            help="Determines the maximum duration of a concatenated cut "
            "relative to the duration of the longest cut in a batch.",
        )
        group.add_argument(
            "--gap",
            type=float,
            default=1.0,
            help="The amount of padding (in seconds) inserted between "
            "concatenated cuts. This padding is filled with noise when "
            "noise augmentation is used.",
        )
        group.add_argument(
            "--on-the-fly-feats",
            type=str2bool,
            default=False,
            help="When enabled, use on-the-fly cut mixing and feature "
            "extraction. Will drop existing precomputed feature manifests "
            "if available.",
        )
        group.add_argument(
            "--shuffle",
            type=str2bool,
            default=True,
            help="When enabled (=default), the examples will be "
            "shuffled for each epoch.",
        )
        group.add_argument(
            "--drop-last",
            type=str2bool,
            default=True,
            help="Whether to drop last batch. Used by sampler.",
        )
        group.add_argument(
            "--return-cuts",
            type=str2bool,
            default=True,
            help="When enabled, each batch will have the "
            "field: batch['supervisions']['cut'] with the cuts that "
            "were used to construct it.",
        )
        group.add_argument(
            "--num-workers",
            type=int,
            default=2,
            help="The number of training dataloader workers that "
            "collect the batches.",
        )
        group.add_argument(
            "--enable-spec-aug",
            type=str2bool,
            default=True,
            help="When enabled, use SpecAugment for training dataset.",
        )
        group.add_argument(
            "--spec-aug-time-warp-factor",
            type=int,
            default=80,
            help="Used only when --enable-spec-aug is True. "
            "It specifies the factor for time warping in SpecAugment. "
            "Larger values mean more warping. "
            "A value less than 1 means to disable time warp.",
        )
        group.add_argument(
            "--enable-musan",
            type=str2bool,
            default=True,
            help="When enabled, select noise from MUSAN and mix it "
            "with training dataset.",
        )
        # NOTE: the value of this option is passed to eval() when the
        # datasets are built, so it must be one of the names listed below.
        group.add_argument(
            "--input-strategy",
            type=str,
            default="PrecomputedFeatures",
            help="AudioSamples or PrecomputedFeatures",
        )
    def train_dataloaders(
        self,
        cuts_train: CutSet,
        sampler_state_dict: Optional[Dict[str, Any]] = None,
    ) -> DataLoader:
        """Build the training dataloader.

        Args:
          cuts_train:
            CutSet for training.
          sampler_state_dict:
            The state dict for the training sampler.

        Returns:
          A DataLoader over a K2SpeechRecognitionDataset, driven by either a
          DynamicBucketingSampler or a SimpleCutSampler.
        """
        opts = self.args

        # ---- cut-level transforms (noise mixing, concatenation) ----
        cut_transforms = []
        if opts.enable_musan:
            logging.info("Enable MUSAN")
            logging.info("About to get Musan cuts")
            cuts_musan = load_manifest(opts.manifest_dir / "musan_cuts.jsonl.gz")
            cut_transforms.append(
                CutMix(cuts=cuts_musan, p=0.5, snr=(10, 20), preserve_id=True)
            )
        else:
            logging.info("Disable MUSAN")

        if opts.concatenate_cuts:
            logging.info(
                f"Using cut concatenation with duration factor "
                f"{self.args.duration_factor} and gap {self.args.gap}."
            )
            # Cut concatenation should be the first transform in the list,
            # so that if we e.g. mix noise in, it will fill the gaps between
            # different utterances.
            cut_transforms.insert(
                0,
                CutConcatenate(duration_factor=opts.duration_factor, gap=opts.gap),
            )

        # ---- feature-level transforms (SpecAugment) ----
        feature_transforms = []
        if opts.enable_spec_aug:
            logging.info("Enable SpecAugment")
            logging.info(f"Time warp factor: {self.args.spec_aug_time_warp_factor}")
            # Set the value of num_frame_masks according to Lhotse's version.
            # In different Lhotse's versions, the default of num_frame_masks is
            # different.
            lhotse_default = (
                inspect.signature(SpecAugment.__init__)
                .parameters["num_frame_masks"]
                .default
            )
            num_frame_masks = 2 if lhotse_default == 1 else 10
            logging.info(f"Num frame mask: {num_frame_masks}")
            feature_transforms.append(
                SpecAugment(
                    time_warp_factor=opts.spec_aug_time_warp_factor,
                    num_frame_masks=num_frame_masks,
                    features_mask_size=27,
                    num_feature_masks=2,
                    frames_mask_size=100,
                )
            )
        else:
            logging.info("Disable SpecAugment")

        logging.info("About to create train dataset")
        # NOTE: --input-strategy is evaluated as Python code here; only pass
        # trusted values (AudioSamples or PrecomputedFeatures).
        input_strategy = eval(opts.input_strategy)()
        train_dataset = K2SpeechRecognitionDataset(
            input_strategy=input_strategy,
            cut_transforms=cut_transforms,
            input_transforms=feature_transforms,
            return_cuts=opts.return_cuts,
        )

        if opts.on_the_fly_feats:
            # NOTE: the PerturbSpeed transform should be added only if we
            # remove it from data prep stage.
            # Add on-the-fly speed perturbation; since originally it would
            # have increased epoch size by 3, we will apply prob 2/3 and use
            # 3x more epochs.
            # Speed perturbation probably should come first before
            # concatenation, but in principle the transforms order doesn't have
            # to be strict (e.g. could be randomized)
            # transforms = [PerturbSpeed(factors=[0.9, 1.1], p=2/3)] + transforms   # noqa
            # Drop feats to be on the safe side.
            train_dataset = K2SpeechRecognitionDataset(
                cut_transforms=cut_transforms,
                input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80))),
                input_transforms=feature_transforms,
                return_cuts=opts.return_cuts,
            )

        # ---- sampler ----
        if opts.bucketing_sampler:
            logging.info("Using DynamicBucketingSampler.")
            sampler = DynamicBucketingSampler(
                cuts_train,
                max_duration=opts.max_duration,
                shuffle=opts.shuffle,
                num_buckets=opts.num_buckets,
                buffer_size=opts.num_buckets * 2000,
                shuffle_buffer_size=opts.num_buckets * 5000,
                drop_last=opts.drop_last,
            )
        else:
            logging.info("Using SimpleCutSampler.")
            sampler = SimpleCutSampler(
                cuts_train,
                max_duration=opts.max_duration,
                shuffle=opts.shuffle,
            )
        logging.info("About to create train dataloader")

        if sampler_state_dict is not None:
            logging.info("Loading sampler state dict")
            sampler.load_state_dict(sampler_state_dict)

        # 'seed' is derived from the current random state, which will have
        # previously been set in the main process.
        seed = torch.randint(0, 100000, ()).item()

        return DataLoader(
            train_dataset,
            sampler=sampler,
            batch_size=None,
            num_workers=opts.num_workers,
            persistent_workers=False,
            worker_init_fn=_SeedWorkers(seed),
        )
    def valid_dataloaders(self, cuts_valid: CutSet) -> DataLoader:
        """Build the validation dataloader.

        Args:
          cuts_valid:
            CutSet used for validation.

        Returns:
          A DataLoader with two workers that iterates ``cuts_valid`` through
          a non-shuffling DynamicBucketingSampler.
        """
        opts = self.args

        cut_transforms = []
        if opts.concatenate_cuts:
            cut_transforms.append(
                CutConcatenate(duration_factor=opts.duration_factor, gap=opts.gap)
            )

        logging.info("About to create dev dataset")
        dataset_kwargs = {
            "cut_transforms": cut_transforms,
            "return_cuts": opts.return_cuts,
        }
        if opts.on_the_fly_feats:
            # Compute 80-bin fbank on the fly instead of using the
            # dataset's default input strategy.
            dataset_kwargs["input_strategy"] = OnTheFlyFeatures(
                Fbank(FbankConfig(num_mel_bins=80))
            )
        dev_dataset = K2SpeechRecognitionDataset(**dataset_kwargs)

        dev_sampler = DynamicBucketingSampler(
            cuts_valid,
            max_duration=opts.max_duration,
            shuffle=False,
        )
        logging.info("About to create dev dataloader")

        return DataLoader(
            dev_dataset,
            sampler=dev_sampler,
            batch_size=None,
            num_workers=2,
            persistent_workers=False,
        )
    def test_dataloaders(self, cuts: CutSet) -> DataLoader:
        """Build a dataloader for one test set.

        Args:
          cuts:
            CutSet to decode.

        Returns:
          A DataLoader that iterates ``cuts`` through a non-shuffling
          DynamicBucketingSampler.
        """
        opts = self.args
        logging.debug("About to create test dataset")

        if opts.on_the_fly_feats:
            input_strategy = OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80)))
        else:
            # NOTE: --input-strategy is evaluated as Python code here; only
            # pass trusted values (AudioSamples or PrecomputedFeatures).
            input_strategy = eval(opts.input_strategy)()

        test_dataset = K2SpeechRecognitionDataset(
            input_strategy=input_strategy,
            return_cuts=opts.return_cuts,
        )
        test_sampler = DynamicBucketingSampler(
            cuts,
            max_duration=opts.max_duration,
            shuffle=False,
        )

        logging.debug("About to create test dataloader")
        return DataLoader(
            test_dataset,
            batch_size=None,
            sampler=test_sampler,
            num_workers=opts.num_workers,
        )
    @lru_cache()
    def train_mls_cuts(self) -> CutSet:
        """Return the training cuts of the languages selected by --language.

        ``--language`` is either "all" or a comma-separated list of language
        names. For a single language its training manifest is returned
        directly; for several languages the manifests are multiplexed with
        ``CutSet.mux``, weighted by the number of cuts of each language.

        Raises:
          ValueError: if --language contains no language name.
        """
        if self.args.language == "all":
            languages = [
                "english",
                "german",
                "dutch",
                "french",
                "spanish",
                "italian",
                "portuguese",
                "polish",
            ]
        else:
            # Tolerate spaces and stray commas, e.g. "english, german".
            languages = [
                name.strip()
                for name in self.args.language.split(",")
                if name.strip()
            ]

        if not languages:
            raise ValueError(
                f"No language given in --language: {self.args.language!r}"
            )

        if len(languages) == 1:
            language = languages[0]
            logging.info(f"About to get {language} cuts")
            return load_manifest_lazy(
                self.args.manifest_dir / f"mls-{language}_train.jsonl.gz"
            )

        all_cuts = []
        all_cuts_len = []
        for language in languages:
            logging.info(f"About to get {language} cuts")
            current_cuts = load_manifest_lazy(
                self.args.manifest_dir / f"mls-{language}_train.jsonl.gz"
            )
            # NOTE(review): len() on a lazily loaded manifest presumably
            # has to scan the whole file -- confirm the cost is acceptable.
            all_cuts.append(current_cuts)
            all_cuts_len.append(len(current_cuts))

        return CutSet.mux(
            *all_cuts,
            weights=all_cuts_len,
            stop_early=True,
        )
    @lru_cache()
    def mls_dev_cuts(self, language: str) -> CutSet:
        """Lazily load the dev cuts (``mls-<language>_dev.jsonl.gz``)."""
        logging.info(f"About to get dev cuts for {language}")
        manifest_path = self.args.manifest_dir / f"mls-{language}_dev.jsonl.gz"
        return load_manifest_lazy(manifest_path)
    @lru_cache()
    def mls_test_cuts(self, language: str) -> CutSet:
        """Lazily load the test cuts (``mls-<language>_test.jsonl.gz``)."""
        logging.info(f"About to get test cuts for {language}")
        manifest_path = self.args.manifest_dir / f"mls-{language}_test.jsonl.gz"
        return load_manifest_lazy(manifest_path)
--- a/egs/mls/ASR/zipformer/beam_search.py
+++ b/egs/mls/ASR/zipformer/beam_search.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/pruned_transducer_stateless2/beam_search.py
--- a/egs/mls/ASR/zipformer/decode.py
+++ b/egs/mls/ASR/zipformer/decode.py
--- a/egs/mls/ASR/zipformer/decoder.py
+++ b/egs/mls/ASR/zipformer/decoder.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/zipformer/decoder.py
--- a/egs/mls/ASR/zipformer/encoder_interface.py
+++ b/egs/mls/ASR/zipformer/encoder_interface.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/transducer_stateless/encoder_interface.py
--- a/egs/mls/ASR/zipformer/joiner.py
+++ b/egs/mls/ASR/zipformer/joiner.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/zipformer/joiner.py
--- a/egs/mls/ASR/zipformer/model.py
+++ b/egs/mls/ASR/zipformer/model.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/zipformer/model.py
--- a/egs/mls/ASR/zipformer/optim.py
+++ b/egs/mls/ASR/zipformer/optim.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/zipformer/optim.py
--- a/egs/mls/ASR/zipformer/scaling.py
+++ b/egs/mls/ASR/zipformer/scaling.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/zipformer/scaling.py
--- a/egs/mls/ASR/zipformer/scaling_converter.py
+++ b/egs/mls/ASR/zipformer/scaling_converter.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/zipformer/scaling_converter.py
--- a/egs/mls/ASR/zipformer/subsampling.py
+++ b/egs/mls/ASR/zipformer/subsampling.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/zipformer/subsampling.py
--- a/egs/mls/ASR/zipformer/train.py
+++ b/egs/mls/ASR/zipformer/train.py
--- a/egs/mls/ASR/zipformer/zipformer.py
+++ b/egs/mls/ASR/zipformer/zipformer.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/zipformer/zipformer.py
		`@ -0,0 +1 @@`
							`../../../librispeech/ASR/local/validate_manifest.py`
		`@ -0,0 +1 @@`
							`../../../librispeech/ASR/pruned_transducer_stateless2/beam_search.py`
		`@ -0,0 +1 @@`
							`../../../librispeech/ASR/zipformer/decoder.py`
		`@ -0,0 +1 @@`
							`../../../librispeech/ASR/transducer_stateless/encoder_interface.py`
		`@ -0,0 +1 @@`
							`../../../librispeech/ASR/zipformer/joiner.py`
		`@ -0,0 +1 @@`
							`../../../librispeech/ASR/zipformer/scaling.py`
		`@ -0,0 +1 @@`
							`../../../librispeech/ASR/zipformer/subsampling.py`
		`@ -0,0 +1 @@`
							`../../../librispeech/ASR/zipformer/zipformer.py`