diff --git a/egs/aishell/ASR/conformer_mmi/transformer.py b/egs/aishell/ASR/conformer_mmi/transformer.py index a3e50e385..dfd888414 100644 --- a/egs/aishell/ASR/conformer_mmi/transformer.py +++ b/egs/aishell/ASR/conformer_mmi/transformer.py @@ -545,6 +545,7 @@ class TransformerDecoderLayer(nn.Module): memory_mask: Optional[torch.Tensor] = None, tgt_key_padding_mask: Optional[torch.Tensor] = None, memory_key_padding_mask: Optional[torch.Tensor] = None, + **kwargs, ) -> torch.Tensor: """Pass the inputs (and mask) through the decoder layer. diff --git a/egs/mls_english/ASR/README.md b/egs/mls_english/ASR/README.md index bacc237db..cb8f51f46 100644 --- a/egs/mls_english/ASR/README.md +++ b/egs/mls_english/ASR/README.md @@ -5,7 +5,6 @@ **Multilingual LibriSpeech (MLS)** is a large multilingual corpus suitable for speech research. The dataset is derived from read audiobooks from LibriVox and consists of 8 languages - English, German, Dutch, Spanish, French, Italian, Portuguese, Polish. It includes about 44.5K hours of English and a total of about 6K hours for other languages. This icefall training recipe was created for the restructured version of the English split of the dataset available on Hugging Face below. - The dataset is available on Hugging Face. For more details, please visit: - Dataset: https://huggingface.co/datasets/parler-tts/mls_eng @@ -14,6 +13,7 @@ The dataset is available on Hugging Face. For more details, please visit: ## On-the-fly feature computation -This recipe currently only supports on-the-fly feature bank computation, since `lhotse` manifests and feature banks are not pre-calculated in this recipe. This should mean that the dataset can be streamed from Hugging Face, but we have not tested this yet. We may add a version that supports pre-calculating features to better match existing recipes. +This recipe currently only supports on-the-fly feature bank computation, since `lhotse` manifests and feature banks are not pre-calculated in this recipe. 
This should mean that the dataset can be streamed from Hugging Face, but we have not tested this yet. We may add a version that supports pre-calculating features to better match existing recipes. +
- +[./RESULTS.md](./RESULTS.md) contains the latest results. This MLS English recipe was primarily developed for use in the ```multi_ja_en``` Japanese-English bilingual pipeline, which is based on MLS English and ReazonSpeech. diff --git a/egs/mls_english/ASR/RESULTS.md b/egs/mls_english/ASR/RESULTS.md new file mode 100644 index 000000000..5c29fb631 --- /dev/null +++ b/egs/mls_english/ASR/RESULTS.md @@ -0,0 +1,41 @@ +## Results + +### MLS-English training results (Non-streaming) on zipformer model + +#### Non-streaming + +**WER on Test Set (Epoch 20)** + +| Type | Greedy | Beam search | +|---------------|--------|-------------| +| Non-streaming | 6.65 | 6.57 | + + +The training command: + +``` +./zipformer/train.py \ +--world-size 8 \ +--num-epochs 20 \ +--start-epoch 9 \ +--use-fp16 1 \ +--exp-dir zipformer/exp \ +--lang-dir data/lang/bpe_2000/ +``` + +The decoding command: + +``` +./zipformer/decode.py \ + --epoch 20 \ + --exp-dir ./zipformer/exp \ + --lang-dir data/lang/bpe_2000/ \ + --decoding-method greedy_search +``` + + +The pre-trained model is available here : [reazon-research/mls-english +](https://huggingface.co/reazon-research/mls-english) + + +Please note that this recipe was developed primarily as the source of English input in the bilingual Japanese-English recipe `multi_ja_en`, which uses ReazonSpeech and MLS English. 
diff --git a/egs/mls_english/ASR/local/compute_fbank_musan.py b/egs/mls_english/ASR/local/compute_fbank_musan.py new file mode 120000 index 000000000..5833f2484 --- /dev/null +++ b/egs/mls_english/ASR/local/compute_fbank_musan.py @@ -0,0 +1 @@ +../../../librispeech/ASR/local/compute_fbank_musan.py \ No newline at end of file diff --git a/egs/mls_english/ASR/local/utils/asr_datamodule.py b/egs/mls_english/ASR/local/utils/asr_datamodule.py index 250b40a63..f1417c54b 100644 --- a/egs/mls_english/ASR/local/utils/asr_datamodule.py +++ b/egs/mls_english/ASR/local/utils/asr_datamodule.py @@ -180,7 +180,10 @@ class MLSEnglishHFAsrDataModule: ) def train_dataloaders( - self, cuts_train: CutSet, sampler_state_dict: Optional[Dict[str, Any]] = None + self, + cuts_train: CutSet, + sampler_state_dict: Optional[Dict[str, Any]] = None, + cuts_musan: Optional[CutSet] = None, ) -> DataLoader: """ Args: @@ -191,6 +194,13 @@ class MLSEnglishHFAsrDataModule: """ transforms = [] + if cuts_musan is not None: + logging.info("Enable MUSAN") + transforms.append( + CutMix(cuts=cuts_musan, p=0.5, snr=(10,20), preserve_id=True) + ) + else: + logging.info("Disable MUSAN") input_transforms = [] if self.args.enable_spec_aug: @@ -337,19 +347,19 @@ class MLSEnglishHFAsrDataModule: def train_cuts(self) -> CutSet: logging.info("About to get train cuts") return load_manifest_lazy( - self.args.manifest_dir / "mls_english_cuts_train.jsonl.gz" + self.args.manifest_dir / "mls_eng_cuts_train.jsonl.gz" ) @lru_cache() def valid_cuts(self) -> CutSet: logging.info("About to get dev cuts") return load_manifest_lazy( - self.args.manifest_dir / "mls_english_cuts_dev.jsonl.gz" + self.args.manifest_dir / "mls_eng_cuts_dev.jsonl.gz" ) @lru_cache() def test_cuts(self) -> List[CutSet]: logging.info("About to get test cuts") return load_manifest_lazy( - self.args.manifest_dir / "mls_english_cuts_test.jsonl.gz" + self.args.manifest_dir / "mls_eng_cuts_test.jsonl.gz" ) diff --git a/egs/mls_english/ASR/prepare.sh 
b/egs/mls_english/ASR/prepare.sh index c6582f679..c9afca976 100755 --- a/egs/mls_english/ASR/prepare.sh +++ b/egs/mls_english/ASR/prepare.sh @@ -16,6 +16,14 @@ vocab_sizes=(2000) # You can add more sizes like (500 1000 2000) for comparison # Directory where dataset will be downloaded dl_dir=$PWD/download +# - $dl_dir/musan +# This directory contains the following directories downloaded from +# http://www.openslr.org/17/ +# +# - music +# - noise +# - speech + . shared/parse_options.sh || exit 1 # All files generated by this script are saved in "data". @@ -32,7 +40,7 @@ log() { log "Starting MLS English data preparation" if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then - log "Stage 0: Download MLS English dataset" +log "Stage 0: Download data" # Check if huggingface_hub is installed if ! python -c "import huggingface_hub" &> /dev/null; then log "huggingface_hub Python library not found. Installing it now..." @@ -55,6 +63,15 @@ if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then else log "Dataset already exists at $dl_dir/mls_english. Skipping download." fi + # If you have predownloaded it to /path/to/musan, + # you can create a symlink + # + # ln -sfv /path/to/musan $dl_dir/ + # + if [ ! -d $dl_dir/musan ] ; then + log "Downloading musan." + lhotse download musan $dl_dir + fi fi if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then @@ -73,7 +90,25 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then fi if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then - log "Stage 2: Prepare transcript for BPE training" + log "Stage 2: Prepare musan manifest" + # We assume that you have downloaded the musan corpus + # to $dl_dir/musan + if [ ! -e data/manifests/.musan_prep.done ]; then + lhotse prepare musan $dl_dir/musan data/manifests + touch data/manifests/.musan_prep.done + fi +fi + +if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then + log "Stage 3: Compute fbank for musan" + if [ ! 
-e data/manifests/.musan_fbank.done ]; then + ./local/compute_fbank_musan.py + touch data/manifests/.musan_fbank.done + fi +fi + +if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then + log "Stage 4: Prepare transcript for BPE training" if [ ! -f data/lang/transcript.txt ]; then log "Generating transcripts for BPE training" python local/utils/generate_transcript.py \ @@ -83,8 +118,8 @@ if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then fi fi -if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then - log "Stage 3: Prepare BPE tokenizer" +if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then + log "Stage 5: Prepare BPE tokenizer" for vocab_size in ${vocab_sizes[@]}; do log "Training BPE model with vocab_size=${vocab_size}" bpe_dir=data/lang/bpe_${vocab_size} @@ -99,8 +134,8 @@ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then done fi -if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then - log "Stage 4: Show manifest statistics" +if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then + log "Stage 6: Show manifest statistics" python local/display_manifest_statistics.py --manifest-dir data/manifests > data/manifests/manifest_statistics.txt cat data/manifests/manifest_statistics.txt fi diff --git a/egs/mls_english/ASR/zipformer/decode.py b/egs/mls_english/ASR/zipformer/decode.py index fc8de5d64..220cdcc9d 100755 --- a/egs/mls_english/ASR/zipformer/decode.py +++ b/egs/mls_english/ASR/zipformer/decode.py @@ -1044,13 +1044,13 @@ def main(): # we need cut ids to display recognition results. 
args.return_cuts = True mls_english_corpus = MLSEnglishHFAsrDataModule(args) - mls_english_corpus.load_dataset(args.dataset_path) # # dev_cuts = mls_english_corpus.dev_cuts() # test_cuts = mls_english_corpus.test_cuts() # dev_dl = mls_english_corpus.test_dataloader() - test_dl = mls_english_corpus.test_dataloader() + test_cuts = mls_english_corpus.test_cuts() + test_dl = mls_english_corpus.test_dataloaders(test_cuts) test_sets = ["test"] test_dls = [test_dl] diff --git a/egs/mls_english/ASR/zipformer/train.py b/egs/mls_english/ASR/zipformer/train.py index 7c6997656..63020abfb 100755 --- a/egs/mls_english/ASR/zipformer/train.py +++ b/egs/mls_english/ASR/zipformer/train.py @@ -68,6 +68,7 @@ from joiner import Joiner from lhotse.cut import Cut from lhotse.dataset.sampling.base import CutSampler from lhotse.utils import fix_random_seed +from lhotse import load_manifest from model import AsrModel from optim import Eden, ScaledAdam from scaling import ScheduledFloat @@ -1215,11 +1216,8 @@ def run(rank, world_size, args): return True mls_english_corpus = MLSEnglishHFAsrDataModule(args) - mls_english_corpus.load_dataset(args.dataset_path) - - # train_cuts = mls_english_corpus.train_cuts() - - # train_cuts = train_cuts.filter(remove_short_and_long_utt) + train_cuts = mls_english_corpus.train_cuts() + # mls_english_corpus.load_dataset(args.dataset_path) if params.start_batch > 0 and checkpoints and "sampler" in checkpoints: # We only load the sampler's state dict when it loads a checkpoint @@ -1227,17 +1225,23 @@ def run(rank, world_size, args): sampler_state_dict = checkpoints["sampler"] else: sampler_state_dict = None + + if args.enable_musan: + musan_path = Path(args.manifest_dir) / "musan_cuts.jsonl.gz" + if musan_path.exists(): + cuts_musan = load_manifest(musan_path) + logging.info(f"Loaded MUSAN manifest from {musan_path}") + else: + logging.warning(f"MUSAN manifest not found at {musan_path}, disabling MUSAN augmentation") + cuts_musan = None + else: + cuts_musan = 
None - # train_dl = mls_english_corpus.train_dataloaders( - # train_cuts, sampler_state_dict=sampler_state_dict - # ) - train_dl = mls_english_corpus.train_dataloader( - sampler_state_dict=sampler_state_dict + train_dl = mls_english_corpus.train_dataloaders( + train_cuts, sampler_state_dict=sampler_state_dict ) - - # valid_cuts = mls_english_corpus.valid_cuts() - # valid_dl = mls_english_corpus.valid_dataloader(valid_cuts) - valid_dl = mls_english_corpus.valid_dataloader() + valid_cuts = mls_english_corpus.valid_cuts() + valid_dl = mls_english_corpus.valid_dataloaders(valid_cuts) if not params.print_diagnostics: scan_pessimistic_batches_for_oom( diff --git a/egs/multi_ja_en/ASR/zipformer/train.py b/egs/multi_ja_en/ASR/zipformer/train.py index c4aaa17db..1c14b4aa4 100755 --- a/egs/multi_ja_en/ASR/zipformer/train.py +++ b/egs/multi_ja_en/ASR/zipformer/train.py @@ -1185,6 +1185,7 @@ def run(rank, world_size, args): train_cuts = multi_dataset.train_cuts() def remove_short_and_long_utt(c: Cut): + # Keep only utterances greater than 1 second # # You should use ../local/display_manifest_statistics.py to get @@ -1241,6 +1242,7 @@ def run(rank, world_size, args): ) valid_cuts = multi_dataset.dev_cuts() + valid_dl = multidataset_datamodule.valid_dataloaders(valid_cuts) if not params.print_diagnostics: diff --git a/egs/tedlium3/ASR/conformer_ctc2/transformer.py b/egs/tedlium3/ASR/conformer_ctc2/transformer.py index 9dbf32e48..804c92957 100644 --- a/egs/tedlium3/ASR/conformer_ctc2/transformer.py +++ b/egs/tedlium3/ASR/conformer_ctc2/transformer.py @@ -612,6 +612,7 @@ class TransformerDecoderLayer(nn.Module): tgt_key_padding_mask: Optional[torch.Tensor] = None, memory_key_padding_mask: Optional[torch.Tensor] = None, warmup: float = 1.0, + **kwargs, ) -> torch.Tensor: """Pass the inputs (and mask) through the decoder layer. 
diff --git a/icefall/utils.py b/icefall/utils.py index 427755090..a04bedffd 100644 --- a/icefall/utils.py +++ b/icefall/utils.py @@ -1391,13 +1391,20 @@ def add_eos(ragged: k2.RaggedTensor, eos_id: int) -> k2.RaggedTensor: return concat(ragged, eos_id, direction="right") -def make_pad_mask(lengths: torch.Tensor, max_len: int = 0) -> torch.Tensor: +def make_pad_mask( + lengths: torch.Tensor, + max_len: int = 0, + pad_left: bool = False, +) -> torch.Tensor: """ Args: lengths: A 1-D tensor containing sentence lengths. max_len: The length of masks. + pad_left: + If ``False`` (default), padding is on the right. + If ``True``, padding is on the left. Returns: Return a 2-D bool tensor, where masked positions are filled with `True` and non-masked positions are @@ -1414,9 +1421,14 @@ def make_pad_mask(lengths: torch.Tensor, max_len: int = 0) -> torch.Tensor: max_len = max(max_len, lengths.max()) n = lengths.size(0) seq_range = torch.arange(0, max_len, device=lengths.device) - expaned_lengths = seq_range.unsqueeze(0).expand(n, max_len) + expanded_lengths = seq_range.unsqueeze(0).expand(n, max_len) - return expaned_lengths >= lengths.unsqueeze(-1) + if pad_left: + mask = expanded_lengths < (max_len - lengths).unsqueeze(1) + else: + mask = expanded_lengths >= lengths.unsqueeze(-1) + + return mask # Copied and modified from https://github.com/wenet-e2e/wenet/blob/main/wenet/utils/mask.py