diff --git a/egs/mls_english/ASR/prepare.sh b/egs/mls_english/ASR/prepare.sh
index 78f169bd1..c9afca976 100755
--- a/egs/mls_english/ASR/prepare.sh
+++ b/egs/mls_english/ASR/prepare.sh
@@ -40,7 +40,7 @@ log() {
 log "Starting MLS English data preparation"
 
 if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
-  log "Stage 0: Download data"
+log "Stage 0: Download data"
   # Check if huggingface_hub is installed
   if ! python -c "import huggingface_hub" &> /dev/null; then
     log "huggingface_hub Python library not found. Installing it now..."
diff --git a/egs/mls_english/ASR/zipformer/train.py b/egs/mls_english/ASR/zipformer/train.py
index 2bd51ec49..9b101f1ce 100755
--- a/egs/mls_english/ASR/zipformer/train.py
+++ b/egs/mls_english/ASR/zipformer/train.py
@@ -1219,7 +1219,6 @@ def run(rank, world_size, args):
 
     train_cuts = mls_english_corpus.train_cuts()
     # mls_english_corpus.load_dataset(args.dataset_path)
-
     if params.start_batch > 0 and checkpoints and "sampler" in checkpoints:
         # We only load the sampler's state dict when it loads a checkpoint
         # saved in the middle of an epoch
@@ -1241,7 +1240,6 @@ def run(rank, world_size, args):
     train_dl = mls_english_corpus.train_dataloaders(
         train_cuts, sampler_state_dict=sampler_state_dict
     )
-
     valid_dl = mls_english_corpus.valid_dataloader()
 
     if not params.print_diagnostics:
diff --git a/egs/multi_ja_en/ASR/zipformer/train.py b/egs/multi_ja_en/ASR/zipformer/train.py
index 76149031e..1c14b4aa4 100755
--- a/egs/multi_ja_en/ASR/zipformer/train.py
+++ b/egs/multi_ja_en/ASR/zipformer/train.py
@@ -791,7 +791,7 @@ def compute_loss(
     warm_step = params.warm_step
 
     texts = batch["supervisions"]["text"]
-    y = sentencepiece_processor.encode(texts, out_type=int) 
+    y = sentencepiece_processor.encode(texts, out_type=int)
     y = k2.RaggedTensor(y)
 
     with torch.set_grad_enabled(is_training):
@@ -1120,7 +1120,7 @@ def run(rank, world_size, args):
 
     # <blk> is defined in local/prepare_lang_char.py
     params.blank_id = sentencepiece_processor.piece_to_id("<blk>")
-    arams.vocab_size = sentencepiece_processor.get_piece_size()
+    params.vocab_size = sentencepiece_processor.get_piece_size()
 
     if not params.use_transducer:
         params.ctc_loss_scale = 1.0
@@ -1178,22 +1178,20 @@ def run(rank, world_size, args):
     if params.inf_check:
         register_inf_check_hooks(model)
 
-    reazonspeech_corpus = ReazonSpeechAsrDataModule(args)
+    multidataset_datamodule = MultiDatasetAsrDataModule(args)
     multi_dataset = MultiDataset(args)
 
     train_cuts = multi_dataset.train_cuts()
 
     def remove_short_and_long_utt(c: Cut):
-        # Keep only utterances with duration between 1 second and 30 seconds
-        #
-        # Caution: There is a reason to select 30.0 here. Please see
-        # ../local/display_manifest_statistics.py
+
+        # Keep only utterances greater than 1 second
         #
         # You should use ../local/display_manifest_statistics.py to get
         # an utterance duration distribution for your dataset to select
-        # the threshold
-        if c.duration < 1.0 or c.duration > 30.0:
+        # the threshold as this is dependent on which datasets you choose
+        if c.duration < 1.0:
             logging.warning(
                 f"Exclude cut with ID {c.id} from training. "
                 f"Duration: {c.duration}"
             )
@@ -1244,7 +1242,8 @@ def run(rank, world_size, args):
     )
 
     valid_cuts = multi_dataset.dev_cuts()
-    valid_dl = reazonspeech_corpus.valid_dataloaders(valid_cuts)
+
+    valid_dl = multidataset_datamodule.valid_dataloaders(valid_cuts)
 
     if not params.print_diagnostics:
         scan_pessimistic_batches_for_oom(
@@ -1386,7 +1385,7 @@ def scan_pessimistic_batches_for_oom(
 
 
 def main():
     parser = get_parser()
-    ReazonSpeechAsrDataModule.add_arguments(parser)
+    MultiDatasetAsrDataModule.add_arguments(parser)
     args = parser.parse_args()
     args.exp_dir = Path(args.exp_dir)
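
For context on the filtering change in egs/multi_ja_en/ASR/zipformer/train.py, below is a minimal, self-contained sketch of how the updated predicate behaves and how it is typically applied to the training cuts. This is an illustration under assumptions, not a quote of the recipe: the function body mirrors the diff above, while the trailing filter usage assumes lhotse's standard lazy CutSet.filter API.

import logging

from lhotse.cut import Cut


def remove_short_and_long_utt(c: Cut) -> bool:
    # Keep only utterances longer than 1 second. The previous 30-second
    # upper bound is dropped because a sensible cap depends on which
    # datasets are mixed in; inspect the duration distribution with
    # ../local/display_manifest_statistics.py before picking a threshold.
    if c.duration < 1.0:
        logging.warning(
            f"Exclude cut with ID {c.id} from training. "
            f"Duration: {c.duration}"
        )
        return False
    return True


# Applied lazily before the dataloaders are built, e.g.:
#   train_cuts = multi_dataset.train_cuts().filter(remove_short_and_long_utt)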