shuffle full Librispeech data (#574)

* shuffled full/partial librispeech data * fixed the code style issue * Shuffled full librispeech data off-line * Fixed style, addressed comments, and removed redandunt codes * Used the suggested version of black * Propagated the changes to other folders for librispeech (except conformer_mmi and streaming_conformer_ctc)
2025-12-11 06:55:27 +00:00 · 2022-11-26 22:26:09 -05:00 · 2022-11-26 22:26:09 -05:00 · 6693d907d3
commit 6693d907d3
parent 61032e70e0
21 changed files with 73 additions and 58 deletions
--- a/egs/librispeech/ASR/conformer_ctc/train.py
+++ b/egs/librispeech/ASR/conformer_ctc/train.py
@ -687,10 +687,10 @@ def run(rank, world_size, args):

    librispeech = LibriSpeechAsrDataModule(args)

-    train_cuts = librispeech.train_clean_100_cuts()
    if params.full_libri:
-        train_cuts += librispeech.train_clean_360_cuts()
-        train_cuts += librispeech.train_other_500_cuts()
+        train_cuts = librispeech.train_all_shuf_cuts()
+    else:
+        train_cuts = librispeech.train_clean_100_cuts()

    def remove_short_and_long_utt(c: Cut):
        # Keep only utterances with duration between 1 second and 20 seconds
--- a/egs/librispeech/ASR/conformer_ctc2/train.py
+++ b/egs/librispeech/ASR/conformer_ctc2/train.py
@ -928,10 +928,10 @@ def run(rank, world_size, args):

    librispeech = LibriSpeechAsrDataModule(args)

-    train_cuts = librispeech.train_clean_100_cuts()
    if params.full_libri:
-        train_cuts += librispeech.train_clean_360_cuts()
-        train_cuts += librispeech.train_other_500_cuts()
+        train_cuts = librispeech.train_all_shuf_cuts()
+    else:
+        train_cuts = librispeech.train_clean_100_cuts()

    def remove_short_and_long_utt(c: Cut):
        # Keep only utterances with duration between 1 second and 20 seconds
--- a/egs/librispeech/ASR/conv_emformer_transducer_stateless/train.py
+++ b/egs/librispeech/ASR/conv_emformer_transducer_stateless/train.py
@ -970,10 +970,10 @@ def run(rank, world_size, args):

    librispeech = LibriSpeechAsrDataModule(args)

-    train_cuts = librispeech.train_clean_100_cuts()
    if params.full_libri:
-        train_cuts += librispeech.train_clean_360_cuts()
-        train_cuts += librispeech.train_other_500_cuts()
+        train_cuts = librispeech.train_all_shuf_cuts()
+    else:
+        train_cuts = librispeech.train_clean_100_cuts()

    def remove_short_and_long_utt(c: Cut):
        # Keep only utterances with duration between 1 second and 20 seconds
--- a/egs/librispeech/ASR/conv_emformer_transducer_stateless2/train.py
+++ b/egs/librispeech/ASR/conv_emformer_transducer_stateless2/train.py
@ -970,10 +970,10 @@ def run(rank, world_size, args):

    librispeech = LibriSpeechAsrDataModule(args)

-    train_cuts = librispeech.train_clean_100_cuts()
    if params.full_libri:
-        train_cuts += librispeech.train_clean_360_cuts()
-        train_cuts += librispeech.train_other_500_cuts()
+        train_cuts = librispeech.train_all_shuf_cuts()
+    else:
+        train_cuts = librispeech.train_clean_100_cuts()

    def remove_short_and_long_utt(c: Cut):
        # Keep only utterances with duration between 1 second and 20 seconds
--- a/egs/librispeech/ASR/lstm_transducer_stateless/train.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/train.py
@ -954,10 +954,10 @@ def run(rank, world_size, args):

    librispeech = LibriSpeechAsrDataModule(args)

-    train_cuts = librispeech.train_clean_100_cuts()
    if params.full_libri:
-        train_cuts += librispeech.train_clean_360_cuts()
-        train_cuts += librispeech.train_other_500_cuts()
+        train_cuts = librispeech.train_all_shuf_cuts()
+    else:
+        train_cuts = librispeech.train_clean_100_cuts()

    def remove_short_and_long_utt(c: Cut):
        # Keep only utterances with duration between 1 second and 20 seconds
--- a/egs/librispeech/ASR/lstm_transducer_stateless2/train.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless2/train.py
@ -1108,10 +1108,10 @@ def run(rank, world_size, args):

    librispeech = LibriSpeech(manifest_dir=args.manifest_dir)

-    train_cuts = librispeech.train_clean_100_cuts()
    if params.full_libri:
-        train_cuts += librispeech.train_clean_360_cuts()
-        train_cuts += librispeech.train_other_500_cuts()
+        train_cuts = librispeech.train_all_shuf_cuts()
+    else:
+        train_cuts = librispeech.train_clean_100_cuts()

    train_cuts = filter_short_and_long_utterances(train_cuts, sp)

--- a/egs/librispeech/ASR/prepare.sh
+++ b/egs/librispeech/ASR/prepare.sh
@ -123,6 +123,11 @@ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
    touch data/fbank/.librispeech.done
  fi

+  cat <(gunzip -c data/fbank/librispeech_cuts_train-clean-100.jsonl.gz) \
+    <(gunzip -c data/fbank/librispeech_cuts_train-clean-360.jsonl.gz) \
+    <(gunzip -c data/fbank/librispeech_cuts_train-other-500.jsonl.gz) | \
+    shuf | gzip -c > data/fbank/librispeech_cuts_train-all-shuf.jsonl.gz
+
  if [ ! -e data/fbank/.librispeech-validated.done ]; then
    log "Validating data/fbank for LibriSpeech"
    parts=(
--- a/egs/librispeech/ASR/pruned_stateless_emformer_rnnt2/train.py
+++ b/egs/librispeech/ASR/pruned_stateless_emformer_rnnt2/train.py
@ -882,10 +882,10 @@ def run(rank, world_size, args):

    librispeech = LibriSpeechAsrDataModule(args)

-    train_cuts = librispeech.train_clean_100_cuts()
    if params.full_libri:
-        train_cuts += librispeech.train_clean_360_cuts()
-        train_cuts += librispeech.train_other_500_cuts()
+        train_cuts = librispeech.train_all_shuf_cuts()
+    else:
+        train_cuts = librispeech.train_clean_100_cuts()

    def remove_short_and_long_utt(c: Cut):
        # Keep only utterances with duration between 1 second and 20 seconds
--- a/egs/librispeech/ASR/pruned_transducer_stateless/train.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless/train.py
@ -873,10 +873,10 @@ def run(rank, world_size, args):

    librispeech = LibriSpeechAsrDataModule(args)

-    train_cuts = librispeech.train_clean_100_cuts()
    if params.full_libri:
-        train_cuts += librispeech.train_clean_360_cuts()
-        train_cuts += librispeech.train_other_500_cuts()
+        train_cuts = librispeech.train_all_shuf_cuts()
+    else:
+        train_cuts = librispeech.train_clean_100_cuts()

    def remove_short_and_long_utt(c: Cut):
        # Keep only utterances with duration between 1 second and 20 seconds
--- a/egs/librispeech/ASR/pruned_transducer_stateless2/train.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless2/train.py
@ -931,10 +931,10 @@ def run(rank, world_size, args):

    librispeech = LibriSpeechAsrDataModule(args)

-    train_cuts = librispeech.train_clean_100_cuts()
    if params.full_libri:
-        train_cuts += librispeech.train_clean_360_cuts()
-        train_cuts += librispeech.train_other_500_cuts()
+        train_cuts = librispeech.train_all_shuf_cuts()
+    else:
+        train_cuts = librispeech.train_clean_100_cuts()

    def remove_short_and_long_utt(c: Cut):
        # Keep only utterances with duration between 1 second and 20 seconds
--- a/egs/librispeech/ASR/pruned_transducer_stateless3/train.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless3/train.py
@ -1065,10 +1065,10 @@ def run(rank, world_size, args):

    librispeech = LibriSpeech(manifest_dir=args.manifest_dir)

-    train_cuts = librispeech.train_clean_100_cuts()
    if params.full_libri:
-        train_cuts += librispeech.train_clean_360_cuts()
-        train_cuts += librispeech.train_other_500_cuts()
+        train_cuts = librispeech.train_all_shuf_cuts()
+    else:
+        train_cuts = librispeech.train_clean_100_cuts()

    train_cuts = filter_short_and_long_utterances(train_cuts, sp)

--- a/egs/librispeech/ASR/pruned_transducer_stateless4/train.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless4/train.py
@ -978,10 +978,10 @@ def run(rank, world_size, args):

    librispeech = LibriSpeechAsrDataModule(args)

-    train_cuts = librispeech.train_clean_100_cuts()
    if params.full_libri:
-        train_cuts += librispeech.train_clean_360_cuts()
-        train_cuts += librispeech.train_other_500_cuts()
+        train_cuts = librispeech.train_all_shuf_cuts()
+    else:
+        train_cuts = librispeech.train_clean_100_cuts()

    def remove_short_and_long_utt(c: Cut):
        # Keep only utterances with duration between 1 second and 20 seconds
--- a/egs/librispeech/ASR/pruned_transducer_stateless5/train.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless5/train.py
@ -1009,10 +1009,10 @@ def run(rank, world_size, args):

    librispeech = LibriSpeechAsrDataModule(args)

-    train_cuts = librispeech.train_clean_100_cuts()
    if params.full_libri:
-        train_cuts += librispeech.train_clean_360_cuts()
-        train_cuts += librispeech.train_other_500_cuts()
+        train_cuts = librispeech.train_all_shuf_cuts()
+    else:
+        train_cuts = librispeech.train_clean_100_cuts()

    def remove_short_and_long_utt(c: Cut):
        # Keep only utterances with duration between 1 second and 20 seconds
--- a/egs/librispeech/ASR/pruned_transducer_stateless6/train.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless6/train.py
@ -970,10 +970,10 @@ def run(rank, world_size, args):

    librispeech = LibriSpeechAsrDataModule(args)

-    train_cuts = librispeech.train_clean_100_cuts()
    if params.full_libri:
-        train_cuts += librispeech.train_clean_360_cuts()
-        train_cuts += librispeech.train_other_500_cuts()
+        train_cuts = librispeech.train_all_shuf_cuts()
+    else:
+        train_cuts = librispeech.train_clean_100_cuts()

    def remove_short_and_long_utt(c: Cut):
        # Keep only utterances with duration between 1 second and 20 seconds
--- a/egs/librispeech/ASR/tdnn_lstm_ctc/asr_datamodule.py
+++ b/egs/librispeech/ASR/tdnn_lstm_ctc/asr_datamodule.py
@ -414,6 +414,16 @@ class LibriSpeechAsrDataModule:
            self.args.manifest_dir / "librispeech_cuts_train-other-500.jsonl.gz"
        )

+    @lru_cache()
+    def train_all_shuf_cuts(self) -> CutSet:
+        logging.info(
+            "About to get the shuffled train-clean-100, \
+            train-clean-360 and train-other-500 cuts"
+        )
+        return load_manifest_lazy(
+            self.args.manifest_dir / "librispeech_cuts_train-all-shuf.jsonl.gz"
+        )
+
    @lru_cache()
    def dev_clean_cuts(self) -> CutSet:
        logging.info("About to get dev-clean cuts")
--- a/egs/librispeech/ASR/tdnn_lstm_ctc/train.py
+++ b/egs/librispeech/ASR/tdnn_lstm_ctc/train.py
@ -173,7 +173,7 @@ def get_params() -> AttributeDict:
        {
            "exp_dir": Path("tdnn_lstm_ctc/exp"),
            "lang_dir": Path("data/lang_phone"),
-            "lr": 1e-3,
+            "lr": 1e-4,
            "feature_dim": 80,
            "weight_decay": 5e-4,
            "subsampling_factor": 3,
@ -557,10 +557,10 @@ def run(rank, world_size, args):

    librispeech = LibriSpeechAsrDataModule(args)

-    train_cuts = librispeech.train_clean_100_cuts()
    if params.full_libri:
-        train_cuts += librispeech.train_clean_360_cuts()
-        train_cuts += librispeech.train_other_500_cuts()
+        train_cuts = librispeech.train_all_shuf_cuts()
+    else:
+        train_cuts = librispeech.train_clean_100_cuts()

    def remove_short_and_long_utt(c: Cut):
        # Keep only utterances with duration between 1 second and 20 seconds
--- a/egs/librispeech/ASR/transducer/train.py
+++ b/egs/librispeech/ASR/transducer/train.py
@ -614,10 +614,10 @@ def run(rank, world_size, args):

    librispeech = LibriSpeechAsrDataModule(args)

-    train_cuts = librispeech.train_clean_100_cuts()
    if params.full_libri:
-        train_cuts += librispeech.train_clean_360_cuts()
-        train_cuts += librispeech.train_other_500_cuts()
+        train_cuts = librispeech.train_all_shuf_cuts()
+    else:
+        train_cuts = librispeech.train_clean_100_cuts()

    def remove_short_and_long_utt(c: Cut):
        # Keep only utterances with duration between 1 second and 20 seconds
--- a/egs/librispeech/ASR/transducer_lstm/train.py
+++ b/egs/librispeech/ASR/transducer_lstm/train.py
@ -620,10 +620,10 @@ def run(rank, world_size, args):

    librispeech = LibriSpeechAsrDataModule(args)

-    train_cuts = librispeech.train_clean_100_cuts()
    if params.full_libri:
-        train_cuts += librispeech.train_clean_360_cuts()
-        train_cuts += librispeech.train_other_500_cuts()
+        train_cuts = librispeech.train_all_shuf_cuts()
+    else:
+        train_cuts = librispeech.train_clean_100_cuts()

    def remove_short_and_long_utt(c: Cut):
        # Keep only utterances with duration between 1 second and 20 seconds
--- a/egs/librispeech/ASR/transducer_stateless/train.py
+++ b/egs/librispeech/ASR/transducer_stateless/train.py
@ -641,10 +641,10 @@ def run(rank, world_size, args):
    if params.print_diagnostics:
        diagnostic = diagnostics.attach_diagnostics(model)

-    train_cuts = librispeech.train_clean_100_cuts()
    if params.full_libri:
-        train_cuts += librispeech.train_clean_360_cuts()
-        train_cuts += librispeech.train_other_500_cuts()
+        train_cuts = librispeech.train_all_shuf_cuts()
+    else:
+        train_cuts = librispeech.train_clean_100_cuts()

    def remove_short_and_long_utt(c: Cut):
        # Keep only utterances with duration between 1 second and 20 seconds
--- a/egs/librispeech/ASR/transducer_stateless2/train.py
+++ b/egs/librispeech/ASR/transducer_stateless2/train.py
@ -629,10 +629,10 @@ def run(rank, world_size, args):
    if params.print_diagnostics:
        diagnostic = diagnostics.attach_diagnostics(model)

-    train_cuts = librispeech.train_clean_100_cuts()
    if params.full_libri:
-        train_cuts += librispeech.train_clean_360_cuts()
-        train_cuts += librispeech.train_other_500_cuts()
+        train_cuts = librispeech.train_all_shuf_cuts()
+    else:
+        train_cuts = librispeech.train_clean_100_cuts()

    def remove_short_and_long_utt(c: Cut):
        # Keep only utterances with duration between 1 second and 20 seconds
--- a/egs/librispeech/ASR/transducer_stateless_multi_datasets/train.py
+++ b/egs/librispeech/ASR/transducer_stateless_multi_datasets/train.py
@ -752,10 +752,10 @@ def run(rank, world_size, args):

    librispeech = LibriSpeech(manifest_dir=args.manifest_dir)

-    train_cuts = librispeech.train_clean_100_cuts()
    if params.full_libri:
-        train_cuts += librispeech.train_clean_360_cuts()
-        train_cuts += librispeech.train_other_500_cuts()
+        train_cuts = librispeech.train_all_shuf_cuts()
+    else:
+        train_cuts = librispeech.train_clean_100_cuts()

    train_cuts = filter_short_and_long_utterances(train_cuts)