From d93512344b62a55efa1a75e78e35bc80b8fc1634 Mon Sep 17 00:00:00 2001
From: Fangjun Kuang
Date: Sun, 5 Jun 2022 13:10:06 +0800
Subject: [PATCH] Use jsonl for cutsets in the librispeech recipe.

---
 .../workflows/run-librispeech-2022-03-12.yml  |  2 +-
 .../workflows/run-librispeech-2022-04-29.yml  |  2 +-
 .../workflows/run-librispeech-2022-05-13.yml  |  2 +-
 ...runed-transducer-stateless3-2022-05-13.yml |  2 +-
 ...peech-transducer-stateless2-2022-04-19.yml |  2 +-
 ...-transducer-stateless-librispeech-100h.yml |  2 +-
 ...r-stateless-librispeech-multi-datasets.yml |  2 +-
 .../run-pretrained-transducer-stateless.yml   |  2 +-
 egs/librispeech/ASR/conformer_ctc/train.py    | 24 ++++++++++++++++++
 .../ASR/local/compute_fbank_librispeech.py    | 18 ++++++++-----
 .../ASR/local/compute_fbank_musan.py          | 22 +++++++++++-----
 .../ASR/local/validate_manifest.py            |  4 +--
 egs/librispeech/ASR/prepare.sh                |  8 +++---
 .../ASR/pruned_transducer_stateless/train.py  | 20 ---------------
 .../ASR/tdnn_lstm_ctc/asr_datamodule.py       | 24 ++++++++++++------
 egs/librispeech/ASR/tdnn_lstm_ctc/train.py    | 25 +++++++++++++++++++
 16 files changed, 107 insertions(+), 54 deletions(-)

diff --git a/.github/workflows/run-librispeech-2022-03-12.yml b/.github/workflows/run-librispeech-2022-03-12.yml
index b18b84378..7934596e1 100644
--- a/.github/workflows/run-librispeech-2022-03-12.yml
+++ b/.github/workflows/run-librispeech-2022-03-12.yml
@@ -99,7 +99,7 @@ jobs:
         with:
           path: |
             ~/tmp/fbank-libri
-          key: cache-libri-fbank-test-clean-and-test-other
+          key: cache-libri-fbank-test-clean-and-test-other-v2
 
       - name: Compute fbank for LibriSpeech test-clean and test-other
         if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
diff --git a/.github/workflows/run-librispeech-2022-04-29.yml b/.github/workflows/run-librispeech-2022-04-29.yml
index 6c8188b48..8af2c2377 100644
--- a/.github/workflows/run-librispeech-2022-04-29.yml
+++ b/.github/workflows/run-librispeech-2022-04-29.yml
@@ -99,7 +99,7 @@ jobs:
         with:
           path: |
             ~/tmp/fbank-libri
-          key: cache-libri-fbank-test-clean-and-test-other
+          key: cache-libri-fbank-test-clean-and-test-other-v2
 
       - name: Compute fbank for LibriSpeech test-clean and test-other
         if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
diff --git a/.github/workflows/run-librispeech-2022-05-13.yml b/.github/workflows/run-librispeech-2022-05-13.yml
index 2290e18d4..7080e86f6 100644
--- a/.github/workflows/run-librispeech-2022-05-13.yml
+++ b/.github/workflows/run-librispeech-2022-05-13.yml
@@ -99,7 +99,7 @@ jobs:
         with:
           path: |
             ~/tmp/fbank-libri
-          key: cache-libri-fbank-test-clean-and-test-other
+          key: cache-libri-fbank-test-clean-and-test-other-v2
 
       - name: Compute fbank for LibriSpeech test-clean and test-other
         if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
diff --git a/.github/workflows/run-librispeech-pruned-transducer-stateless3-2022-05-13.yml b/.github/workflows/run-librispeech-pruned-transducer-stateless3-2022-05-13.yml
index 512f1b334..4861dcebc 100644
--- a/.github/workflows/run-librispeech-pruned-transducer-stateless3-2022-05-13.yml
+++ b/.github/workflows/run-librispeech-pruned-transducer-stateless3-2022-05-13.yml
@@ -99,7 +99,7 @@ jobs:
         with:
           path: |
             ~/tmp/fbank-libri
-          key: cache-libri-fbank-test-clean-and-test-other
+          key: cache-libri-fbank-test-clean-and-test-other-v2
 
      - name: Compute fbank for LibriSpeech test-clean and test-other
        if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
diff --git a/.github/workflows/run-librispeech-transducer-stateless2-2022-04-19.yml b/.github/workflows/run-librispeech-transducer-stateless2-2022-04-19.yml
index 3864f4aa3..2711f96fe 100644
--- a/.github/workflows/run-librispeech-transducer-stateless2-2022-04-19.yml
+++ b/.github/workflows/run-librispeech-transducer-stateless2-2022-04-19.yml
@@ -99,7 +99,7 @@ jobs:
         with:
           path: |
             ~/tmp/fbank-libri
-          key: cache-libri-fbank-test-clean-and-test-other
+          key: cache-libri-fbank-test-clean-and-test-other-v2
 
       - name: Compute fbank for LibriSpeech test-clean and test-other
         if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
diff --git a/.github/workflows/run-pretrained-transducer-stateless-librispeech-100h.yml b/.github/workflows/run-pretrained-transducer-stateless-librispeech-100h.yml
index f77d9e658..f5329864d 100644
--- a/.github/workflows/run-pretrained-transducer-stateless-librispeech-100h.yml
+++ b/.github/workflows/run-pretrained-transducer-stateless-librispeech-100h.yml
@@ -98,7 +98,7 @@ jobs:
         with:
           path: |
             ~/tmp/fbank-libri
-          key: cache-libri-fbank-test-clean-and-test-other
+          key: cache-libri-fbank-test-clean-and-test-other-v2
 
       - name: Compute fbank for LibriSpeech test-clean and test-other
         if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
diff --git a/.github/workflows/run-pretrained-transducer-stateless-librispeech-multi-datasets.yml b/.github/workflows/run-pretrained-transducer-stateless-librispeech-multi-datasets.yml
index ddfa62073..7084a24ae 100644
--- a/.github/workflows/run-pretrained-transducer-stateless-librispeech-multi-datasets.yml
+++ b/.github/workflows/run-pretrained-transducer-stateless-librispeech-multi-datasets.yml
@@ -98,7 +98,7 @@ jobs:
         with:
           path: |
             ~/tmp/fbank-libri
-          key: cache-libri-fbank-test-clean-and-test-other
+          key: cache-libri-fbank-test-clean-and-test-other-v2
 
       - name: Compute fbank for LibriSpeech test-clean and test-other
         if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
diff --git a/.github/workflows/run-pretrained-transducer-stateless.yml b/.github/workflows/run-pretrained-transducer-stateless.yml
index cdea78a88..78a74dfc7 100644
--- a/.github/workflows/run-pretrained-transducer-stateless.yml
+++ b/.github/workflows/run-pretrained-transducer-stateless.yml
@@ -98,7 +98,7 @@ jobs:
         with:
           path: |
             ~/tmp/fbank-libri
-          key: cache-libri-fbank-test-clean-and-test-other
+          key: cache-libri-fbank-test-clean-and-test-other-v2
 
       - name: Compute fbank for LibriSpeech test-clean and test-other
         if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
diff --git a/egs/librispeech/ASR/conformer_ctc/train.py b/egs/librispeech/ASR/conformer_ctc/train.py
index b81bd6330..5ac74f932 100755
--- a/egs/librispeech/ASR/conformer_ctc/train.py
+++ b/egs/librispeech/ASR/conformer_ctc/train.py
@@ -17,6 +17,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+""" +Usage: + export CUDA_VISIBLE_DEVICES="0,1,2,3" + ./conformer_ctc/train.py \ + --exp-dir ./conformer_ctc/exp \ + --world-size 4 \ + --full-libri 1 \ + --max-duration 200 \ + --num-epochs 20 +""" + import argparse import logging from pathlib import Path @@ -29,6 +40,7 @@ import torch.multiprocessing as mp import torch.nn as nn from asr_datamodule import LibriSpeechAsrDataModule from conformer import Conformer +from lhotse.cut import Cut from lhotse.utils import fix_random_seed from torch import Tensor from torch.nn.parallel import DistributedDataParallel as DDP @@ -676,6 +688,18 @@ def run(rank, world_size, args): if params.full_libri: train_cuts += librispeech.train_clean_360_cuts() train_cuts += librispeech.train_other_500_cuts() + + def remove_short_and_long_utt(c: Cut): + # Keep only utterances with duration between 1 second and 20 seconds + # + # Caution: There is a reason to select 20.0 here. Please see + # ../local/display_manifest_statistics.py + # + # You should use ../local/display_manifest_statistics.py to get + # an utterance duration distribution for your dataset to select + # the threshold + return 1.0 <= c.duration <= 20.0 + train_dl = librispeech.train_dataloaders(train_cuts) valid_cuts = librispeech.dev_clean_cuts() diff --git a/egs/librispeech/ASR/local/compute_fbank_librispeech.py b/egs/librispeech/ASR/local/compute_fbank_librispeech.py index 92f4f6ab7..642d9fd32 100755 --- a/egs/librispeech/ASR/local/compute_fbank_librispeech.py +++ b/egs/librispeech/ASR/local/compute_fbank_librispeech.py @@ -28,7 +28,7 @@ import os from pathlib import Path import torch -from lhotse import ChunkedLilcomHdf5Writer, CutSet, Fbank, FbankConfig +from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter from lhotse.recipes.utils import read_manifests_if_cached from icefall.utils import get_executor @@ -56,8 +56,13 @@ def compute_fbank_librispeech(): "train-clean-360", "train-other-500", ) + prefix = "librispeech" + suffix = "jsonl.gz" manifests = read_manifests_if_cached( - prefix="librispeech", dataset_parts=dataset_parts, output_dir=src_dir + dataset_parts=dataset_parts, + output_dir=src_dir, + prefix=prefix, + suffix=suffix, ) assert manifests is not None @@ -65,7 +70,8 @@ def compute_fbank_librispeech(): with get_executor() as ex: # Initialize the executor only once. 
         for partition, m in manifests.items():
-            if (output_dir / f"cuts_{partition}.json.gz").is_file():
+            cuts_filename = f"{prefix}_cuts_{partition}.{suffix}"
+            if (output_dir / cuts_filename).is_file():
                 logging.info(f"{partition} already exists - skipping.")
                 continue
             logging.info(f"Processing {partition}")
             cut_set = CutSet.from_manifests(
@@ -81,13 +87,13 @@ def compute_fbank_librispeech():
             )
             cut_set = cut_set.compute_and_store_features(
                 extractor=extractor,
-                storage_path=f"{output_dir}/feats_{partition}",
+                storage_path=f"{output_dir}/{prefix}_feats_{partition}",
                 # when an executor is specified, make more partitions
                 num_jobs=num_jobs if ex is None else 80,
                 executor=ex,
-                storage_type=ChunkedLilcomHdf5Writer,
+                storage_type=LilcomChunkyWriter,
             )
-            cut_set.to_json(output_dir / f"cuts_{partition}.json.gz")
+            cut_set.to_file(output_dir / cuts_filename)
 
 
 if __name__ == "__main__":
diff --git a/egs/librispeech/ASR/local/compute_fbank_musan.py b/egs/librispeech/ASR/local/compute_fbank_musan.py
index 368bea4e8..fef372129 100755
--- a/egs/librispeech/ASR/local/compute_fbank_musan.py
+++ b/egs/librispeech/ASR/local/compute_fbank_musan.py
@@ -28,7 +28,7 @@ import os
 from pathlib import Path
 
 import torch
-from lhotse import ChunkedLilcomHdf5Writer, CutSet, Fbank, FbankConfig, combine
+from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter, combine
 from lhotse.recipes.utils import read_manifests_if_cached
 
 from icefall.utils import get_executor
@@ -52,12 +52,22 @@ def compute_fbank_musan():
         "speech",
         "noise",
     )
+    prefix = "musan"
+    suffix = "jsonl.gz"
     manifests = read_manifests_if_cached(
-        prefix="musan", dataset_parts=dataset_parts, output_dir=src_dir
+        dataset_parts=dataset_parts,
+        output_dir=src_dir,
+        prefix=prefix,
+        suffix=suffix,
     )
     assert manifests is not None
 
-    musan_cuts_path = output_dir / "cuts_musan.json.gz"
+    assert len(manifests) == len(dataset_parts), (
+        len(manifests),
+        len(dataset_parts),
+    )
+
+    musan_cuts_path = output_dir / "musan_cuts.jsonl.gz"
 
     if musan_cuts_path.is_file():
         logging.info(f"{musan_cuts_path} already exists - skipping")
@@ -79,13 +89,13 @@ def compute_fbank_musan():
             .filter(lambda c: c.duration > 5)
             .compute_and_store_features(
                 extractor=extractor,
-                storage_path=f"{output_dir}/feats_musan",
+                storage_path=f"{output_dir}/musan_feats",
                 num_jobs=num_jobs if ex is None else 80,
                 executor=ex,
-                storage_type=ChunkedLilcomHdf5Writer,
+                storage_type=LilcomChunkyWriter,
             )
         )
-    musan_cuts.to_json(musan_cuts_path)
+    musan_cuts.to_file(musan_cuts_path)
 
 
 if __name__ == "__main__":
diff --git a/egs/librispeech/ASR/local/validate_manifest.py b/egs/librispeech/ASR/local/validate_manifest.py
index 8d3d4c7ce..4ccc1d353 100755
--- a/egs/librispeech/ASR/local/validate_manifest.py
+++ b/egs/librispeech/ASR/local/validate_manifest.py
@@ -25,7 +25,7 @@ We will add more checks later if needed.
 
 Usage example:
 
     python3 ./local/validate_manifest.py \
-            ./data/fbank/cuts_train-clean-100.json.gz
+            ./data/fbank/librispeech_cuts_train-clean-100.jsonl.gz
 
 """
@@ -33,7 +33,7 @@ import argparse
 import logging
 from pathlib import Path
 
-from lhotse import load_manifest, CutSet
+from lhotse import CutSet, load_manifest
 from lhotse.cut import Cut
 
 
diff --git a/egs/librispeech/ASR/prepare.sh b/egs/librispeech/ASR/prepare.sh
index 8cfb046c8..17a638502 100755
--- a/egs/librispeech/ASR/prepare.sh
+++ b/egs/librispeech/ASR/prepare.sh
@@ -40,9 +40,9 @@ dl_dir=$PWD/download
 # It will generate data/lang_bpe_xxx,
 # data/lang_bpe_yyy if the array contains xxx, yyy
 vocab_sizes=(
-  5000
-  2000
-  1000
+  # 5000
+  # 2000
+  # 1000
   500
 )
 
@@ -132,7 +132,7 @@ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
   )
   for part in ${parts[@]}; do
     python3 ./local/validate_manifest.py \
-      data/fbank/cuts_${part}.json.gz
+      data/fbank/librispeech_cuts_${part}.jsonl.gz
   done
   touch data/fbank/.librispeech-validated.done
 fi
diff --git a/egs/librispeech/ASR/pruned_transducer_stateless/train.py b/egs/librispeech/ASR/pruned_transducer_stateless/train.py
index c360d025a..e6795330f 100755
--- a/egs/librispeech/ASR/pruned_transducer_stateless/train.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless/train.py
@@ -807,28 +807,8 @@ def run(rank, world_size, args):
         # the threshold
         return 1.0 <= c.duration <= 20.0
 
-    num_in_total = len(train_cuts)
-
     train_cuts = train_cuts.filter(remove_short_and_long_utt)
 
-    try:
-        num_left = len(train_cuts)
-        num_removed = num_in_total - num_left
-        removed_percent = num_removed / num_in_total * 100
-
-        logging.info(
-            f"Before removing short and long utterances: {num_in_total}"
-        )
-        logging.info(f"After removing short and long utterances: {num_left}")
-        logging.info(
-            f"Removed {num_removed} utterances ({removed_percent:.5f}%)"
-        )
-    except TypeError as e:
-        # You can ignore this error as previous versions of Lhotse work fine
-        # for the above code. In recent versions of Lhotse, it uses
-        # lazy filter, producing cutsets that don't have the __len__ method
-        logging.info(str(e))
-
     if params.start_batch > 0 and checkpoints and "sampler" in checkpoints:
         # We only load the sampler's state dict when it loads a checkpoint
         # saved in the middle of an epoch
diff --git a/egs/librispeech/ASR/tdnn_lstm_ctc/asr_datamodule.py b/egs/librispeech/ASR/tdnn_lstm_ctc/asr_datamodule.py
index 7628c8274..a4c4e1706 100644
--- a/egs/librispeech/ASR/tdnn_lstm_ctc/asr_datamodule.py
+++ b/egs/librispeech/ASR/tdnn_lstm_ctc/asr_datamodule.py
@@ -225,7 +225,7 @@ class LibriSpeechAsrDataModule:
             logging.info("Enable MUSAN")
             logging.info("About to get Musan cuts")
             cuts_musan = load_manifest(
-                self.args.manifest_dir / "cuts_musan.json.gz"
+                self.args.manifest_dir / "musan_cuts.jsonl.gz"
             )
             transforms.append(
                 CutMix(
@@ -408,39 +408,47 @@ class LibriSpeechAsrDataModule:
     def train_clean_100_cuts(self) -> CutSet:
         logging.info("About to get train-clean-100 cuts")
         return load_manifest(
-            self.args.manifest_dir / "cuts_train-clean-100.json.gz"
+            self.args.manifest_dir / "librispeech_cuts_train-clean-100.jsonl.gz"
         )
 
     @lru_cache()
     def train_clean_360_cuts(self) -> CutSet:
         logging.info("About to get train-clean-360 cuts")
         return load_manifest(
-            self.args.manifest_dir / "cuts_train-clean-360.json.gz"
+            self.args.manifest_dir / "librispeech_cuts_train-clean-360.jsonl.gz"
         )
 
     @lru_cache()
     def train_other_500_cuts(self) -> CutSet:
         logging.info("About to get train-other-500 cuts")
         return load_manifest(
-            self.args.manifest_dir / "cuts_train-other-500.json.gz"
+            self.args.manifest_dir / "librispeech_cuts_train-other-500.jsonl.gz"
         )
 
     @lru_cache()
     def dev_clean_cuts(self) -> CutSet:
         logging.info("About to get dev-clean cuts")
-        return load_manifest(self.args.manifest_dir / "cuts_dev-clean.json.gz")
+        return load_manifest(
+            self.args.manifest_dir / "librispeech_cuts_dev-clean.jsonl.gz"
+        )
 
     @lru_cache()
     def dev_other_cuts(self) -> CutSet:
         logging.info("About to get dev-other cuts")
-        return load_manifest(self.args.manifest_dir / "cuts_dev-other.json.gz")
+        return load_manifest(
+            self.args.manifest_dir / "librispeech_cuts_dev-other.jsonl.gz"
+        )
 
     @lru_cache()
     def test_clean_cuts(self) -> CutSet:
         logging.info("About to get test-clean cuts")
-        return load_manifest(self.args.manifest_dir / "cuts_test-clean.json.gz")
+        return load_manifest(
+            self.args.manifest_dir / "librispeech_cuts_test-clean.jsonl.gz"
+        )
 
     @lru_cache()
     def test_other_cuts(self) -> CutSet:
         logging.info("About to get test-other cuts")
-        return load_manifest(self.args.manifest_dir / "cuts_test-other.json.gz")
+        return load_manifest(
+            self.args.manifest_dir / "librispeech_cuts_test-other.jsonl.gz"
+        )
diff --git a/egs/librispeech/ASR/tdnn_lstm_ctc/train.py b/egs/librispeech/ASR/tdnn_lstm_ctc/train.py
index 8597525ba..827e3ae1f 100755
--- a/egs/librispeech/ASR/tdnn_lstm_ctc/train.py
+++ b/egs/librispeech/ASR/tdnn_lstm_ctc/train.py
@@ -16,6 +16,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+""" +Usage: + export CUDA_VISIBLE_DEVICES="0,1,2,3" + ./tdnn_lstm_ctc/train.py \ + --world-size 4 \ + --full-libri 1 \ + --max-duration 300 \ + --num-epochs 20 +""" import argparse import logging @@ -29,6 +38,7 @@ import torch.multiprocessing as mp import torch.nn as nn import torch.optim as optim from asr_datamodule import LibriSpeechAsrDataModule +from lhotse.cut import Cut from lhotse.utils import fix_random_seed from model import TdnnLstm from torch import Tensor @@ -544,10 +554,25 @@ def run(rank, world_size, args): if params.full_libri: train_cuts += librispeech.train_clean_360_cuts() train_cuts += librispeech.train_other_500_cuts() + + def remove_short_and_long_utt(c: Cut): + # Keep only utterances with duration between 1 second and 20 seconds + # + # Caution: There is a reason to select 20.0 here. Please see + # ../local/display_manifest_statistics.py + # + # You should use ../local/display_manifest_statistics.py to get + # an utterance duration distribution for your dataset to select + # the threshold + return 1.0 <= c.duration <= 20.0 + + train_cuts = train_cuts.filter(remove_short_and_long_utt) + train_dl = librispeech.train_dataloaders(train_cuts) valid_cuts = librispeech.dev_clean_cuts() valid_cuts += librispeech.dev_other_cuts() + valid_dl = librispeech.valid_dataloaders(valid_cuts) for epoch in range(params.start_epoch, params.num_epochs):