From d93512344b62a55efa1a75e78e35bc80b8fc1634 Mon Sep 17 00:00:00 2001
From: Fangjun Kuang
Date: Sun, 5 Jun 2022 13:10:06 +0800
Subject: [PATCH] Use jsonl for cutsets in the librispeech recipe.

---
 .../workflows/run-librispeech-2022-03-12.yml  |  2 +-
 .../workflows/run-librispeech-2022-04-29.yml  |  2 +-
 .../workflows/run-librispeech-2022-05-13.yml  |  2 +-
 ...runed-transducer-stateless3-2022-05-13.yml |  2 +-
 ...peech-transducer-stateless2-2022-04-19.yml |  2 +-
 ...-transducer-stateless-librispeech-100h.yml |  2 +-
 ...r-stateless-librispeech-multi-datasets.yml |  2 +-
 .../run-pretrained-transducer-stateless.yml   |  2 +-
 egs/librispeech/ASR/conformer_ctc/train.py    | 24 ++++++++++++++++++
 .../ASR/local/compute_fbank_librispeech.py    | 18 ++++++++-----
 .../ASR/local/compute_fbank_musan.py          | 22 +++++++++++-----
 .../ASR/local/validate_manifest.py            |  4 +--
 egs/librispeech/ASR/prepare.sh                |  8 +++---
 .../ASR/pruned_transducer_stateless/train.py  | 20 ---------------
 .../ASR/tdnn_lstm_ctc/asr_datamodule.py       | 24 ++++++++++++------
 egs/librispeech/ASR/tdnn_lstm_ctc/train.py    | 25 +++++++++++++++++++
 16 files changed, 107 insertions(+), 54 deletions(-)

diff --git a/.github/workflows/run-librispeech-2022-03-12.yml b/.github/workflows/run-librispeech-2022-03-12.yml
index b18b84378..7934596e1 100644
--- a/.github/workflows/run-librispeech-2022-03-12.yml
+++ b/.github/workflows/run-librispeech-2022-03-12.yml
@@ -99,7 +99,7 @@ jobs:
         with:
           path: |
             ~/tmp/fbank-libri
-          key: cache-libri-fbank-test-clean-and-test-other
+          key: cache-libri-fbank-test-clean-and-test-other-v2
 
       - name: Compute fbank for LibriSpeech test-clean and test-other
         if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
diff --git a/.github/workflows/run-librispeech-2022-04-29.yml b/.github/workflows/run-librispeech-2022-04-29.yml
index 6c8188b48..8af2c2377 100644
--- a/.github/workflows/run-librispeech-2022-04-29.yml
+++ b/.github/workflows/run-librispeech-2022-04-29.yml
@@ -99,7 +99,7 @@ jobs:
         with:
           path: |
             ~/tmp/fbank-libri
-          key: cache-libri-fbank-test-clean-and-test-other
+          key: cache-libri-fbank-test-clean-and-test-other-v2
 
       - name: Compute fbank for LibriSpeech test-clean and test-other
         if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
diff --git a/.github/workflows/run-librispeech-2022-05-13.yml b/.github/workflows/run-librispeech-2022-05-13.yml
index 2290e18d4..7080e86f6 100644
--- a/.github/workflows/run-librispeech-2022-05-13.yml
+++ b/.github/workflows/run-librispeech-2022-05-13.yml
@@ -99,7 +99,7 @@ jobs:
         with:
           path: |
             ~/tmp/fbank-libri
-          key: cache-libri-fbank-test-clean-and-test-other
+          key: cache-libri-fbank-test-clean-and-test-other-v2
 
       - name: Compute fbank for LibriSpeech test-clean and test-other
         if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
diff --git a/.github/workflows/run-librispeech-pruned-transducer-stateless3-2022-05-13.yml b/.github/workflows/run-librispeech-pruned-transducer-stateless3-2022-05-13.yml
index 512f1b334..4861dcebc 100644
--- a/.github/workflows/run-librispeech-pruned-transducer-stateless3-2022-05-13.yml
+++ b/.github/workflows/run-librispeech-pruned-transducer-stateless3-2022-05-13.yml
@@ -99,7 +99,7 @@ jobs:
         with:
           path: |
             ~/tmp/fbank-libri
-          key: cache-libri-fbank-test-clean-and-test-other
+          key: cache-libri-fbank-test-clean-and-test-other-v2
 
      - name: Compute fbank for LibriSpeech test-clean and test-other
        if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
diff --git a/.github/workflows/run-librispeech-transducer-stateless2-2022-04-19.yml b/.github/workflows/run-librispeech-transducer-stateless2-2022-04-19.yml
index 3864f4aa3..2711f96fe 100644
--- a/.github/workflows/run-librispeech-transducer-stateless2-2022-04-19.yml
+++ b/.github/workflows/run-librispeech-transducer-stateless2-2022-04-19.yml
@@ -99,7 +99,7 @@ jobs:
         with:
           path: |
             ~/tmp/fbank-libri
-          key: cache-libri-fbank-test-clean-and-test-other
+          key: cache-libri-fbank-test-clean-and-test-other-v2
 
       - name: Compute fbank for LibriSpeech test-clean and test-other
         if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
diff --git a/.github/workflows/run-pretrained-transducer-stateless-librispeech-100h.yml b/.github/workflows/run-pretrained-transducer-stateless-librispeech-100h.yml
index f77d9e658..f5329864d 100644
--- a/.github/workflows/run-pretrained-transducer-stateless-librispeech-100h.yml
+++ b/.github/workflows/run-pretrained-transducer-stateless-librispeech-100h.yml
@@ -98,7 +98,7 @@ jobs:
         with:
           path: |
             ~/tmp/fbank-libri
-          key: cache-libri-fbank-test-clean-and-test-other
+          key: cache-libri-fbank-test-clean-and-test-other-v2
 
       - name: Compute fbank for LibriSpeech test-clean and test-other
         if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
diff --git a/.github/workflows/run-pretrained-transducer-stateless-librispeech-multi-datasets.yml b/.github/workflows/run-pretrained-transducer-stateless-librispeech-multi-datasets.yml
index ddfa62073..7084a24ae 100644
--- a/.github/workflows/run-pretrained-transducer-stateless-librispeech-multi-datasets.yml
+++ b/.github/workflows/run-pretrained-transducer-stateless-librispeech-multi-datasets.yml
@@ -98,7 +98,7 @@ jobs:
         with:
           path: |
             ~/tmp/fbank-libri
-          key: cache-libri-fbank-test-clean-and-test-other
+          key: cache-libri-fbank-test-clean-and-test-other-v2
 
       - name: Compute fbank for LibriSpeech test-clean and test-other
         if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
diff --git a/.github/workflows/run-pretrained-transducer-stateless.yml b/.github/workflows/run-pretrained-transducer-stateless.yml
index cdea78a88..78a74dfc7 100644
--- a/.github/workflows/run-pretrained-transducer-stateless.yml
+++ b/.github/workflows/run-pretrained-transducer-stateless.yml
@@ -98,7 +98,7 @@ jobs:
         with:
           path: |
             ~/tmp/fbank-libri
-          key: cache-libri-fbank-test-clean-and-test-other
+          key: cache-libri-fbank-test-clean-and-test-other-v2
 
       - name: Compute fbank for LibriSpeech test-clean and test-other
         if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
diff --git a/egs/librispeech/ASR/conformer_ctc/train.py b/egs/librispeech/ASR/conformer_ctc/train.py
index b81bd6330..5ac74f932 100755
--- a/egs/librispeech/ASR/conformer_ctc/train.py
+++ b/egs/librispeech/ASR/conformer_ctc/train.py
@@ -17,6 +17,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+""" +Usage: + export CUDA_VISIBLE_DEVICES="0,1,2,3" + ./conformer_ctc/train.py \ + --exp-dir ./conformer_ctc/exp \ + --world-size 4 \ + --full-libri 1 \ + --max-duration 200 \ + --num-epochs 20 +""" + import argparse import logging from pathlib import Path @@ -29,6 +40,7 @@ import torch.multiprocessing as mp import torch.nn as nn from asr_datamodule import LibriSpeechAsrDataModule from conformer import Conformer +from lhotse.cut import Cut from lhotse.utils import fix_random_seed from torch import Tensor from torch.nn.parallel import DistributedDataParallel as DDP @@ -676,6 +688,18 @@ def run(rank, world_size, args): if params.full_libri: train_cuts += librispeech.train_clean_360_cuts() train_cuts += librispeech.train_other_500_cuts() + + def remove_short_and_long_utt(c: Cut): + # Keep only utterances with duration between 1 second and 20 seconds + # + # Caution: There is a reason to select 20.0 here. Please see + # ../local/display_manifest_statistics.py + # + # You should use ../local/display_manifest_statistics.py to get + # an utterance duration distribution for your dataset to select + # the threshold + return 1.0 <= c.duration <= 20.0 + train_dl = librispeech.train_dataloaders(train_cuts) valid_cuts = librispeech.dev_clean_cuts() diff --git a/egs/librispeech/ASR/local/compute_fbank_librispeech.py b/egs/librispeech/ASR/local/compute_fbank_librispeech.py index 92f4f6ab7..642d9fd32 100755 --- a/egs/librispeech/ASR/local/compute_fbank_librispeech.py +++ b/egs/librispeech/ASR/local/compute_fbank_librispeech.py @@ -28,7 +28,7 @@ import os from pathlib import Path import torch -from lhotse import ChunkedLilcomHdf5Writer, CutSet, Fbank, FbankConfig +from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter from lhotse.recipes.utils import read_manifests_if_cached from icefall.utils import get_executor @@ -56,8 +56,13 @@ def compute_fbank_librispeech(): "train-clean-360", "train-other-500", ) + prefix = "librispeech" + suffix = "jsonl.gz" manifests = read_manifests_if_cached( - prefix="librispeech", dataset_parts=dataset_parts, output_dir=src_dir + dataset_parts=dataset_parts, + output_dir=src_dir, + prefix=prefix, + suffix=suffix, ) assert manifests is not None @@ -65,7 +70,8 @@ def compute_fbank_librispeech(): with get_executor() as ex: # Initialize the executor only once. 
         for partition, m in manifests.items():
-            if (output_dir / f"cuts_{partition}.json.gz").is_file():
+            cuts_filename = f"{prefix}_cuts_{partition}.{suffix}"
+            if (output_dir / cuts_filename).is_file():
                 logging.info(f"{partition} already exists - skipping.")
                 continue
             logging.info(f"Processing {partition}")
             cut_set = CutSet.from_manifests(
@@ -81,13 +87,13 @@ def compute_fbank_librispeech():
             )
             cut_set = cut_set.compute_and_store_features(
                 extractor=extractor,
-                storage_path=f"{output_dir}/feats_{partition}",
+                storage_path=f"{output_dir}/{prefix}_feats_{partition}",
                 # when an executor is specified, make more partitions
                 num_jobs=num_jobs if ex is None else 80,
                 executor=ex,
-                storage_type=ChunkedLilcomHdf5Writer,
+                storage_type=LilcomChunkyWriter,
             )
-            cut_set.to_json(output_dir / f"cuts_{partition}.json.gz")
+            cut_set.to_file(output_dir / cuts_filename)
 
 
 if __name__ == "__main__":
diff --git a/egs/librispeech/ASR/local/compute_fbank_musan.py b/egs/librispeech/ASR/local/compute_fbank_musan.py
index 368bea4e8..fef372129 100755
--- a/egs/librispeech/ASR/local/compute_fbank_musan.py
+++ b/egs/librispeech/ASR/local/compute_fbank_musan.py
@@ -28,7 +28,7 @@ import os
 from pathlib import Path
 
 import torch
-from lhotse import ChunkedLilcomHdf5Writer, CutSet, Fbank, FbankConfig, combine
+from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter, combine
 from lhotse.recipes.utils import read_manifests_if_cached
 
 from icefall.utils import get_executor
@@ -52,12 +52,22 @@ def compute_fbank_musan():
         "speech",
         "noise",
     )
+    prefix = "musan"
+    suffix = "jsonl.gz"
     manifests = read_manifests_if_cached(
-        prefix="musan", dataset_parts=dataset_parts, output_dir=src_dir
+        dataset_parts=dataset_parts,
+        output_dir=src_dir,
+        prefix=prefix,
+        suffix=suffix,
     )
     assert manifests is not None
 
-    musan_cuts_path = output_dir / "cuts_musan.json.gz"
+    assert len(manifests) == len(dataset_parts), (
+        len(manifests),
+        len(dataset_parts),
+    )
+
+    musan_cuts_path = output_dir / "musan_cuts.jsonl.gz"
 
     if musan_cuts_path.is_file():
         logging.info(f"{musan_cuts_path} already exists - skipping")
@@ -79,13 +89,13 @@ def compute_fbank_musan():
             .filter(lambda c: c.duration > 5)
             .compute_and_store_features(
                 extractor=extractor,
-                storage_path=f"{output_dir}/feats_musan",
+                storage_path=f"{output_dir}/musan_feats",
                 num_jobs=num_jobs if ex is None else 80,
                 executor=ex,
-                storage_type=ChunkedLilcomHdf5Writer,
+                storage_type=LilcomChunkyWriter,
             )
         )
-    musan_cuts.to_json(musan_cuts_path)
+    musan_cuts.to_file(musan_cuts_path)
 
 
 if __name__ == "__main__":
diff --git a/egs/librispeech/ASR/local/validate_manifest.py b/egs/librispeech/ASR/local/validate_manifest.py
index 8d3d4c7ce..4ccc1d353 100755
--- a/egs/librispeech/ASR/local/validate_manifest.py
+++ b/egs/librispeech/ASR/local/validate_manifest.py
@@ -25,7 +25,7 @@ We will add more checks later if needed.
 
 Usage example:
 
     python3 ./local/validate_manifest.py \
-            ./data/fbank/cuts_train-clean-100.json.gz
+            ./data/fbank/librispeech_cuts_train-clean-100.jsonl.gz
 
 """
@@ -33,7 +33,7 @@ import argparse
 import logging
 from pathlib import Path
 
-from lhotse import load_manifest, CutSet
+from lhotse import CutSet, load_manifest
 from lhotse.cut import Cut
 
 
diff --git a/egs/librispeech/ASR/prepare.sh b/egs/librispeech/ASR/prepare.sh
index 8cfb046c8..17a638502 100755
--- a/egs/librispeech/ASR/prepare.sh
+++ b/egs/librispeech/ASR/prepare.sh
@@ -40,9 +40,9 @@ dl_dir=$PWD/download
 # It will generate data/lang_bpe_xxx,
 # data/lang_bpe_yyy if the array contains xxx, yyy
 vocab_sizes=(
-  5000
-  2000
-  1000
+  # 5000
+  # 2000
+  # 1000
   500
 )
 
@@ -132,7 +132,7 @@ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
   )
   for part in ${parts[@]}; do
     python3 ./local/validate_manifest.py \
-      data/fbank/cuts_${part}.json.gz
+      data/fbank/librispeech_cuts_${part}.jsonl.gz
   done
   touch data/fbank/.librispeech-validated.done
 fi
diff --git a/egs/librispeech/ASR/pruned_transducer_stateless/train.py b/egs/librispeech/ASR/pruned_transducer_stateless/train.py
index c360d025a..e6795330f 100755
--- a/egs/librispeech/ASR/pruned_transducer_stateless/train.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless/train.py
@@ -807,28 +807,8 @@ def run(rank, world_size, args):
         # the threshold
         return 1.0 <= c.duration <= 20.0
 
-    num_in_total = len(train_cuts)
-
     train_cuts = train_cuts.filter(remove_short_and_long_utt)
 
-    try:
-        num_left = len(train_cuts)
-        num_removed = num_in_total - num_left
-        removed_percent = num_removed / num_in_total * 100
-
-        logging.info(
-            f"Before removing short and long utterances: {num_in_total}"
-        )
-        logging.info(f"After removing short and long utterances: {num_left}")
-        logging.info(
-            f"Removed {num_removed} utterances ({removed_percent:.5f}%)"
-        )
-    except TypeError as e:
-        # You can ignore this error as previous versions of Lhotse work fine
-        # for the above code. In recent versions of Lhotse, it uses
-        # lazy filter, producing cutsets that don't have the __len__ method
-        logging.info(str(e))
-
     if params.start_batch > 0 and checkpoints and "sampler" in checkpoints:
         # We only load the sampler's state dict when it loads a checkpoint
         # saved in the middle of an epoch
diff --git a/egs/librispeech/ASR/tdnn_lstm_ctc/asr_datamodule.py b/egs/librispeech/ASR/tdnn_lstm_ctc/asr_datamodule.py
index 7628c8274..a4c4e1706 100644
--- a/egs/librispeech/ASR/tdnn_lstm_ctc/asr_datamodule.py
+++ b/egs/librispeech/ASR/tdnn_lstm_ctc/asr_datamodule.py
@@ -225,7 +225,7 @@ class LibriSpeechAsrDataModule:
             logging.info("Enable MUSAN")
             logging.info("About to get Musan cuts")
             cuts_musan = load_manifest(
-                self.args.manifest_dir / "cuts_musan.json.gz"
+                self.args.manifest_dir / "musan_cuts.jsonl.gz"
             )
             transforms.append(
                 CutMix(
@@ -408,39 +408,47 @@ class LibriSpeechAsrDataModule:
     def train_clean_100_cuts(self) -> CutSet:
         logging.info("About to get train-clean-100 cuts")
         return load_manifest(
-            self.args.manifest_dir / "cuts_train-clean-100.json.gz"
+            self.args.manifest_dir / "librispeech_cuts_train-clean-100.jsonl.gz"
         )
 
     @lru_cache()
     def train_clean_360_cuts(self) -> CutSet:
         logging.info("About to get train-clean-360 cuts")
         return load_manifest(
-            self.args.manifest_dir / "cuts_train-clean-360.json.gz"
+            self.args.manifest_dir / "librispeech_cuts_train-clean-360.jsonl.gz"
         )
 
     @lru_cache()
     def train_other_500_cuts(self) -> CutSet:
         logging.info("About to get train-other-500 cuts")
         return load_manifest(
-            self.args.manifest_dir / "cuts_train-other-500.json.gz"
+            self.args.manifest_dir / "librispeech_cuts_train-other-500.jsonl.gz"
         )
 
     @lru_cache()
     def dev_clean_cuts(self) -> CutSet:
         logging.info("About to get dev-clean cuts")
-        return load_manifest(self.args.manifest_dir / "cuts_dev-clean.json.gz")
+        return load_manifest(
+            self.args.manifest_dir / "librispeech_cuts_dev-clean.jsonl.gz"
+        )
 
     @lru_cache()
     def dev_other_cuts(self) -> CutSet:
         logging.info("About to get dev-other cuts")
-        return load_manifest(self.args.manifest_dir / "cuts_dev-other.json.gz")
+        return load_manifest(
+            self.args.manifest_dir / "librispeech_cuts_dev-other.jsonl.gz"
+        )
 
     @lru_cache()
     def test_clean_cuts(self) -> CutSet:
         logging.info("About to get test-clean cuts")
-        return load_manifest(self.args.manifest_dir / "cuts_test-clean.json.gz")
+        return load_manifest(
+            self.args.manifest_dir / "librispeech_cuts_test-clean.jsonl.gz"
+        )
 
     @lru_cache()
     def test_other_cuts(self) -> CutSet:
         logging.info("About to get test-other cuts")
-        return load_manifest(self.args.manifest_dir / "cuts_test-other.json.gz")
+        return load_manifest(
+            self.args.manifest_dir / "librispeech_cuts_test-other.jsonl.gz"
+        )
diff --git a/egs/librispeech/ASR/tdnn_lstm_ctc/train.py b/egs/librispeech/ASR/tdnn_lstm_ctc/train.py
index 8597525ba..827e3ae1f 100755
--- a/egs/librispeech/ASR/tdnn_lstm_ctc/train.py
+++ b/egs/librispeech/ASR/tdnn_lstm_ctc/train.py
@@ -16,6 +16,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+""" +Usage: + export CUDA_VISIBLE_DEVICES="0,1,2,3" + ./tdnn_lstm_ctc/train.py \ + --world-size 4 \ + --full-libri 1 \ + --max-duration 300 \ + --num-epochs 20 +""" import argparse import logging @@ -29,6 +38,7 @@ import torch.multiprocessing as mp import torch.nn as nn import torch.optim as optim from asr_datamodule import LibriSpeechAsrDataModule +from lhotse.cut import Cut from lhotse.utils import fix_random_seed from model import TdnnLstm from torch import Tensor @@ -544,10 +554,25 @@ def run(rank, world_size, args): if params.full_libri: train_cuts += librispeech.train_clean_360_cuts() train_cuts += librispeech.train_other_500_cuts() + + def remove_short_and_long_utt(c: Cut): + # Keep only utterances with duration between 1 second and 20 seconds + # + # Caution: There is a reason to select 20.0 here. Please see + # ../local/display_manifest_statistics.py + # + # You should use ../local/display_manifest_statistics.py to get + # an utterance duration distribution for your dataset to select + # the threshold + return 1.0 <= c.duration <= 20.0 + + train_cuts = train_cuts.filter(remove_short_and_long_utt) + train_dl = librispeech.train_dataloaders(train_cuts) valid_cuts = librispeech.dev_clean_cuts() valid_cuts += librispeech.dev_other_cuts() + valid_dl = librispeech.valid_dataloaders(valid_cuts) for epoch in range(params.start_epoch, params.num_epochs):