diff --git a/egs/aishell4/ASR/local/compute_fbank_aishell4.py b/egs/aishell4/ASR/local/compute_fbank_aishell4.py
index 6cefbfb92..590b56984 100755
--- a/egs/aishell4/ASR/local/compute_fbank_aishell4.py
+++ b/egs/aishell4/ASR/local/compute_fbank_aishell4.py
@@ -29,7 +29,7 @@ import os
 from pathlib import Path
 
 import torch
-from lhotse import ChunkedLilcomHdf5Writer, CutSet, Fbank, FbankConfig
+from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
 from lhotse.recipes.utils import read_manifests_if_cached
 
 from icefall.utils import get_executor
@@ -53,11 +53,13 @@ def compute_fbank_aishell4(num_mel_bins: int = 80):
         "train_L",
         "test",
     )
+    prefix = "aishell4"
+    suffix = "jsonl.gz"
     manifests = read_manifests_if_cached(
         dataset_parts=dataset_parts,
         output_dir=src_dir,
-        prefix="aishell4",
-        suffix="jsonl.gz",
+        prefix=prefix,
+        suffix=suffix,
     )
     assert manifests is not None
 
@@ -65,7 +67,8 @@ def compute_fbank_aishell4(num_mel_bins: int = 80):
 
     with get_executor() as ex:  # Initialize the executor only once.
         for partition, m in manifests.items():
-            if (output_dir / f"cuts_{partition}.jsonl").is_file():
+            cuts_filename = f"{prefix}_cuts_{partition}.{suffix}"
+            if (output_dir / cuts_filename).is_file():
                 logging.info(f"{partition} already exists - skipping.")
                 continue
             logging.info(f"Processing {partition}")
@@ -81,11 +84,11 @@ def compute_fbank_aishell4(num_mel_bins: int = 80):
                 )
             cut_set = cut_set.compute_and_store_features(
                 extractor=extractor,
-                storage_path=f"{output_dir}/feats_{partition}",
+                storage_path=f"{output_dir}/{prefix}_feats_{partition}",
                 # when an executor is specified, make more partitions
                 num_jobs=num_jobs if ex is None else 80,
                 executor=ex,
-                storage_type=ChunkedLilcomHdf5Writer,
+                storage_type=LilcomChunkyWriter,
             )
 
             logging.info("About splitting cuts into smaller chunks")
@@ -94,7 +97,7 @@
             cut_set = cut_set.trim_to_supervisions(
                 min_duration=None,
             )
-            cut_set.to_json(output_dir / f"cuts_{partition}.json.gz")
+            cut_set.to_file(output_dir / cuts_filename)
 
 
 def get_args():
diff --git a/egs/aishell4/ASR/prepare.sh b/egs/aishell4/ASR/prepare.sh
index 3fe7b0f45..c351e3964 100755
--- a/egs/aishell4/ASR/prepare.sh
+++ b/egs/aishell4/ASR/prepare.sh
@@ -48,7 +48,7 @@ if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
   if [ ! -f $dl_dir/aishell4/train_L ]; then
     lhotse download aishell4 $dl_dir/aishell4
   fi
-  
+
   # If you have pre-downloaded it to /path/to/musan,
   # you can create a symlink
   #
@@ -117,9 +117,26 @@ if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
   # Prepare text.
   # Note: in Linux, you can install jq with the following command:
   # wget -O jq https://github.com/stedolan/jq/releases/download/jq-1.6/jq-linux64
-  gunzip -c data/manifests/aishell4/supervisions_train_L.jsonl.gz \
-    | jq ".text" | sed 's/"//g' | sed 's///g' \
-    | ./local/text2token.py -t "char" > $lang_char_dir/text
+  gunzip -c data/manifests/aishell4/aishell4_supervisions_train_S.jsonl.gz \
+    | jq ".text" | sed 's/"//g' \
+    | ./local/text2token.py -t "char" > $lang_char_dir/text_S
+
+  gunzip -c data/manifests/aishell4/aishell4_supervisions_train_M.jsonl.gz \
+    | jq ".text" | sed 's/"//g' \
+    | ./local/text2token.py -t "char" > $lang_char_dir/text_M
+
+  gunzip -c data/manifests/aishell4/aishell4_supervisions_train_L.jsonl.gz \
+    | jq ".text" | sed 's/"//g' \
+    | ./local/text2token.py -t "char" > $lang_char_dir/text_L
+
+  for r in text_S text_M text_L ; do
+    cat $lang_char_dir/$r >> $lang_char_dir/text_full
+  done
+
+  # Normalize text
+  python ./local/text_normalize.py \
+    --input $lang_char_dir/text_full \
+    --output $lang_char_dir/text
 
   # Prepare words segments
   python ./local/text2segments.py \
diff --git a/egs/aishell4/ASR/pruned_transducer_stateless5/asr_datamodule.py b/egs/aishell4/ASR/pruned_transducer_stateless5/asr_datamodule.py
index 67b040387..e8079b63f 100644
--- a/egs/aishell4/ASR/pruned_transducer_stateless5/asr_datamodule.py
+++ b/egs/aishell4/ASR/pruned_transducer_stateless5/asr_datamodule.py
@@ -23,14 +23,8 @@ from pathlib import Path
 from typing import Any, Dict, List, Optional
 
 import torch
-from lhotse import (
-    CutSet,
-    Fbank,
-    FbankConfig,
-    load_manifest,
-    set_caching_enabled,
-)
-from lhotse.dataset import (
+from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy
+from lhotse.dataset import (  # noqa F401 for PrecomputedFeatures
     CutConcatenate,
     CutMix,
     DynamicBucketingSampler,
@@ -39,15 +33,15 @@ from lhotse.dataset import (
     SingleCutSampler,
     SpecAugment,
 )
-from lhotse.dataset.input_strategies import OnTheFlyFeatures
+from lhotse.dataset.input_strategies import (  # noqa F401 for AudioSamples
+    AudioSamples,
+    OnTheFlyFeatures,
+)
 from lhotse.utils import fix_random_seed
 from torch.utils.data import DataLoader
 
 from icefall.utils import str2bool
 
-set_caching_enabled(False)
-torch.set_num_threads(1)
-
 
 class _SeedWorkers:
     def __init__(self, seed: int):
@@ -85,12 +79,14 @@ class Aishell4AsrDataModule:
             "effective batch sizes, sampling strategies, applied data "
             "augmentations, etc.",
         )
+
         group.add_argument(
             "--manifest-dir",
             type=Path,
             default=Path("data/fbank"),
             help="Path to directory with train/valid/test cuts.",
         )
+
         group.add_argument(
             "--max-duration",
             type=int,
@@ -98,6 +94,7 @@ class Aishell4AsrDataModule:
             help="Maximum pooled recordings duration (seconds) in a "
             "single batch. You can reduce it if it causes CUDA OOM.",
         )
+
         group.add_argument(
             "--bucketing-sampler",
             type=str2bool,
@@ -105,6 +102,7 @@
             help="When enabled, the batches will come from buckets of "
             "similar duration (saves padding frames).",
         )
+
         group.add_argument(
             "--num-buckets",
             type=int,
@@ -112,6 +110,7 @@
             help="The number of buckets for the DynamicBucketingSampler"
             "(you might want to increase it for larger datasets).",
         )
+
         group.add_argument(
             "--concatenate-cuts",
             type=str2bool,
@@ -119,6 +118,7 @@
             help="When enabled, utterances (cuts) will be concatenated "
             "to minimize the amount of padding.",
         )
+
         group.add_argument(
             "--duration-factor",
             type=float,
@@ -126,6 +126,7 @@
             help="Determines the maximum duration of a concatenated cut "
             "relative to the duration of the longest cut in a batch.",
         )
+
         group.add_argument(
             "--gap",
             type=float,
@@ -134,6 +135,7 @@
             help="The amount of padding (in seconds) inserted between "
             "concatenated cuts. This padding is filled with noise when "
             "noise augmentation is used.",
         )
+
         group.add_argument(
             "--on-the-fly-feats",
             type=str2bool,
@@ -142,6 +144,7 @@
             help="When enabled, use on-the-fly cut mixing and feature "
             "extraction. Will drop existing precomputed feature manifests "
             "if available.",
         )
+
         group.add_argument(
             "--shuffle",
             type=str2bool,
@@ -149,6 +152,14 @@
             help="When enabled (=default), the examples will be "
             "shuffled for each epoch.",
         )
+
+        group.add_argument(
+            "--drop-last",
+            type=str2bool,
+            default=True,
+            help="Whether to drop last batch. Used by sampler.",
+        )
+
         group.add_argument(
             "--return-cuts",
             type=str2bool,
@@ -192,10 +203,10 @@
 
         group.add_argument(
-            "--lazy-load",
-            type=str2bool,
-            default=True,
-            help="lazily open CutSets to avoid OOM (for L|XL subset)",
+            "--input-strategy",
+            type=str,
+            default="PrecomputedFeatures",
+            help="AudioSamples or PrecomputedFeatures",
         )
 
         group.add_argument(
@@ -218,8 +229,8 @@
             The state dict for the training sampler.
""" logging.info("About to get Musan cuts") - cuts_musan = load_manifest( - self.args.manifest_dir / "cuts_musan.json.gz" + cuts_musan = load_manifest_lazy( + self.args.manifest_dir / "cuts_musan.jsonl.gz" ) transforms = [] @@ -277,6 +288,7 @@ class Aishell4AsrDataModule: logging.info("About to create train dataset") train = K2SpeechRecognitionDataset( + input_strategy=eval(self.args.input_strategy)(), cut_transforms=transforms, input_transforms=input_transforms, return_cuts=self.args.return_cuts, @@ -310,7 +322,7 @@ class Aishell4AsrDataModule: shuffle=self.args.shuffle, num_buckets=self.args.num_buckets, buffer_size=30000, - drop_last=True, + drop_last=self.args.drop_last, ) else: logging.info("Using SingleCutSampler.") @@ -367,8 +379,6 @@ class Aishell4AsrDataModule: valid_sampler = DynamicBucketingSampler( cuts_valid, max_duration=self.args.max_duration, - rank=0, - world_size=1, shuffle=False, ) logging.info("About to create dev dataloader") @@ -393,14 +403,12 @@ class Aishell4AsrDataModule: test = K2SpeechRecognitionDataset( input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80))) if self.args.on_the_fly_feats - else PrecomputedFeatures(), + else eval(self.args.input_strategy)(), return_cuts=self.args.return_cuts, ) sampler = DynamicBucketingSampler( cuts, max_duration=self.args.max_duration, - rank=0, - world_size=1, shuffle=False, ) from lhotse.dataset.iterable_dataset import IterableDatasetWrapper @@ -419,26 +427,22 @@ class Aishell4AsrDataModule: @lru_cache() def train_cuts(self) -> CutSet: logging.info("About to get train cuts") - if self.args.lazy_load: - logging.info("use lazy cuts") - cuts_train = CutSet.from_jsonl_lazy( - self.args.manifest_dir - / f"cuts_train_{self.args.training_subset}.json.gz" - ) - else: - cuts_train = CutSet.from_file( - self.args.manifest_dir - / f"cuts_train_{self.args.training_subset}.json.gz" - ) - return cuts_train + return load_manifest_lazy( + self.args.manifest_dir + / "aishell4_cuts_train_{self.args.training_subset}.jsonl.gz" + ) @lru_cache() def valid_cuts(self) -> CutSet: logging.info("About to get dev cuts") # Aishell4 doesn't have dev data, here use test to replace dev. - return load_manifest(self.args.manifest_dir / "cuts_test.json.gz") + return load_manifest_lazy( + self.args.manifest_dir / "aishell4_cuts_test.jsonl.gz" + ) @lru_cache() def test_cuts(self) -> List[CutSet]: logging.info("About to get test cuts") - return load_manifest(self.args.manifest_dir / "cuts_test.json.gz") + return load_manifest_lazy( + self.args.manifest_dir / "aishell4_cuts_test.jsonl.gz" + )