Patch summary (three files under egs/librispeech/ASR):

  * prepare_multidataset.sh: removes "Stage 11: Create multidataset"
    (the split / symlink / shuffle-and-mix step).
  * pruned_transducer_stateless7/multidataset.py: MultiDataset takes a
    second argument, cv_manifest_dir; train_cuts() lazily loads the
    LibriSpeech, GigaSpeech XL and CommonVoice manifests and returns
    CutSet.mux(...) of the three instead of reading pre-mixed splits.
  * pruned_transducer_stateless7/train.py: passes params.cv_manifest_dir
    to MultiDataset.

NOTE(review): line breaks, indentation and blank lines below were restored
from a copy whose whitespace had been collapsed. Hunk line counts were
checked against the @@ headers, but exact leading whitespace (especially
on removed and context lines) must be confirmed against the repository
before applying.
NOTE(review): relative to that copy, three added lines drop an f-prefix
that had no placeholders and the added GigaSpeech regex escapes its
literal dots, so the "index" blob hashes no longer describe the result.

diff --git a/egs/librispeech/ASR/prepare_multidataset.sh b/egs/librispeech/ASR/prepare_multidataset.sh
index c068305c0..8b13a5bd8 100755
--- a/egs/librispeech/ASR/prepare_multidataset.sh
+++ b/egs/librispeech/ASR/prepare_multidataset.sh
@@ -328,46 +328,3 @@ if [ $stage -le 10 ] && [ $stop_stage -ge 10 ]; then
     ./prepare_common_voice.sh
   fi
 fi
-
-if [ $stage -le 11 ] && [ $stop_stage -ge 11 ]; then
-  log "Stage 11: Create multidataset"
-  split_dir=data/fbank/multidataset_split_${num_splits}
-  if [ ! -f data/fbank/multidataset_split/.multidataset.done ]; then
-    mkdir -p $split_dir/multidataset
-    log "Split LibriSpeech"
-    if [ ! -f $split_dir/.librispeech_split.done ]; then
-      lhotse split $num_splits ./data/fbank/librispeech_cuts_train-all-shuf.jsonl.gz $split_dir
-      touch $split_dir/.librispeech_split.done
-    fi
-
-    if [[ "${multidataset[@]}" =~ "gigaspeech" ]]; then
-      log "Split GigaSpeech XL"
-      if [ ! -f $split_dir/.gigaspeech_XL_split.done ]; then
-        cd $split_dir
-        ln -sv ../gigaspeech_XL_split_2000/gigaspeech_cuts_XL.*.jsonl.gz .
-        cd ../../..
-        touch $split_dir/.gigaspeech_XL_split.done
-      fi
-    fi
-
-    if [[ "${multidataset[@]}" =~ "commonvoice" ]]; then
-      log "Split CommonVoice"
-      if [ ! -f $split_dir/.cv-en_train_split.done ]; then
-        lhotse split $num_splits ./data/en/fbank/cv-en_cuts_train.jsonl.gz $split_dir
-        touch $split_dir/.cv-en_train_split.done
-      fi
-    fi
-
-    if [ ! -f $split_dir/.multidataset_mix.done ]; then
-      log "Mix multidataset"
-      for ((seq=1; seq<=$num_splits; seq++)); do
-        fseq=$(printf "%04d" $seq)
-        gunzip -c $split_dir/*.*${fseq}.jsonl.gz | \
-          shuf | gzip -c > $split_dir/multidataset/multidataset_cuts_train.${fseq}.jsonl.gz
-      done
-      touch $split_dir/.multidataset_mix.done
-    fi
-
-    touch data/fbank/multidataset_split/.multidataset.done
-  fi
-fi
diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7/multidataset.py b/egs/librispeech/ASR/pruned_transducer_stateless7/multidataset.py
index dcb4cd141..07c7126fa 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless7/multidataset.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7/multidataset.py
@@ -25,29 +25,53 @@ from lhotse import CutSet, load_manifest_lazy
 
 
 class MultiDataset:
-    def __init__(self, manifest_dir: str):
+    def __init__(self, manifest_dir: str, cv_manifest_dir: str):
         """
         Args:
           manifest_dir:
             It is expected to contain the following files:
 
-            multidataset_split_1998/multidataset/multidataset_cuts_train.*.jsonl.gz
+            - librispeech_cuts_train-all-shuf.jsonl.gz
+            - gigaspeech_XL_split_2000/gigaspeech_cuts_XL.*.jsonl.gz
+
+          cv_manifest_dir:
+            It is expected to contain the following files:
+
+            - cv-en_cuts_train.jsonl.gz
         """
         self.manifest_dir = Path(manifest_dir)
+        self.cv_manifest_dir = Path(cv_manifest_dir)
 
     def train_cuts(self) -> CutSet:
         logging.info("About to get multidataset train cuts")
 
-        filenames = glob.glob(
-            f"{self.manifest_dir}/multidataset_split_1998/multidataset/multidataset_cuts_train.*.jsonl.gz"
+        # LibriSpeech
+        logging.info("Loading LibriSpeech in lazy mode")
+        librispeech_cuts = load_manifest_lazy(
+            self.manifest_dir / "librispeech_cuts_train-all-shuf.jsonl.gz"
         )
 
-        pattern = re.compile(r"multidataset_cuts_train.([0-9]+).jsonl.gz")
+        # GigaSpeech
+        filenames = glob.glob(
+            f"{self.manifest_dir}/gigaspeech_XL_split_2000/gigaspeech_cuts_XL.*.jsonl.gz"
+        )
+
+        pattern = re.compile(r"gigaspeech_cuts_XL\.([0-9]+)\.jsonl\.gz")
         idx_filenames = ((int(pattern.search(f).group(1)), f) for f in filenames)
         idx_filenames = sorted(idx_filenames, key=lambda x: x[0])
 
         sorted_filenames = [f[1] for f in idx_filenames]
 
-        logging.info(f"Loading {len(sorted_filenames)} splits")
+        logging.info(f"Loading GigaSpeech {len(sorted_filenames)} splits in lazy mode")
 
-        return lhotse.combine(lhotse.load_manifest_lazy(p) for p in sorted_filenames)
+        gigaspeech_cuts = lhotse.combine(
+            lhotse.load_manifest_lazy(p) for p in sorted_filenames
+        )
+
+        # CommonVoice
+        logging.info("Loading CommonVoice in lazy mode")
+        commonvoice_cuts = load_manifest_lazy(
+            self.cv_manifest_dir / "cv-en_cuts_train.jsonl.gz"
+        )
+
+        return CutSet.mux(librispeech_cuts, gigaspeech_cuts, commonvoice_cuts)
diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7/train.py b/egs/librispeech/ASR/pruned_transducer_stateless7/train.py
index 01c9500ce..1b179ceff 100755
--- a/egs/librispeech/ASR/pruned_transducer_stateless7/train.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7/train.py
@@ -1053,7 +1053,7 @@ def run(rank, world_size, args):
     librispeech = LibriSpeechAsrDataModule(args)
 
     if params.use_multidataset:
-        multidataset = MultiDataset(params.manifest_dir)
+        multidataset = MultiDataset(params.manifest_dir, params.cv_manifest_dir)
         train_cuts = multidataset.train_cuts()
     else:
         if params.full_libri: