From 82bd37cacde79ede43e07ac2ac6cd6687b772cc5 Mon Sep 17 00:00:00 2001
From: Bailey Hirota
Date: Fri, 2 May 2025 03:31:55 +0900
Subject: [PATCH] add fbank

---
 .../ASR/local/compute_fbank_mls_english.py    | 167 ++++++++----------
 egs/mls_english/ASR/prepare.sh                | 100 +++++------
 .../ASR/musan-k2-v2-reazonspeech-medium       |   1 +
 3 files changed, 117 insertions(+), 151 deletions(-)
 create mode 160000 egs/reazonspeech/ASR/musan-k2-v2-reazonspeech-medium

diff --git a/egs/mls_english/ASR/local/compute_fbank_mls_english.py b/egs/mls_english/ASR/local/compute_fbank_mls_english.py
index e9bd81551..3826fa376 100644
--- a/egs/mls_english/ASR/local/compute_fbank_mls_english.py
+++ b/egs/mls_english/ASR/local/compute_fbank_mls_english.py
@@ -1,20 +1,8 @@
 #!/usr/bin/env python3
-# Copyright 2023 The University of Electro-Communications (Author: Teo Wen Shen)  # noqa
+# Copyright 2023 The University of Electro-Communications
+# (Author: Teo Wen Shen)
 #
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
+# Apache-2.0

 import argparse
 import logging
@@ -23,119 +11,106 @@ from pathlib import Path
 from typing import List, Tuple

 import torch
-
-# fmt: off
-from lhotse import (  # See the following for why LilcomChunkyWriter is preferred; https://github.com/k2-fsa/icefall/pull/404; https://github.com/lhotse-speech/lhotse/pull/527
-    CutSet,
-    Fbank,
-    FbankConfig,
-    LilcomChunkyWriter,
-    RecordingSet,
-    SupervisionSet,
-)
+from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
 from lhotse.utils import is_module_available

-# fmt: on
-
-# Torch's multithreaded behavior needs to be disabled or
-# it wastes a lot of CPU and slow things down.
-# Do this outside of main() in case it needs to take effect
-# even when we are not invoking the main (e.g. when spawning subprocesses).
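+# Keep these calls at import time (not inside main) so they also take effect
+# in worker subprocesses spawned during parallel processing.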
+# Disable PyTorch intra/inter op threading overhead
 torch.set_num_threads(1)
 torch.set_num_interop_threads(1)

-RNG_SEED = 42
-concat_params = {"gap": 1.0, "maxlen": 10.0}
-

 def make_cutset_blueprints(
     mls_eng_hf_dataset_path: str = "parler-tts/mls_eng",
 ) -> List[Tuple[str, CutSet]]:
-    cut_sets = []
-
     if not is_module_available("datasets"):
         raise ImportError(
-            "To process the MLS English HF corpus, please install optional dependency: pip install datasets"
+            "To process the MLS English HF corpus, please install datasets: pip install datasets"
         )
-
     from datasets import load_dataset
-
     print(f"{mls_eng_hf_dataset_path=}")
     dataset = load_dataset(str(mls_eng_hf_dataset_path))

-    # Create test dataset
-    logging.info("Creating test cuts.")
-    cut_sets.append(
-        (
-            "test",
-            CutSet.from_huggingface_dataset(dataset["test"], text_key="transcript"),
-        )
-    )
-
-    # Create dev dataset
-    logging.info("Creating dev cuts.")
-    cut_sets.append(
-        ("dev", CutSet.from_huggingface_dataset(dataset["dev"], text_key="transcript"))
-    )
-
-    # Create train dataset
-    logging.info("Creating train cuts.")
-    cut_sets.append(
-        (
-            "train",
-            CutSet.from_huggingface_dataset(dataset["train"], text_key="transcript"),
-        )
-    )
-    return cut_sets
+    return [
+        ("test", CutSet.from_huggingface_dataset(dataset["test"], text_key="transcript")),
+        ("dev", CutSet.from_huggingface_dataset(dataset["dev"], text_key="transcript")),
+        ("train", CutSet.from_huggingface_dataset(dataset["train"], text_key="transcript")),
+    ]


 def get_args():
-    parser = argparse.ArgumentParser(
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    p = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
     )
-    parser.add_argument("-m", "--manifest-dir", type=Path)
-    parser.add_argument("-a", "--audio-dir", type=Path)
-    parser.add_argument("-d", "--dl-dir", type=Path)
-    return parser.parse_args()
+    p.add_argument("-m", "--manifest-dir",
+                   type=Path,
+                   default=Path("data/manifests"),
+                   help="Where to write JSONL cuts")
+    p.add_argument("-a", "--audio-dir",
+                   type=Path,
+                   default=Path("data/audio"),
+                   help="Where to copy raw audio")
+    p.add_argument("-d", "--dl-dir",
+                   type=Path,
+                   required=True,
+                   help="Where the HF dataset was cloned")
+    p.add_argument("--fbank-dir",
+                   type=Path,
+                   default=Path("data/fbank"),
+                   help="Where to write FBANK features")
+    return p.parse_args()
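+# Example invocation (the paths mirror how prepare.sh stage 1 calls this
+# script; any writable directories work):
+#   python local/compute_fbank_mls_english.py \
+#     --manifest-dir data/manifests \
+#     --audio-dir data/audio \
+#     --dl-dir download/mls_english \
+#     --fbank-dir data/fbank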


 def main():
     args = get_args()

+    # Make sure our directories exist
+    for d in (args.manifest_dir, args.audio_dir, args.fbank_dir):
+        d.mkdir(parents=True, exist_ok=True)
+
+    # Set up logging first, so that the skip notice below is actually emitted
+    logging.basicConfig(
+        format="%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s",
+        level=logging.INFO,
+    )
+
+    # If we've already computed FBANK, skip.
+    done_marker = args.fbank_dir / ".mls_eng-fbank.done"
+    if done_marker.exists():
+        logging.info("Found done-marker at %s. Skipping FBANK computation.", done_marker)
+        return
+
+    # Prepare Lhotse cut blueprints from the HF dataset
+    cut_sets = make_cutset_blueprints(str(args.dl_dir))
+
+    # Feature extractor
     extractor = Fbank(FbankConfig(num_mel_bins=80))
     num_jobs = min(16, os.cpu_count())

-    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
-
-    logging.basicConfig(format=formatter, level=logging.INFO)
-
-    if (args.manifest_dir / ".mls-eng-fbank.done").exists():
-        logging.info(
-            "Previous fbank computed for MLS English found. "
-            f"Delete {args.manifest_dir / '.mls-eng-fbank.done'} to allow recomputing fbank."
-        )
-        return
-    else:
-        mls_eng_hf_dataset_path = args.dl_dir  # "/root/datasets/parler-tts--mls_eng"
-        cut_sets = make_cutset_blueprints(mls_eng_hf_dataset_path)
-        for part, cut_set in cut_sets:
-            logging.info(f"Processing {part}")
-            cut_set = cut_set.save_audios(
-                num_jobs=num_jobs,
-                storage_path=(args.audio_dir / part).as_posix(),
-            )  # makes new cutset that loads audio from paths to actual audio files
-
-            cut_set = cut_set.compute_and_store_features(
-                extractor=extractor,
-                num_jobs=num_jobs,
-                storage_path=(args.manifest_dir / f"feats_{part}").as_posix(),
-                storage_type=LilcomChunkyWriter,
-            )
-            cut_set.to_file(args.manifest_dir / f"mls_eng_cuts_{part}.jsonl.gz")
-
-        logging.info("All fbank computed for MLS English.")
-        (args.manifest_dir / ".mls-eng-fbank.done").touch()
+    for part, cut_set in cut_sets:
+        logging.info("===== Processing split: %s =====", part)
+
+        # 1) compute & store FBANK features into fbank-dir
+        cut_set = cut_set.compute_and_store_features(
+            extractor=extractor,
+            num_jobs=num_jobs,
+            storage_path=(args.fbank_dir / f"mls_eng_feats_{part}").as_posix(),
+            storage_type=LilcomChunkyWriter,
+        )
+
+        # 2) copy raw audio into audio-dir/<part>
+        cut_set = cut_set.save_audios(args.audio_dir / part)
+
+        # 3) write final cuts JSONL into manifest-dir
+        out_manifest = args.manifest_dir / f"mls_eng_cuts_{part}.jsonl.gz"
+        cut_set.to_file(out_manifest)
+        logging.info("Wrote cuts manifest to %s", out_manifest)
+
+    # Touch the done marker so the next run skips this work
+    done_marker.touch()
+    logging.info("All FBANK computed. Done marker created at %s", done_marker)


 if __name__ == "__main__":
     main()

diff --git a/egs/mls_english/ASR/prepare.sh b/egs/mls_english/ASR/prepare.sh
index 14ca69dae..2daef5667 100755
--- a/egs/mls_english/ASR/prepare.sh
+++ b/egs/mls_english/ASR/prepare.sh
@@ -1,97 +1,87 @@
 #!/usr/bin/env bash

 # Prepare script for MLS English ASR recipe in icefall
-
 # fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
 export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
-
 set -eou pipefail

 stage=-1
 stop_stage=100

 # Configuration for BPE tokenizer
-vocab_sizes=(2000) # You can add more sizes like (500 1000 2000) for comparison
+vocab_sizes=(500) # add more sizes, e.g. (500 1000 2000), for comparison

 # Directory where dataset will be downloaded
 dl_dir=$PWD/download

 . shared/parse_options.sh || exit 1
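+
+# Stages can be run selectively via the flags parsed above, e.g.:
+#   ./prepare.sh --stage 1 --stop-stage 1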

-# All files generated by this script are saved in "data".
-mkdir -p data
-mkdir -p data/audio # Add this line
-mkdir -p data/manifests
-mkdir -p data/lang
+# All files generated by this script are saved in "data/".
+mkdir -p data/manifests data/fbank data/audio data/lang

 log() {
   local fname=${BASH_SOURCE[1]##*/}
   echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
 }

 log "Starting MLS English data preparation"

+# Stage 0: Download corpus
 if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
   log "Stage 0: Download MLS English dataset"
   if [ ! -d $dl_dir/mls_english ]; then
-    if ! git clone https://huggingface.co/datasets/parler-tts/mls_eng $dl_dir/mls_english; then
-      log "Failed to download MLS English dataset"
-      exit 1
-    fi
+    git clone https://huggingface.co/datasets/parler-tts/mls_eng \
+      $dl_dir/mls_english || {
+      log "Failed to download MLS English dataset"; exit 1; }
   fi
 fi

-# if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
-#   log "Stage 1: Prepare MLS English manifest"
-#   # We assume that you have downloaded the MLS English corpus
-#   # to $dl_dir/mls_english
-#   if [ ! -e data/manifests/.mls_english.done ]; then
-#     # lhotse prepare mls_english -j $nj $dl_dir/mls_english data/manifests
-#     python local/utils/save_audios.py --num-jobs 8 --dataset-dir $dl_dir/mls_english --audio-dir ./data/audio --manifest-dir ./data/manifests
-#     touch data/manifests/.mls_english.done
-#   fi
-# fi
-
-if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
-  log "Stage 1: Compute MLS English fbank"
-  if [ ! -e data/manifests/.mls_english-validated.done ]; then
-    python local/compute_fbank_mls_english.py \
-      --manifest-dir data/manifests \
-      --audio-dir data/audio \
-      --dl-dir $dl_dir/mls_english
-      # --dl-dir /root/datasets/parler-tts--mls_eng
-    python local/validate_manifest.py --manifest data/manifests/mls_eng_cuts_train.jsonl.gz
-    python local/validate_manifest.py --manifest data/manifests/mls_eng_cuts_dev.jsonl.gz
-    python local/validate_manifest.py --manifest data/manifests/mls_eng_cuts_test.jsonl.gz
-    touch data/manifests/.mls_english-validated.done
-  fi
-fi
+# Stage 1: Compute fbank & emit manifests
+if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
+  log "Stage 1: Compute & validate MLS English fbank"
+  # Directories were already created by the mkdir -p call above
+  if [ ! -e data/fbank/.mls_eng-fbank.done ]; then
+    python local/compute_fbank_mls_english.py \
+      --manifest-dir data/manifests \
+      --audio-dir data/audio \
+      --dl-dir $dl_dir/mls_english \
+      --fbank-dir data/fbank
+
+    # Validate each split's manifest
+    for split in train dev test; do
+      python local/validate_manifest.py \
+        --manifest data/manifests/mls_eng_cuts_${split}.jsonl.gz
+    done
+
+    touch data/fbank/.mls_eng-fbank.done
+    log "fbank + manifest generation complete."
+  else
+    log "Skipping: fbank already done (data/fbank/.mls_eng-fbank.done exists)."
+  fi
+fi

-if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
-  log "Stage 2: Prepare transcript for BPE training"
-  if [ ! -f data/lang/transcript.txt ]; then
-    log "Generating transcripts for BPE training"
-    python local/utils/generate_transcript.py \
-      --dataset-path $dl_dir/mls_english \
-      --lang-dir data/lang \
-      --split train
-  fi
-fi
+# Stage 2: Prepare transcript for BPE
+if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
+  log "Stage 2: Generate transcript for BPE"
+  if [ ! -f data/lang/transcript.txt ]; then
+    python local/utils/generate_transcript.py \
+      --dataset-path $dl_dir/mls_english \
+      --lang-dir data/lang \
+      --split train
+  fi
+fi

-if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
-  log "Stage 3: Prepare BPE tokenizer"
-  for vocab_size in ${vocab_sizes[@]}; do
-    log "Training BPE model with vocab_size=${vocab_size}"
-    bpe_dir=data/lang/bpe_${vocab_size}
-    mkdir -p $bpe_dir
-    if [ ! -f $bpe_dir/bpe.model ]; then
-      python local/train_bpe_model.py \
-        --lang-dir $bpe_dir \
-        --vocab-size $vocab_size \
-        --transcript data/lang/transcript.txt
-    fi
-  done
-fi
+# Stage 3: Train BPE models
+if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
+  log "Stage 3: Train BPE models"
+  for vocab_size in "${vocab_sizes[@]}"; do
+    bpe_dir=data/lang_bpe_${vocab_size}
+    mkdir -p $bpe_dir
+    if [ ! -f $bpe_dir/bpe.model ]; then
+      python local/train_bpe_model.py \
+        --lang-dir $bpe_dir \
+        --vocab-size $vocab_size \
+        --transcript data/lang/transcript.txt
+    fi
+  done
+fi

-log "MLS English data preparation completed successfully"
\ No newline at end of file
+log "MLS English data preparation completed successfully"

diff --git a/egs/reazonspeech/ASR/musan-k2-v2-reazonspeech-medium b/egs/reazonspeech/ASR/musan-k2-v2-reazonspeech-medium
new file mode 160000
index 000000000..157dc62a6
--- /dev/null
+++ b/egs/reazonspeech/ASR/musan-k2-v2-reazonspeech-medium
@@ -0,0 +1 @@
+Subproject commit 157dc62a6439f03de1fcccb4914fc0d30b6f21b8