From 3532edbc36e8dbe2fd608c3d88ae037691592d88 Mon Sep 17 00:00:00 2001 From: yfyeung Date: Tue, 8 Jul 2025 07:38:30 +0000 Subject: [PATCH] refactor gigaspeech data preparation update update update update update fix fix update fix fix fix fix update update --- egs/gigaspeech/ASR/local/compile_lg.py | 1 + .../ASR/local/compute_fbank_gigaspeech.py | 10 +- .../local/compute_fbank_gigaspeech_splits.py | 41 +-- .../convert_transcript_words_to_tokens.py | 1 - .../ASR/local/preprocess_gigaspeech.py | 37 +-- .../ASR/local/validate_bpe_lexicon.py | 1 + egs/gigaspeech/ASR/prepare.sh | 266 ++++++------------ egs/gigaspeech/ASR/prepare_lm.sh | 98 +++++++ .../ASR/zipformer/asr_datamodule.py | 17 +- egs/gigaspeech/ASR/zipformer/train.py | 11 +- egs/libriheavy/ASR/prepare.sh | 1 - 11 files changed, 242 insertions(+), 242 deletions(-) create mode 120000 egs/gigaspeech/ASR/local/compile_lg.py delete mode 120000 egs/gigaspeech/ASR/local/convert_transcript_words_to_tokens.py create mode 120000 egs/gigaspeech/ASR/local/validate_bpe_lexicon.py create mode 100755 egs/gigaspeech/ASR/prepare_lm.sh diff --git a/egs/gigaspeech/ASR/local/compile_lg.py b/egs/gigaspeech/ASR/local/compile_lg.py new file mode 120000 index 000000000..462d6d3fb --- /dev/null +++ b/egs/gigaspeech/ASR/local/compile_lg.py @@ -0,0 +1 @@ +../../../librispeech/ASR/local/compile_lg.py \ No newline at end of file diff --git a/egs/gigaspeech/ASR/local/compute_fbank_gigaspeech.py b/egs/gigaspeech/ASR/local/compute_fbank_gigaspeech.py index 9e0df0989..14353008c 100755 --- a/egs/gigaspeech/ASR/local/compute_fbank_gigaspeech.py +++ b/egs/gigaspeech/ASR/local/compute_fbank_gigaspeech.py @@ -32,13 +32,21 @@ torch.set_num_interop_threads(1) def compute_fbank_gigaspeech(): in_out_dir = Path("data/fbank") + # number of workers in dataloader num_workers = 20 # number of seconds in a batch batch_duration = 1000 - subsets = ("L", "M", "S", "XS", "DEV", "TEST") + subsets = ( + "DEV", + "TEST", + # "L", + # "M", + # "S", + # "XS", + ) device = torch.device("cpu") if torch.cuda.is_available(): diff --git a/egs/gigaspeech/ASR/local/compute_fbank_gigaspeech_splits.py b/egs/gigaspeech/ASR/local/compute_fbank_gigaspeech_splits.py index 51cd59078..12abb9391 100755 --- a/egs/gigaspeech/ASR/local/compute_fbank_gigaspeech_splits.py +++ b/egs/gigaspeech/ASR/local/compute_fbank_gigaspeech_splits.py @@ -18,7 +18,6 @@ import argparse import logging -from datetime import datetime from pathlib import Path import torch @@ -32,7 +31,7 @@ torch.set_num_threads(1) torch.set_num_interop_threads(1) -def get_parser(): +def get_args(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter ) @@ -71,17 +70,15 @@ def get_parser(): default=-1, help="Stop processing pieces until this number (exclusive).", ) - return parser + return parser.parse_args() def compute_fbank_gigaspeech_splits(args): num_splits = args.num_splits - output_dir = f"data/fbank/XL_split" + output_dir = "data/fbank/gigaspeech_XL_split" output_dir = Path(output_dir) assert output_dir.exists(), f"{output_dir} does not exist!" 
- num_digits = 8 # num_digits is fixed by lhotse split-lazy - start = args.start stop = args.stop if stop < start: @@ -95,6 +92,7 @@ def compute_fbank_gigaspeech_splits(args): extractor = KaldifeatFbank(KaldifeatFbankConfig(device=device)) logging.info(f"device: {device}") + num_digits = 8 # num_digits is fixed by lhotse split-lazy for i in range(start, stop): idx = f"{i}".zfill(num_digits) logging.info(f"Processing {idx}/{num_splits}") @@ -105,15 +103,21 @@ def compute_fbank_gigaspeech_splits(args): continue raw_cuts_path = output_dir / f"gigaspeech_cuts_XL_raw.{idx}.jsonl.gz" + if not raw_cuts_path.is_file(): + logging.info(f"{raw_cuts_path} does not exist - skipping it") + continue logging.info(f"Loading {raw_cuts_path}") cut_set = CutSet.from_file(raw_cuts_path) logging.info("Computing features") + if (output_dir / f"gigaspeech_feats_XL_{idx}.lca").exists(): + logging.info(f"Removing {output_dir}/gigaspeech_feats_XL_{idx}.lca") + os.remove(output_dir / f"gigaspeech_feats_XL_{idx}.lca") cut_set = cut_set.compute_and_store_features_batch( extractor=extractor, - storage_path=f"{output_dir}/gigaspeech_feats_{idx}", + storage_path=f"{output_dir}/gigaspeech_feats_XL_{idx}", num_workers=args.num_workers, batch_duration=args.batch_duration, overwrite=True, @@ -130,29 +134,10 @@ def compute_fbank_gigaspeech_splits(args): def main(): - now = datetime.now() - date_time = now.strftime("%Y-%m-%d-%H-%M-%S") - - log_filename = "log-compute_fbank_gigaspeech_splits" formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s" - log_filename = f"{log_filename}-{date_time}" - - logging.basicConfig( - filename=log_filename, - format=formatter, - level=logging.INFO, - filemode="w", - ) - - console = logging.StreamHandler() - console.setLevel(logging.INFO) - console.setFormatter(logging.Formatter(formatter)) - logging.getLogger("").addHandler(console) - - parser = get_parser() - args = parser.parse_args() - logging.info(vars(args)) + logging.basicConfig(format=formatter, level=logging.INFO) + args = get_args() compute_fbank_gigaspeech_splits(args) diff --git a/egs/gigaspeech/ASR/local/convert_transcript_words_to_tokens.py b/egs/gigaspeech/ASR/local/convert_transcript_words_to_tokens.py deleted file mode 120000 index 2ce13fd69..000000000 --- a/egs/gigaspeech/ASR/local/convert_transcript_words_to_tokens.py +++ /dev/null @@ -1 +0,0 @@ -../../../librispeech/ASR/local/convert_transcript_words_to_tokens.py \ No newline at end of file diff --git a/egs/gigaspeech/ASR/local/preprocess_gigaspeech.py b/egs/gigaspeech/ASR/local/preprocess_gigaspeech.py index a31685211..5bc881ab3 100755 --- a/egs/gigaspeech/ASR/local/preprocess_gigaspeech.py +++ b/egs/gigaspeech/ASR/local/preprocess_gigaspeech.py @@ -30,18 +30,6 @@ from icefall.utils import str2bool # https://github.com/SpeechColab/GigaSpeech/blob/main/toolkits/kaldi/gigaspeech_data_prep.sh -def get_args(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--perturb-speed", - type=str2bool, - default=False, - help="Whether to use speed perturbation.", - ) - - return parser.parse_args() - - def normalize_text( utt: str, punct_pattern=re.compile(r"<(COMMA|PERIOD|QUESTIONMARK|EXCLAMATIONPOINT)>"), @@ -57,7 +45,7 @@ def has_no_oov( return oov_pattern.search(sup.text) is None -def preprocess_giga_speech(args): +def preprocess_gigaspeech(): src_dir = Path("data/manifests") output_dir = Path("data/fbank") output_dir.mkdir(exist_ok=True) @@ -66,10 +54,10 @@ def preprocess_giga_speech(args): "DEV", "TEST", "XL", - "L", - "M", - "S", - "XS", + # "L", 
+ # "M", + # "S", + # "XS", ) logging.info("Loading manifest (may take 4 minutes)") @@ -110,17 +98,7 @@ def preprocess_giga_speech(args): recordings=m["recordings"], supervisions=m["supervisions"], ) - # Run data augmentation that needs to be done in the - # time domain. - if partition not in ["DEV", "TEST"]: - if args.perturb_speed: - logging.info( - f"Speed perturb for {partition} with factors 0.9 and 1.1 " - "(Perturbing may take 8 minutes and saving may take 20 minutes)" - ) - cut_set = ( - cut_set + cut_set.perturb_speed(0.9) + cut_set.perturb_speed(1.1) - ) + logging.info(f"Saving to {raw_cuts_path}") cut_set.to_file(raw_cuts_path) @@ -129,8 +107,7 @@ def main(): formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s" logging.basicConfig(format=formatter, level=logging.INFO) - args = get_args() - preprocess_giga_speech(args) + preprocess_gigaspeech() if __name__ == "__main__": diff --git a/egs/gigaspeech/ASR/local/validate_bpe_lexicon.py b/egs/gigaspeech/ASR/local/validate_bpe_lexicon.py new file mode 120000 index 000000000..721bb48e7 --- /dev/null +++ b/egs/gigaspeech/ASR/local/validate_bpe_lexicon.py @@ -0,0 +1 @@ +../../../librispeech/ASR/local/validate_bpe_lexicon.py \ No newline at end of file diff --git a/egs/gigaspeech/ASR/prepare.sh b/egs/gigaspeech/ASR/prepare.sh index 219197e13..f1c0be692 100755 --- a/egs/gigaspeech/ASR/prepare.sh +++ b/egs/gigaspeech/ASR/prepare.sh @@ -6,12 +6,24 @@ export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python set -eou pipefail nj=15 -stage=0 -stop_stage=100 -# Split XL subset to a number of pieces (about 2000) -# This is to avoid OOM during feature extraction. -num_per_split=50 +# Run step 0 to step 8 by default +stage=0 +stop_stage=8 + +# Compute fbank features for a subset of splits from `start` (inclusive) to `stop` (exclusive) +start=0 +stop=-1 + +# Note: This script just prepare the minimal requirements that needed by a +# transducer training with bpe units. +# +# If you want to use ngram, please continue running prepare_lm.sh after +# you succeed running this script. +# +# This script also contains the steps to generate phone based units, but they +# will not run automatically, you can generate the phone based units by +# bash prepare.sh --stage 9 --stop-stage 9 # We assume dl_dir (download dir) contains the following # directories and files. If not, they will be downloaded @@ -34,9 +46,10 @@ num_per_split=50 # This directory contains the following directories downloaded from # http://www.openslr.org/17/ # -# - music -# - noise -# - speech +# - music +# - noise +# - speech + dl_dir=$PWD/download . shared/parse_options.sh || exit 1 @@ -45,6 +58,9 @@ dl_dir=$PWD/download # It will generate data/lang_bpe_xxx, # data/lang_bpe_yyy if the array contains xxx, yyy vocab_sizes=( + # 5000 + # 2000 + # 1000 500 ) @@ -58,10 +74,12 @@ log() { echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" } +log "Running prepare.sh" + log "dl_dir: $dl_dir" if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then - log "stage -1: Download LM" + log "Stage -1: Download LM" # We assume that you have installed the git-lfs, if not, you could install it # using: `sudo apt-get install git-lfs && git-lfs install` [ ! -e $dl_dir/lm ] && mkdir -p $dl_dir/lm @@ -78,7 +96,7 @@ if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then # If you have pre-downloaded it to /path/to/GigaSpeech, # you can create a symlink # - # ln -sfv /path/to/GigaSpeech $dl_dir/GigaSpeech + # ln -svf /path/to/GigaSpeech $dl_dir/GigaSpeech # if [ ! 
-d $dl_dir/GigaSpeech/audio ] && [ ! -f $dl_dir/GigaSpeech.json ]; then # Check credentials. @@ -88,32 +106,37 @@ if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then echo " and save it to $dl_dir/password." exit 1; fi + PASSWORD=`cat $dl_dir/password 2>/dev/null` if [ -z "$PASSWORD" ]; then echo "$0: Error, $dl_dir/password is empty." exit 1; fi + PASSWORD_MD5=`echo $PASSWORD | md5sum | cut -d ' ' -f 1` if [[ $PASSWORD_MD5 != "dfbf0cde1a3ce23749d8d81e492741b8" ]]; then echo "$0: Error, invalid $dl_dir/password." exit 1; fi + # Download XL, DEV and TEST sets by default. - lhotse download gigaspeech --subset XL \ - --subset L \ - --subset M \ - --subset S \ - --subset XS \ + # Support hosts: + # 1. oss + # 2. tsinghua + # 3. speechocean + # 4. magicdata + lhotse download gigaspeech \ + --host magicdata \ --subset DEV \ --subset TEST \ - --host tsinghua \ + --subset XL \ $dl_dir/password $dl_dir/GigaSpeech fi # If you have pre-downloaded it to /path/to/musan, # you can create a symlink # - # ln -sfv /path/to/musan $dl_dir/ + # ln -svf /path/to/musan $dl_dir/ # if [ ! -d $dl_dir/musan ]; then lhotse download musan $dl_dir @@ -125,11 +148,8 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then # We assume that you have downloaded the GigaSpeech corpus # to $dl_dir/GigaSpeech mkdir -p data/manifests - lhotse prepare gigaspeech --subset XL \ - --subset L \ - --subset M \ - --subset S \ - --subset XS \ + lhotse prepare gigaspeech \ + --subset XL \ --subset DEV \ --subset TEST \ -j $nj \ @@ -147,19 +167,20 @@ fi if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then log "State 3: Preprocess GigaSpeech manifest" if [ ! -f data/fbank/.preprocess_complete ]; then - python3 ./local/preprocess_gigaspeech.py - touch data/fbank/.preprocess_complete + python3 ./local/preprocess_gigaspeech.py + touch data/fbank/.preprocess_complete fi fi if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then - log "Stage 4: Compute features for L, M, S, XS, DEV and TEST subsets of GigaSpeech." + log "Stage 4: Compute features for DEV, TEST, L, M, S, and XS subsets of GigaSpeech." python3 ./local/compute_fbank_gigaspeech.py fi if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then - log "Stage 5: Split XL subset into pieces (may take 30 minutes)" - split_dir=data/fbank/XL_split + log "Stage 5: Split XL subset into pieces (may take 5 minutes)" + num_per_split=50 + split_dir=data/fbank/gigaspeech_XL_split if [ ! -f $split_dir/.split_completed ]; then lhotse split-lazy ./data/fbank/gigaspeech_cuts_XL_raw.jsonl.gz $split_dir $num_per_split touch $split_dir/.split_completed @@ -168,82 +189,63 @@ fi if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then log "Stage 6: Compute features for XL" - num_splits=$(find data/fbank/XL_split -name "gigaspeech_cuts_XL_raw.*.jsonl.gz" | wc -l) + split_dir=data/fbank/gigaspeech_XL_split + num_splits=$(find $split_dir -name "gigaspeech_cuts_XL_raw.*.jsonl.gz" | wc -l) python3 ./local/compute_fbank_gigaspeech_splits.py \ --num-workers 20 \ --batch-duration 600 \ - --num-splits $num_splits + --num-splits $num_splits \ + --start $start \ + --stop $stop fi if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then - log "Stage 7: Combine features for XL (may take 3 hours)" - if [ ! 
-f data/fbank/gigaspeech_cuts_XL.jsonl.gz ]; then - pieces=$(find data/fbank/XL_split -name "gigaspeech_cuts_XL.*.jsonl.gz") - lhotse combine $pieces data/fbank/gigaspeech_cuts_XL.jsonl.gz - fi -fi - -if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then - log "Stage 8: Compute fbank for musan" + log "Stage 7: Compute fbank for musan" mkdir -p data/fbank ./local/compute_fbank_musan.py fi -if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then - log "Stage 9: Prepare transcript_words.txt and words.txt" - lang_dir=data/lang_phone - mkdir -p $lang_dir - if [ ! -f $lang_dir/transcript_words.txt ]; then - gunzip -c "data/manifests/gigaspeech_supervisions_XL.jsonl.gz" \ - | jq '.text' \ - | sed 's/"//g' \ - > $lang_dir/transcript_words.txt +if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then + log "Stage 8: Prepare BPE based lang" + for vocab_size in ${vocab_sizes[@]}; do + lang_dir=data/lang_bpe_${vocab_size} + mkdir -p $lang_dir - # Delete utterances with garbage meta tags - garbage_utterance_tags=" " - for tag in $garbage_utterance_tags; do - sed -i "/${tag}/d" $lang_dir/transcript_words.txt - done + if [ ! -f $lang_dir/transcript_words.txt ]; then + log "Generate data for BPE training" + gunzip -c "data/manifests/gigaspeech_supervisions_XL.jsonl.gz" \ + | jq '.text' \ + | sed 's/"//g' \ + > $lang_dir/transcript_words.txt - # Delete punctuations in utterances - punctuation_tags=" " - for tag in $punctuation_tags; do - sed -i "s/${tag}//g" $lang_dir/transcript_words.txt - done + # Delete utterances with garbage meta tags + garbage_utterance_tags=" " + for tag in $garbage_utterance_tags; do + sed -i "/${tag}/d" $lang_dir/transcript_words.txt + done - # Ensure space only appears once - sed -i 's/\t/ /g' $lang_dir/transcript_words.txt - sed -i 's/[ ][ ]*/ /g' $lang_dir/transcript_words.txt - fi + # Delete punctuations in utterances + punctuation_tags=" " + for tag in $punctuation_tags; do + sed -i "s/${tag}//g" $lang_dir/transcript_words.txt + done - cat $lang_dir/transcript_words.txt | sed 's/ /\n/g' \ - | sort -u | sed '/^$/d' > $lang_dir/words.txt - (echo '!SIL'; echo ''; echo ''; ) | - cat - $lang_dir/words.txt | sort | uniq | awk ' - BEGIN { - print " 0"; - } - { - if ($1 == "") { - print " is in the vocabulary!" | "cat 1>&2" - exit 1; - } - if ($1 == "") { - print " is in the vocabulary!" | "cat 1>&2" - exit 1; - } - printf("%s %d\n", $1, NR); - } - END { - printf("#0 %d\n", NR+1); - printf(" %d\n", NR+2); - printf(" %d\n", NR+3); - }' > $lang_dir/words || exit 1; - mv $lang_dir/words $lang_dir/words.txt + # Ensure space only appears once + sed -i 's/\t/ /g' $lang_dir/transcript_words.txt + sed -i 's/[ ][ ]*/ /g' $lang_dir/transcript_words.txt + fi + + if [ ! -f $lang_dir/bpe.model ]; then + ./local/train_bpe_model.py \ + --lang-dir $lang_dir \ + --vocab-size $vocab_size \ + --transcript $lang_dir/transcript_words.txt + fi + done fi -if [ $stage -le 10 ] && [ $stop_stage -ge 10 ]; then - log "Stage 10: Prepare phone based lang" +if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then + log "Stage 9: Prepare phone based lang" lang_dir=data/lang_phone mkdir -p $lang_dir @@ -255,93 +257,3 @@ if [ $stage -le 10 ] && [ $stop_stage -ge 10 ]; then ./local/prepare_lang.py --lang-dir $lang_dir fi fi - -if [ $stage -le 11 ] && [ $stop_stage -ge 11 ]; then - log "Stage 11: Prepare BPE based lang" - - for vocab_size in ${vocab_sizes[@]}; do - lang_dir=data/lang_bpe_${vocab_size} - mkdir -p $lang_dir - # We reuse words.txt from phone based lexicon - # so that the two can share G.pt later. 
- cp data/lang_phone/{words.txt,transcript_words.txt} $lang_dir - - if [ ! -f $lang_dir/bpe.model ]; then - ./local/train_bpe_model.py \ - --lang-dir $lang_dir \ - --vocab-size $vocab_size \ - --transcript $lang_dir/transcript_words.txt - fi - - if [ ! -f $lang_dir/L_disambig.pt ]; then - ./local/prepare_lang_bpe.py --lang-dir $lang_dir - fi - done -fi - -if [ $stage -le 12 ] && [ $stop_stage -ge 12 ]; then - log "Stage 12: Prepare bigram P" - - for vocab_size in ${vocab_sizes[@]}; do - lang_dir=data/lang_bpe_${vocab_size} - - if [ ! -f $lang_dir/transcript_tokens.txt ]; then - ./local/convert_transcript_words_to_tokens.py \ - --lexicon $lang_dir/lexicon.txt \ - --transcript $lang_dir/transcript_words.txt \ - --oov "" \ - > $lang_dir/transcript_tokens.txt - fi - - if [ ! -f $lang_dir/P.arpa ]; then - ./shared/make_kn_lm.py \ - -ngram-order 2 \ - -text $lang_dir/transcript_tokens.txt \ - -lm $lang_dir/P.arpa - fi - - if [ ! -f $lang_dir/P.fst.txt ]; then - python3 -m kaldilm \ - --read-symbol-table="$lang_dir/tokens.txt" \ - --disambig-symbol='#0' \ - --max-order=2 \ - $lang_dir/P.arpa > $lang_dir/P.fst.txt - fi - done -fi - -if [ $stage -le 13 ] && [ $stop_stage -ge 13 ]; then - log "Stage 13: Prepare G" - # We assume you have installed kaldilm, if not, please install - # it using: pip install kaldilm - - mkdir -p data/lm - - if [ ! -f data/lm/G_3_gram.fst.txt ]; then - # It is used in building HLG - python3 -m kaldilm \ - --read-symbol-table="data/lang_phone/words.txt" \ - --disambig-symbol='#0' \ - --max-order=3 \ - $dl_dir/lm/3gram_pruned_1e7.arpa > data/lm/G_3_gram.fst.txt - fi - - if [ ! -f data/lm/G_4_gram.fst.txt ]; then - # It is used for LM rescoring - python3 -m kaldilm \ - --read-symbol-table="data/lang_phone/words.txt" \ - --disambig-symbol='#0' \ - --max-order=4 \ - $dl_dir/lm/4gram.arpa > data/lm/G_4_gram.fst.txt - fi -fi - -if [ $stage -le 14 ] && [ $stop_stage -ge 14 ]; then - log "Stage 14: Compile HLG" - ./local/compile_hlg.py --lang-dir data/lang_phone - - for vocab_size in ${vocab_sizes[@]}; do - lang_dir=data/lang_bpe_${vocab_size} - ./local/compile_hlg.py --lang-dir $lang_dir - done -fi diff --git a/egs/gigaspeech/ASR/prepare_lm.sh b/egs/gigaspeech/ASR/prepare_lm.sh new file mode 100755 index 000000000..a6954a4de --- /dev/null +++ b/egs/gigaspeech/ASR/prepare_lm.sh @@ -0,0 +1,98 @@ +#!/usr/bin/env bash + +# fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674 +export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python + +set -eou pipefail + +# This script generate Ngram LM and related files that needed by decoding. + +# We assume dl_dir (download dir) contains the following +# directories and files. If not, they will be downloaded +# by this script automatically. +# +# - $dl_dir/lm +# This directory contains the language model downloaded from +# https://huggingface.co/wgb14/gigaspeech_lm +# +# - 3gram_pruned_1e7.arpa.gz +# - 4gram.arpa.gz +# - lexicon.txt + +. prepare.sh --stage -1 --stop-stage 9 || exit 1 + +stage=0 +stop_stage=100 + +. shared/parse_options.sh || exit 1 + +log "Running prepare_lm.sh" + +if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then + log "Stage 1: Prepare BPE based lexicon" + + for vocab_size in ${vocab_sizes[@]}; do + lang_dir=data/lang_bpe_${vocab_size} + mkdir -p $lang_dir + + # We reuse words.txt from phone based lexicon + # so that the two can share G.pt later. + cp data/lang_phone/words.txt $lang_dir + + if [ ! 
-f $lang_dir/L_disambig.pt ]; then + ./local/prepare_lang_bpe.py --lang-dir $lang_dir + + log "Validating $lang_dir/lexicon.txt" + ./local/validate_bpe_lexicon.py \ + --lexicon $lang_dir/lexicon.txt \ + --bpe-model $lang_dir/bpe.model + fi + done +fi + +if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then + log "Stage 2: Prepare word-level G" + # We assume you have installed kaldilm, if not, please install + # it using: pip install kaldilm + + mkdir -p data/lm + + if [ ! -f data/lm/G_3_gram.fst.txt ]; then + # It is used in building HLG + python3 -m kaldilm \ + --read-symbol-table="data/lang_phone/words.txt" \ + --disambig-symbol='#0' \ + --max-order=3 \ + $dl_dir/lm/3gram_pruned_1e7.arpa > data/lm/G_3_gram.fst.txt + fi + + if [ ! -f data/lm/G_4_gram.fst.txt ]; then + # It is used for LM rescoring + python3 -m kaldilm \ + --read-symbol-table="data/lang_phone/words.txt" \ + --disambig-symbol='#0' \ + --max-order=4 \ + $dl_dir/lm/4gram.arpa > data/lm/G_4_gram.fst.txt + fi +fi + +if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then + log "Stage 3: Compile HLG" + ./local/compile_hlg.py --lang-dir data/lang_phone + + for vocab_size in ${vocab_sizes[@]}; do + lang_dir=data/lang_bpe_${vocab_size} + ./local/compile_hlg.py --lang-dir $lang_dir + done +fi + +if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then + log "Stage 4: Compile LG" + # It is used for for RNN-T fast_beam_search decoding + ./local/compile_lg.py --lang-dir data/lang_phone + + for vocab_size in ${vocab_sizes[@]}; do + lang_dir=data/lang_bpe_${vocab_size} + ./local/compile_lg.py --lang-dir $lang_dir + done +fi diff --git a/egs/gigaspeech/ASR/zipformer/asr_datamodule.py b/egs/gigaspeech/ASR/zipformer/asr_datamodule.py index 892817b55..93a41b27a 100644 --- a/egs/gigaspeech/ASR/zipformer/asr_datamodule.py +++ b/egs/gigaspeech/ASR/zipformer/asr_datamodule.py @@ -219,6 +219,8 @@ class GigaSpeechAsrDataModule: self, cuts_train: CutSet, sampler_state_dict: Optional[Dict[str, Any]] = None, + world_size: Optional[int] = None, + rank: Optional[int] = None, ) -> DataLoader: """ Args: @@ -313,6 +315,8 @@ class GigaSpeechAsrDataModule: num_buckets=self.args.num_buckets, buffer_size=self.args.num_buckets * 5000, drop_last=self.args.drop_last, + world_size=world_size, + rank=rank, ) else: logging.info("Using SimpleCutSampler.") @@ -320,6 +324,8 @@ class GigaSpeechAsrDataModule: cuts_train, max_duration=self.args.max_duration, shuffle=self.args.shuffle, + world_size=world_size, + rank=rank, ) logging.info("About to create train dataloader") @@ -343,7 +349,12 @@ class GigaSpeechAsrDataModule: return train_dl - def valid_dataloaders(self, cuts_valid: CutSet) -> DataLoader: + def valid_dataloaders( + self, + cuts_valid: CutSet, + world_size: Optional[int] = None, + rank: Optional[int] = None, + ) -> DataLoader: transforms = [] if self.args.concatenate_cuts: transforms = [ @@ -370,6 +381,8 @@ class GigaSpeechAsrDataModule: num_buckets=self.args.num_buckets, buffer_size=self.args.num_buckets * 5000, shuffle=False, + world_size=world_size, + rank=rank, ) logging.info("About to create dev dataloader") valid_dl = DataLoader( @@ -409,7 +422,7 @@ class GigaSpeechAsrDataModule: logging.info(f"About to get train {self.args.subset} cuts") if self.args.subset == "XL": filenames = glob.glob( - f"{self.args.manifest_dir}/XL_split/gigaspeech_cuts_XL.*.jsonl.gz" + f"{self.args.manifest_dir}/gigaspeech_XL_split/gigaspeech_cuts_XL.*.jsonl.gz" ) pattern = re.compile(r"gigaspeech_cuts_XL.([0-9]+).jsonl.gz") idx_filenames = ((int(pattern.search(f).group(1)), f) for f in 
filenames) diff --git a/egs/gigaspeech/ASR/zipformer/train.py b/egs/gigaspeech/ASR/zipformer/train.py index 8cf8f9fc7..d586fc26a 100755 --- a/egs/gigaspeech/ASR/zipformer/train.py +++ b/egs/gigaspeech/ASR/zipformer/train.py @@ -1202,12 +1202,19 @@ def run(rank, world_size, args): sampler_state_dict = None train_dl = gigaspeech.train_dataloaders( - train_cuts, sampler_state_dict=sampler_state_dict + train_cuts, + sampler_state_dict=sampler_state_dict, + world_size=world_size, + rank=rank, ) valid_cuts = gigaspeech.dev_cuts() valid_cuts = valid_cuts.filter(remove_short_utt) - valid_dl = gigaspeech.valid_dataloaders(valid_cuts) + valid_dl = gigaspeech.valid_dataloaders( + valid_cuts, + world_size=world_size, + rank=rank, + ) if not params.print_diagnostics and params.scan_for_oom_batches: scan_pessimistic_batches_for_oom( diff --git a/egs/libriheavy/ASR/prepare.sh b/egs/libriheavy/ASR/prepare.sh index 366a1459f..110d7b7ba 100755 --- a/egs/libriheavy/ASR/prepare.sh +++ b/egs/libriheavy/ASR/prepare.sh @@ -245,7 +245,6 @@ if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then done fi - if [ $stage -le 10 ] && [ $stop_stage -ge 10 ]; then log "Stage 10: Train BPE model for unnormalized text" if [ ! -f data/punc_texts ]; then
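For context, a minimal sketch of how the refactored scripts fit together, based on the stage layout in this patch. Paths, stage numbers, and the --start/--stop options come from the new prepare.sh above; the split boundary of 1000 used below is only an illustrative value (the real number of XL pieces depends on num_per_split=50 and the size of the XL cut set), and the run assumes the GigaSpeech password has already been saved to $dl_dir/password.

  cd egs/gigaspeech/ASR

  # Transducer/BPE prerequisites: stages 0-8 run by default.
  ./prepare.sh

  # Stage 6 (fbank for the XL splits) can be sharded across machines with
  # the new --start (inclusive) / --stop (exclusive) options; --stop -1
  # means "process through the last split".
  ./prepare.sh --stage 6 --stop-stage 6 --start 0    --stop 1000
  ./prepare.sh --stage 6 --stop-stage 6 --start 1000 --stop -1

  # N-gram LM, lexicons, and the HLG/LG graphs now live in a separate script.
  ./prepare_lm.sh

prepare_lm.sh first sources prepare.sh with --stage -1 --stop-stage 9, so the phone-based lang dir and the downloaded ARPA LMs should already exist before the graphs are compiled. Note also that the old "combine features for XL" stage is gone: the data module now globs data/fbank/gigaspeech_XL_split/gigaspeech_cuts_XL.*.jsonl.gz directly, so no combined XL manifest is needed.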