From ef2b95cb299eb294de8d14dde79692232f99d183 Mon Sep 17 00:00:00 2001 From: marcoyang Date: Wed, 28 Feb 2024 12:10:37 +0800 Subject: [PATCH] data preparation for MLS --- egs/mls/ASR/local/compute_fbank_mls_splits.py | 8 +- egs/mls/ASR/prepare.sh | 207 +++++++++--------- 2 files changed, 103 insertions(+), 112 deletions(-) diff --git a/egs/mls/ASR/local/compute_fbank_mls_splits.py b/egs/mls/ASR/local/compute_fbank_mls_splits.py index ed9e7492c..2160bfb89 100755 --- a/egs/mls/ASR/local/compute_fbank_mls_splits.py +++ b/egs/mls/ASR/local/compute_fbank_mls_splits.py @@ -61,7 +61,7 @@ def get_parser(): "--num-splits", type=int, required=True, - help="The number of splits of the XL subset", + help="The number of splits of the English subset", ) parser.add_argument( @@ -111,12 +111,12 @@ def compute_fbank_mls_splits(args): idx = f"{i}".zfill(num_digits) logging.info(f"Processing {idx}/{num_splits}") - cuts_path = output_dir / f"cuts_{args.subset}.{idx}.jsonl.gz" + cuts_path = output_dir / f"mls-{args.language}_train.{idx}.jsonl.gz" if cuts_path.is_file(): logging.info(f"{cuts_path} exists - skipping") continue - raw_cuts_path = output_dir / f"cuts_{args.subset}_raw.{idx}.jsonl.gz" + raw_cuts_path = output_dir / f"mls-{args.language}_train_raw.{idx}.jsonl.gz" logging.info(f"Loading {raw_cuts_path}") cut_set = CutSet.from_file(raw_cuts_path) @@ -125,7 +125,7 @@ def compute_fbank_mls_splits(args): cut_set = cut_set.compute_and_store_features_batch( extractor=extractor, - storage_path=f"{output_dir}/feats_{args.subset}_{idx}", + storage_path=f"{output_dir}/feats_{args.language}_{idx}", num_workers=args.num_workers, batch_duration=args.batch_duration, overwrite=True, diff --git a/egs/mls/ASR/prepare.sh b/egs/mls/ASR/prepare.sh index 48f1e2a70..5987aca69 100755 --- a/egs/mls/ASR/prepare.sh +++ b/egs/mls/ASR/prepare.sh @@ -37,23 +37,9 @@ stop_stage=5 # - music # - noise # - speech -# -# lm directory is not necessary for transducer training with bpe units, but it -# is needed by phone based modeling, you can download it by running -# bash prepare.sh --stage -1 --stop-stage -1 -# then you can see the following files in the directory. -# - $dl_dir/lm -# This directory contains the following files downloaded from -# http://www.openslr.org/resources/11 -# -# - 3-gram.pruned.1e-7.arpa.gz -# - 3-gram.pruned.1e-7.arpa -# - 4-gram.arpa.gz -# - 4-gram.arpa -# - librispeech-vocab.txt -# - librispeech-lexicon.txt -# - librispeech-lm-norm.txt.gz +num_per_split=4000 +fbank_dir=data/fbank_mls dl_dir=$PWD/download . shared/parse_options.sh || exit 1 @@ -63,9 +49,9 @@ dl_dir=$PWD/download # data/lang_bpe_yyy if the array contains xxx, yyy vocab_sizes=( # 5000 - # 2000 - # 1000 - 500 + 2000 + 1000 + # 500 ) # All files generated by this script are saved in "data". @@ -81,17 +67,29 @@ log() { log "Running prepare.sh" log "dl_dir: $dl_dir" +log "fbank_dir: $fbank_dir" + +languages=( + english + german + dutch + spanish + italian + french + polish + portuguese +) if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then log "Stage 0: Download data" - # If you have pre-downloaded it to /path/to/LibriSpeech, + # If you have pre-downloaded it to /path/to/MLS, # you can create a symlink # - # ln -sfv /path/to/LibriSpeech $dl_dir/LibriSpeech + # ln -sfv /path/to/MLS $dl_dir/MLS # - if [ ! -d $dl_dir/LibriSpeech/train-other-500 ]; then - lhotse download librispeech --full $dl_dir + if [ ! -d $dl_dir/MLS/train-other-500 ]; then + lhotse download mls --full $dl_dir fi # If you have pre-downloaded it to /path/to/musan, @@ -105,13 +103,13 @@ if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then fi if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then - log "Stage 1: Prepare LibriSpeech manifest" - # We assume that you have downloaded the LibriSpeech corpus - # to $dl_dir/LibriSpeech + log "Stage 1: Prepare MLS manifest" + # We assume that you have downloaded the MLS corpus + # to $dl_dir/MLS mkdir -p data/manifests - if [ ! -e data/manifests/.librispeech.done ]; then - lhotse prepare librispeech -j $nj $dl_dir/LibriSpeech data/manifests - touch data/manifests/.librispeech.done + if [ ! -e data/manifests/.mls.done ]; then + lhotse prepare mls -j $nj $dl_dir/MLS data/manifests + touch data/manifests/.mls.done fi fi @@ -127,50 +125,73 @@ if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then fi if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then - log "Stage 3: Compute fbank for librispeech" - mkdir -p data/fbank - if [ ! -e data/fbank/.librispeech.done ]; then - ./local/compute_fbank_librispeech.py - touch data/fbank/.librispeech.done - fi - - if [ ! -f data/fbank/librispeech_cuts_train-all-shuf.jsonl.gz ]; then - cat <(gunzip -c data/fbank/librispeech_cuts_train-clean-100.jsonl.gz) \ - <(gunzip -c data/fbank/librispeech_cuts_train-clean-360.jsonl.gz) \ - <(gunzip -c data/fbank/librispeech_cuts_train-other-500.jsonl.gz) | \ - shuf | gzip -c > data/fbank/librispeech_cuts_train-all-shuf.jsonl.gz - fi - - if [ ! -e data/fbank/.librispeech-validated.done ]; then - log "Validating data/fbank for LibriSpeech" - parts=( - train-clean-100 - train-clean-360 - train-other-500 - test-clean - test-other - dev-clean - dev-other - ) - for part in ${parts[@]}; do - python3 ./local/validate_manifest.py \ - data/fbank/librispeech_cuts_${part}.jsonl.gz - done - touch data/fbank/.librispeech-validated.done + log "Stage 3: Split english subset into pieces (may take 30 minutes)" + split_dir=${fbank_dir}/english_split + if [ ! -f $split_dir/.split_completed ]; then + lhotse split-lazy ${fbank_dir}/mls-english_train_raw.jsonl.gz $split_dir $num_per_split + touch $split_dir/.split_completed fi fi if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then - log "Stage 4: Compute fbank for musan" - mkdir -p data/fbank - if [ ! -e data/fbank/.musan.done ]; then - ./local/compute_fbank_musan.py - touch data/fbank/.musan.done + log "Stage 4: Compute fbank for MLS (except English)" + mkdir -p ${fbank_dir} + if [ ! -e ${fbank_dir}/.mls.done ]; then + ./local/compute_fbank_mls.py + touch ${fbank_dir}/.mls.done fi fi if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then - log "Stage 5: Prepare BPE based lang" +log "Stage 5: Compute fbank for English split of MLS" + if [ ! -e ${fbank_dir}/.mls-english.done ]; then + num_splits=$(find ${fbank_dir}/english_split -name "mls-english_train_raw.*.jsonl.gz" | wc -l) + ./local/compute_fbank_mls_splits.py \ + --fbank-dir $fbank_dir \ + --num-workers 20 \ + --language english \ + --num-splits $num_splits \ + + touch ${fbank_dir}/.mls-english.done + fi + + if [ ! -e ${fbank_dir}/mls-english_train.jsonl.gz ]; then + pieces=$(find ${fbank_dir}/english_split -name "mls-english_train.*.jsonl.gz") + lhotse combine $pieces ${fbank_dir}/mls-english_train.jsonl.gz + fi + +fi + +if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then + log "Stage 6: Validate the manifest of MLS" + if [ ! -e ${fbank_dir}/.mls-validated.done ]; then + log "Validating the fbank features for MLS" + parts=( + train + dev + test + ) + for lan in ${languages[@]}; do + for part in ${parts[@]}; do + python3 ./local/validate_manifest.py \ + ${fbank_dir}/mls-${lan}_${part}.jsonl.gz + done + done + touch ${fbank_dir}/.mls-validated.done + fi +fi + +if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then + log "Stage 7: Compute fbank for musan" + mkdir -p ${fbank_dir} + if [ ! -e ${fbank_dir}/.musan.done ]; then + ./local/compute_fbank_musan.py + touch ${fbank_dir}/.musan.done + fi +fi + +if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then + log "Stage 8: Prepare BPE based lang" for vocab_size in ${vocab_sizes[@]}; do lang_dir=data/lang_bpe_${vocab_size} @@ -178,13 +199,18 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then if [ ! -f $lang_dir/transcript_words.txt ]; then log "Generate data for BPE training" - files=$( - find "$dl_dir/LibriSpeech/train-clean-100" -name "*.trans.txt" - find "$dl_dir/LibriSpeech/train-clean-360" -name "*.trans.txt" - find "$dl_dir/LibriSpeech/train-other-500" -name "*.trans.txt" + files=( + "$dl_dir/MLS/mls_english/train/transcripts.txt" + "$dl_dir/MLS/mls_german/train/transcripts.txt" + "$dl_dir/MLS/mls_dutch/train/transcripts.txt" + "$dl_dir/MLS/mls_french/train/transcripts.txt" + "$dl_dir/MLS/mls_spanish/train/transcripts.txt" + "$dl_dir/MLS/mls_italian/train/transcripts.txt" + "$dl_dir/MLS/mls_portuguese/train/transcripts.txt" + "$dl_dir/MLS/mls_polish/train/transcripts.txt" ) for f in ${files[@]}; do - cat $f | cut -d " " -f 2- + head -n 1000000 $f | cut -d " " -f 2- done > $lang_dir/transcript_words.txt fi @@ -192,45 +218,10 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then ./local/train_bpe_model.py \ --lang-dir $lang_dir \ --vocab-size $vocab_size \ - --transcript $lang_dir/transcript_words.txt + --character-coverage 0.999 \ + --transcript $lang_dir/transcript_words.txt \ + --byte-fallback fi done fi -if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then - log "Stage 6: Prepare phone based lang" - lang_dir=data/lang_phone - mkdir -p $lang_dir - - if [ ! -f $dl_dir/lm/librispeech-lexicon.txt ]; then - log "No lexicon file in $dl_dir/lm, please run :" - log "prepare.sh --stage -1 --stop-stage -1" - exit -1 - fi - - if [ ! -f $lang_dir/lexicon.txt ]; then - (echo '!SIL SIL'; echo ' SPN'; echo ' SPN'; ) | - cat - $dl_dir/lm/librispeech-lexicon.txt | - sort | uniq > $lang_dir/lexicon.txt - fi - - if [ ! -f $lang_dir/L_disambig.pt ]; then - ./local/prepare_lang.py --lang-dir $lang_dir - fi - - if [ ! -f $lang_dir/L.fst ]; then - log "Converting L.pt to L.fst" - ./shared/convert-k2-to-openfst.py \ - --olabels aux_labels \ - $lang_dir/L.pt \ - $lang_dir/L.fst - fi - - if [ ! -f $lang_dir/L_disambig.fst ]; then - log "Converting L_disambig.pt to L_disambig.fst" - ./shared/convert-k2-to-openfst.py \ - --olabels aux_labels \ - $lang_dir/L_disambig.pt \ - $lang_dir/L_disambig.fst - fi -fi