From 21d1bf73bb9d1904a17cad32479493ac8d85ac6d Mon Sep 17 00:00:00 2001 From: Kinan Martin Date: Fri, 9 May 2025 10:57:41 +0900 Subject: [PATCH] new version of multi_ja_en prepare.sh script which swaps Librispeech for MLS English --- egs/multi_ja_en/ASR/prepare.sh | 84 ++++++++++++++++++---------------- 1 file changed, 44 insertions(+), 40 deletions(-) diff --git a/egs/multi_ja_en/ASR/prepare.sh b/egs/multi_ja_en/ASR/prepare.sh index 7a6a63418..9d21f54c9 100755 --- a/egs/multi_ja_en/ASR/prepare.sh +++ b/egs/multi_ja_en/ASR/prepare.sh @@ -19,6 +19,8 @@ vocab_sizes=( # All files generated by this script are saved in "data". # You can safely remove "data" and rerun this script to regenerate it. mkdir -p data +mkdir -p data/lang +mkdir -p data/manifests log() { # This function is from espnet @@ -31,55 +33,54 @@ log "dl_dir: $dl_dir" log "Dataset: musan" if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then log "Stage 1: Soft link fbank of musan" - mkdir -p data/fbank if [ -e ../../librispeech/ASR/data/fbank/.musan.done ]; then - cd data/fbank - ln -svf $(realpath ../../../../librispeech/ASR/data/fbank/musan_feats) . - ln -svf $(realpath ../../../../librispeech/ASR/data/fbank/musan_cuts.jsonl.gz) . - cd ../.. + cd data/manifests + mkdir -p musan + cd musan + ln -svf $(realpath ../../../../../librispeech/ASR/data/fbank/musan_feats) . + ln -svf $(realpath ../../../../../librispeech/ASR/data/fbank/musan_cuts.jsonl.gz) . + cd ../../.. else log "Abort! Please run ../../librispeech/ASR/prepare.sh --stage 4 --stop-stage 4" exit 1 fi fi -log "Dataset: LibriSpeech" +log "Dataset: MLS English" if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then - log "Stage 1: Soft link fbank of LibriSpeech" - mkdir -p data/fbank - if [ -e ../../librispeech/ASR/data/fbank/.librispeech.done ]; then - cd data/fbank - ln -svf $(realpath ../../../../librispeech/ASR/data/fbank/librispeech_cuts*) . - ln -svf $(realpath ../../../../librispeech/ASR/data/fbank/librispeech_feats*) . - cd ../.. + log "Stage 2: Soft link manifests (including fbank) of MLS English" + if [ -e ../../mls_english/ASR/data/manifests/.mls_english-validated.done ]; then + cd data/manifests + mkdir -p mls_english + cd mls_english + ln -svf $(realpath ../../../../../mls_english/ASR/data/manifests/mls_eng_cuts*) . + ln -svf $(realpath ../../../../../mls_english/ASR/data/manifests/feats*) . + cd ../../.. else - log "Abort! Please run ../../librispeech/ASR/prepare.sh --stage 1 --stop-stage 1 and ../../librispeech/ASR/prepare.sh --stage 3 --stop-stage 3" + log "Abort! Please run ../../mls_english/ASR/prepare.sh --stage 1 --stop-stage 1" exit 1 fi fi log "Dataset: ReazonSpeech" if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then - log "Stage 2: Soft link fbank of ReazonSpeech" - mkdir -p data/fbank + log "Stage 3: Soft link fbank of ReazonSpeech" if [ -e ../../reazonspeech/ASR/data/manifests/.reazonspeech.done ]; then - cd data/fbank - ln -svf $(realpath ../../../../reazonspeech/ASR/data/manifests/reazonspeech_cuts*) . - cd .. - mkdir -p manifests - cd manifests - ln -svf $(realpath ../../../../reazonspeech/ASR/data/manifests/feats_*) . - cd ../.. + cd data/manifests + mkdir -p reazonspeech + cd reazonspeech + ln -svf $(realpath ../../../../../reazonspeech/ASR/data/manifests/reazonspeech_cuts*) . + ln -svf $(realpath ../../../../../reazonspeech/ASR/data/manifests/feats*) . + cd ../../.. else log "Abort! Please run ../../reazonspeech/ASR/prepare.sh --stage 0 --stop-stage 2" exit 1 fi fi -# New Stage 3: Prepare char based lang for ReazonSpeech -if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then +if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then lang_char_dir=data/lang_char - log "Stage 3: Prepare char based lang for ReazonSpeech" + log "Stage 4: Prepare char-based lang for ReazonSpeech" mkdir -p $lang_char_dir # Prepare text @@ -89,7 +90,7 @@ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then | ./local/text2token.py -t "char" > $lang_char_dir/text fi - # jp word segmentation for text + # Japanese word segmentation if [ ! -f $lang_char_dir/text_words_segmentation ]; then python3 ./local/text2segments.py \ --input-file $lang_char_dir/text \ @@ -106,29 +107,31 @@ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then fi if [ ! -f $lang_char_dir/L_disambig.pt ]; then - python3 ./local/prepare_char.py --lang-dir data/lang_char + python3 ./local/prepare_char.py --lang-dir $lang_char_dir fi fi -if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then - log "Stage 4: Prepare Byte BPE based lang" - mkdir -p data/fbank + + +if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then + log "Stage 5: Prepare Byte BPE based lang" + + # Check if required char-based lang data exists if [ ! -d ../../reazonspeech/ASR/data/lang_char ] && [ ! -d ./data/lang_char ]; then log "Abort! Please run ../../reazonspeech/ASR/prepare.sh --stage 3 --stop-stage 3" exit 1 fi - if [ ! -d ../../librispeech/ASR/data/lang_bpe_500 ] && [ ! -d ./data/lang_bpe_500 ]; then - log "Abort! Please run ../../librispeech/ASR/prepare.sh --stage 5 --stop-stage 5" + # Check if BPE data from MLS English exists + if [ ! -d ../../mls_english/ASR/data/lang ] ; then + log "Abort! Please run ../../mls_english/ASR/prepare.sh --stage 3 --stop-stage 3" exit 1 fi cd data/ - # if [ ! -d ./lang_char ]; then - # ln -svf $(realpath ../../../reazonspeech/ASR/data/lang_char) . - # fi - if [ ! -d ./lang_bpe_500 ]; then - ln -svf $(realpath ../../../librispeech/ASR/data/lang_bpe_500) . + # Symlink the full lang directory from MLS English (which includes bpe_2000/ and transcript.txt) + if [ ! -d ./lang_bpe_2000 ]; then + ln -svf $(realpath ../../../mls_english/ASR/data/lang) lang_bpe_2000 fi cd ../ @@ -136,7 +139,8 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then lang_dir=data/lang_bbpe_${vocab_size} mkdir -p $lang_dir - cat data/lang_char/text data/lang_bpe_500/transcript_words.txt \ + # Combine Japanese char-level text and English BPE transcript + cat data/lang_char/text data/lang_bpe_2000/transcript.txt \ > $lang_dir/text if [ ! -f $lang_dir/transcript_chars.txt ]; then @@ -150,7 +154,7 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then --input-file ./data/lang_char/text \ --output-file $lang_dir/text_words_segmentation - cat ./data/lang_bpe_500/transcript_words.txt \ + cat ./data/lang_bpe_2000/transcript.txt \ >> $lang_dir/text_words_segmentation fi