diff --git a/egs/peoples_speech/ASR/local/compute_fbank_peoples_speech_splits.py b/egs/peoples_speech/ASR/local/compute_fbank_peoples_speech_splits.py index 7e8f166a6..e386fb450 100755 --- a/egs/peoples_speech/ASR/local/compute_fbank_peoples_speech_splits.py +++ b/egs/peoples_speech/ASR/local/compute_fbank_peoples_speech_splits.py @@ -87,7 +87,7 @@ def compute_fbank_commonvoice_splits(args): output_dir = Path(output_dir) assert output_dir.exists(), f"{output_dir} does not exist!" - num_digits = len(str(num_splits)) + num_digits = 8 start = args.start stop = args.stop diff --git a/egs/peoples_speech/ASR/prepare.sh b/egs/peoples_speech/ASR/prepare.sh index 2511d2801..e62fcd001 100755 --- a/egs/peoples_speech/ASR/prepare.sh +++ b/egs/peoples_speech/ASR/prepare.sh @@ -173,7 +173,10 @@ if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then if [ ! -f $lang_dir/transcript_words.txt ]; then log "Generate data for BPE training" file=$( - find "data/fbank/peoples_speech_cuts_train.jsonl.gz" + find "data/fbank/peoples_speech_cuts_dirty_raw.jsonl.gz" + find "data/fbank/peoples_speech_cuts_dirty_sa_raw.jsonl.gz" + find "data/fbank/peoples_speech_cuts_clean_raw.jsonl.gz" + find "data/fbank/peoples_speech_cuts_clean_sa_raw.jsonl.gz" ) gunzip -c ${file} | awk -F '"' '{print $30}' > $lang_dir/transcript_words.txt