From 3acbc2d44be8cf14f4d68b9e50e6457fb4cecbc5 Mon Sep 17 00:00:00 2001 From: Yifan Yang Date: Wed, 31 May 2023 11:23:17 +0800 Subject: [PATCH] peoples_speech: use 8-digit split indices; build BPE transcript from the raw dirty/dirty_sa/clean/clean_sa cuts --- .../ASR/local/compute_fbank_peoples_speech_splits.py | 2 +- egs/peoples_speech/ASR/prepare.sh | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/egs/peoples_speech/ASR/local/compute_fbank_peoples_speech_splits.py b/egs/peoples_speech/ASR/local/compute_fbank_peoples_speech_splits.py index 7e8f166a6..e386fb450 100755 --- a/egs/peoples_speech/ASR/local/compute_fbank_peoples_speech_splits.py +++ b/egs/peoples_speech/ASR/local/compute_fbank_peoples_speech_splits.py @@ -87,7 +87,7 @@ def compute_fbank_commonvoice_splits(args): output_dir = Path(output_dir) assert output_dir.exists(), f"{output_dir} does not exist!" - num_digits = len(str(num_splits)) + num_digits = 8 start = args.start stop = args.stop diff --git a/egs/peoples_speech/ASR/prepare.sh b/egs/peoples_speech/ASR/prepare.sh index 2511d2801..e62fcd001 100755 --- a/egs/peoples_speech/ASR/prepare.sh +++ b/egs/peoples_speech/ASR/prepare.sh @@ -173,7 +173,10 @@ if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then if [ ! -f $lang_dir/transcript_words.txt ]; then log "Generate data for BPE training" file=$( - find "data/fbank/peoples_speech_cuts_train.jsonl.gz" + find "data/fbank/peoples_speech_cuts_dirty_raw.jsonl.gz" + find "data/fbank/peoples_speech_cuts_dirty_sa_raw.jsonl.gz" + find "data/fbank/peoples_speech_cuts_clean_raw.jsonl.gz" + find "data/fbank/peoples_speech_cuts_clean_sa_raw.jsonl.gz" ) gunzip -c ${file} | awk -F '"' '{print $30}' > $lang_dir/transcript_words.txt