This commit is contained in:
Yifan Yang 2023-05-31 11:23:17 +08:00
parent 282491429a
commit 3acbc2d44b
2 changed files with 5 additions and 2 deletions

View File

@@ -87,7 +87,7 @@ def compute_fbank_commonvoice_splits(args):
output_dir = Path(output_dir)
assert output_dir.exists(), f"{output_dir} does not exist!"
-num_digits = len(str(num_splits))
+num_digits = 8
start = args.start
stop = args.stop

View File

@@ -173,7 +173,10 @@ if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then
if [ ! -f $lang_dir/transcript_words.txt ]; then
log "Generate data for BPE training"
file=$(
-find "data/fbank/peoples_speech_cuts_train.jsonl.gz"
+find "data/fbank/peoples_speech_cuts_dirty_raw.jsonl.gz"
+find "data/fbank/peoples_speech_cuts_dirty_sa_raw.jsonl.gz"
+find "data/fbank/peoples_speech_cuts_clean_raw.jsonl.gz"
+find "data/fbank/peoples_speech_cuts_clean_sa_raw.jsonl.gz"
)
gunzip -c ${file} | awk -F '"' '{print $30}' > $lang_dir/transcript_words.txt