From 3dbb15bda297576ef26fab57b7b65d8c7067f96a Mon Sep 17 00:00:00 2001 From: wgb14 Date: Sat, 13 Nov 2021 23:27:45 -0500 Subject: [PATCH] support BPE based lang --- .../convert_transcript_words_to_tokens.py | 1 + egs/gigaspeech/ASR/local/prepare_lang.py | 1 + egs/gigaspeech/ASR/local/prepare_lang_bpe.py | 1 + egs/gigaspeech/ASR/local/train_bpe_model.py | 1 + egs/gigaspeech/ASR/prepare.sh | 92 ++++++++++++++----- 5 files changed, 72 insertions(+), 24 deletions(-) create mode 120000 egs/gigaspeech/ASR/local/convert_transcript_words_to_tokens.py create mode 120000 egs/gigaspeech/ASR/local/prepare_lang.py create mode 120000 egs/gigaspeech/ASR/local/prepare_lang_bpe.py create mode 120000 egs/gigaspeech/ASR/local/train_bpe_model.py diff --git a/egs/gigaspeech/ASR/local/convert_transcript_words_to_tokens.py b/egs/gigaspeech/ASR/local/convert_transcript_words_to_tokens.py new file mode 120000 index 000000000..2ce13fd69 --- /dev/null +++ b/egs/gigaspeech/ASR/local/convert_transcript_words_to_tokens.py @@ -0,0 +1 @@ +../../../librispeech/ASR/local/convert_transcript_words_to_tokens.py \ No newline at end of file diff --git a/egs/gigaspeech/ASR/local/prepare_lang.py b/egs/gigaspeech/ASR/local/prepare_lang.py new file mode 120000 index 000000000..747f2ab39 --- /dev/null +++ b/egs/gigaspeech/ASR/local/prepare_lang.py @@ -0,0 +1 @@ +../../../librispeech/ASR/local/prepare_lang.py \ No newline at end of file diff --git a/egs/gigaspeech/ASR/local/prepare_lang_bpe.py b/egs/gigaspeech/ASR/local/prepare_lang_bpe.py new file mode 120000 index 000000000..36b40e7fc --- /dev/null +++ b/egs/gigaspeech/ASR/local/prepare_lang_bpe.py @@ -0,0 +1 @@ +../../../librispeech/ASR/local/prepare_lang_bpe.py \ No newline at end of file diff --git a/egs/gigaspeech/ASR/local/train_bpe_model.py b/egs/gigaspeech/ASR/local/train_bpe_model.py new file mode 120000 index 000000000..6fad36421 --- /dev/null +++ b/egs/gigaspeech/ASR/local/train_bpe_model.py @@ -0,0 +1 @@ 
+../../../librispeech/ASR/local/train_bpe_model.py \ No newline at end of file diff --git a/egs/gigaspeech/ASR/prepare.sh b/egs/gigaspeech/ASR/prepare.sh index 46f99b6b2..47ed2dc5f 100755 --- a/egs/gigaspeech/ASR/prepare.sh +++ b/egs/gigaspeech/ASR/prepare.sh @@ -31,9 +31,9 @@ dl_dir=$PWD/download # data/lang_bpe_yyy if the array contains xxx, yyy vocab_sizes=( 5000 - 2000 - 1000 - 500 + # 2000 + # 1000 + # 500 ) # All files generated by this script are saved in "data". @@ -125,15 +125,61 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then lang_dir=data/lang_phone mkdir -p $lang_dir - (echo '!SIL SIL'; echo '<SPOKEN_NOISE> SPN'; echo '<UNK> SPN'; ) | - cat - $dl_dir/lm/librispeech-lexicon.txt | - sort | uniq > $lang_dir/lexicon.txt + # (echo '!SIL SIL'; echo '<SPOKEN_NOISE> SPN'; echo '<UNK> SPN'; ) | + # cat - $dl_dir/lm/librispeech-lexicon.txt | + # sort | uniq > $lang_dir/lexicon.txt - if [ ! -f $lang_dir/L_disambig.pt ]; then - ./local/prepare_lang.py --lang-dir $lang_dir + # if [ ! -f $lang_dir/L_disambig.pt ]; then + # ./local/prepare_lang.py --lang-dir $lang_dir + # fi + if [ ! 
-f $lang_dir/transcript_words.txt ]; then + gunzip -c "data/manifests/gigaspeech_supervisions_XL.jsonl.gz" \ + | jq '.text' \ + | sed 's/"//g' \ + > $lang_dir/transcript_words.txt + + # Delete utterances with garbage meta tags + garbage_utterance_tags="<SIL> <MUSIC> <NOISE> <OTHER>" + for tag in $garbage_utterance_tags; do + sed -i "/${tag}/d" $lang_dir/transcript_words.txt + done + + # Delete punctuations in utterances + punctuation_tags="<COMMA> <EXCLAMATIONPOINT> <PERIOD> <QUESTIONMARK>" + for tag in $punctuation_tags; do + sed -i "s/${tag}//g" $lang_dir/transcript_words.txt + done + + # Ensure space only appears once + sed -i 's/\t/ /g' $lang_dir/transcript_words.txt + sed -i 's/[ ][ ]*/ /g' $lang_dir/transcript_words.txt fi -fi + cat $lang_dir/transcript_words.txt | sed 's| |\n|g' \ + | sort -u | sed '/^$/d' > $lang_dir/words.txt + (echo '!SIL'; echo '<SPOKEN_NOISE>'; echo '<UNK>'; ) | + cat - $lang_dir/words.txt | sort | uniq | awk ' + BEGIN { + print "<eps> 0"; + } + { + if ($1 == "<s>") { + print "<s> is in the vocabulary!" | "cat 1>&2" + exit 1; + } + if ($1 == "</s>") { + print "</s> is in the vocabulary!" | "cat 1>&2" + exit 1; + } + printf("%s %d\n", $1, NR); + } + END { + printf("#0 %d\n", NR+1); + printf("<s> %d\n", NR+2); + printf("</s> %d\n", NR+3); + }' > $lang_dir/words || exit 1; + mv $lang_dir/words $lang_dir/words.txt +fi if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then log "Stage 6: Prepare BPE based lang" @@ -141,26 +187,24 @@ if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then for vocab_size in ${vocab_sizes[@]}; do lang_dir=data/lang_bpe_${vocab_size} mkdir -p $lang_dir - # We reuse words.txt from phone based lexicon - # so that the two can share G.pt later. - cp data/lang_phone/words.txt $lang_dir + # # We reuse words.txt from phone based lexicon + # # so that the two can share G.pt later. + cp data/lang_phone/{words.txt,transcript_words.txt} $lang_dir if [ ! 
-f $lang_dir/transcript_words.txt ]; then log "Generate data for BPE training" - files=$( - find "$dl_dir/LibriSpeech/train-clean-100" -name "*.trans.txt" - find "$dl_dir/LibriSpeech/train-clean-360" -name "*.trans.txt" - find "$dl_dir/LibriSpeech/train-other-500" -name "*.trans.txt" - ) - for f in ${files[@]}; do - cat $f | cut -d " " -f 2- - done > $lang_dir/transcript_words.txt + gunzip -c "data/manifests/gigaspeech_supervisions_XL.jsonl.gz" \ + | jq '.text' \ + | sed 's/"//g' \ + > $lang_dir/transcript_words.txt fi - ./local/train_bpe_model.py \ - --lang-dir $lang_dir \ - --vocab-size $vocab_size \ - --transcript $lang_dir/transcript_words.txt + if [ ! -f $lang_dir/bpe.model ]; then + ./local/train_bpe_model.py \ + --lang-dir $lang_dir \ + --vocab-size $vocab_size \ + --transcript $lang_dir/transcript_words.txt + fi if [ ! -f $lang_dir/L_disambig.pt ]; then ./local/prepare_lang_bpe.py --lang-dir $lang_dir