From 2bced72c1ba5d119b7ad8945573c76bb77723ae2 Mon Sep 17 00:00:00 2001
From: dohe0342
Date: Fri, 9 Jun 2023 14:49:46 +0900
Subject: [PATCH] from local

---
 egs/tedlium3/ASR/.prepare.sh.swp | Bin 16384 -> 16384 bytes
 egs/tedlium3/ASR/2               | 231 +++++++++++++++++++++++++++++++
 2 files changed, 231 insertions(+)
 create mode 100644 egs/tedlium3/ASR/2

diff --git a/egs/tedlium3/ASR/.prepare.sh.swp b/egs/tedlium3/ASR/.prepare.sh.swp
index d2be350273ffe69948045ed72a1dcf58de98b8c7..bddf02a0d88b82cf85cba0ac1e75ece3cc36ecfb 100644
GIT binary patch
delta 40
ucmZo@U~Fh$6iYG)^Ym4))H7fJ0s#hw)V)n9ssbCuZZb25ZvMb5U;_Zn

diff --git a/egs/tedlium3/ASR/2 b/egs/tedlium3/ASR/2
new file mode 100644
--- /dev/null
+++ b/egs/tedlium3/ASR/2
@@ -0,0 +1,231 @@
+if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
+  log "Stage 3: Compute fbank for tedlium3"
+  if [ ! -e data/fbank/.tedlium3.done ]; then
+    mkdir -p data/fbank
+    python3 ./local/compute_fbank_tedlium.py
+    gunzip -c data/fbank/tedlium_cuts_train.jsonl.gz | shuf | \
+      gzip -c > data/fbank/tedlium_cuts_train-shuf.jsonl.gz
+    mv data/fbank/tedlium_cuts_train-shuf.jsonl.gz \
+      data/fbank/tedlium_cuts_train.jsonl.gz
+
+    touch data/fbank/.tedlium3.done
+  fi
+fi
+
+if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
+  log "Stage 4: Compute fbank for musan"
+  if [ ! -e data/fbank/.musan.done ]; then
+    mkdir -p data/fbank
+    python3 ./local/compute_fbank_musan.py
+    touch data/fbank/.musan.done
+  fi
+fi
+
+if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
+  log "Stage 5: Prepare BPE train data and set of words"
+  lang_dir=data/lang
+  mkdir -p $lang_dir
+
+  if [ ! -f $lang_dir/train.txt ]; then
+    gunzip -c $dl_dir/tedlium3/LM/*.en.gz | sed 's: <\/s>::g' > $lang_dir/train_orig.txt
+
+    ./local/prepare_transcripts.py \
+      --input-text-path $lang_dir/train_orig.txt \
+      --output-text-path $lang_dir/train.txt
+  fi
+
+  if [ ! -f $lang_dir/words.txt ]; then
+
+    awk '{print $1}' $dl_dir/tedlium3/TEDLIUM.152k.dic |
+      sed 's:([0-9])::g' | sort | uniq > $lang_dir/words_orig.txt
+
+    ./local/prepare_words.py --lang-dir $lang_dir
+  fi
+fi
+
+if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
+  log "Stage 6: Prepare BPE based lang"
+
+  for vocab_size in ${vocab_sizes[@]}; do
+    lang_dir=data/lang_bpe_${vocab_size}
+    mkdir -p $lang_dir
+    # We reuse words.txt from the phone based lexicon
+    # so that the two can share G.pt later.
+    cp data/lang/words.txt $lang_dir
+
+    ./local/train_bpe_model.py \
+      --lang-dir $lang_dir \
+      --vocab-size $vocab_size \
+      --transcript data/lang/train.txt
+
+    if [ ! -f $lang_dir/L_disambig.pt ]; then
+      ./local/prepare_lang_bpe.py --lang-dir $lang_dir --oov "<unk>"
+    fi
+  done
+fi
+
+if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
+  log "Stage 7: Prepare G"
+  # We assume you have installed kaldilm. If not, please install it with:
+  #   pip install kaldilm
+
+  mkdir -p data/lm
+  if [ ! -f data/lm/G_4_gram_small.fst.txt ]; then
+    # It is used in building HLG
+    python3 -m kaldilm \
+      --read-symbol-table="data/lang/words.txt" \
+      --disambig-symbol='#0' \
+      --max-order=4 \
+      --max-arpa-warnings=-1 \
+      $dl_dir/lm/4gram_small.arpa > data/lm/G_4_gram_small.fst.txt
+  fi
+
+  if [ ! -f data/lm/G_4_gram_big.fst.txt ]; then
+    # It is used for LM rescoring
+    python3 -m kaldilm \
+      --read-symbol-table="data/lang/words.txt" \
+      --disambig-symbol='#0' \
+      --max-order=4 \
+      --max-arpa-warnings=-1 \
+      $dl_dir/lm/4gram_big.arpa > data/lm/G_4_gram_big.fst.txt
+  fi
+fi
+
+if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then
+  log "Stage 8: Compile HLG"
+
+  for vocab_size in ${vocab_sizes[@]}; do
+    lang_dir=data/lang_bpe_${vocab_size}
+
+    if [ ! -f $lang_dir/HLG.pt ]; then
+      ./local/compile_hlg.py \
+        --lang-dir $lang_dir \
+        --lm G_4_gram_small
+    fi
+  done
+fi
+
+if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then
+  log "Stage 9: Split cuts by speaker id"
+  gzip -d data/fbank/tedlium_cuts_test.jsonl.gz
+
+  i=0
+  for spk in $dl_dir/tedlium3/legacy/test/sph/*; do
+    spk_id=${spk#*sph\/}
+    spk_id=${spk_id%.sph}
+    echo $spk_id
+    cat data/fbank/tedlium_cuts_test.jsonl | grep speaker\":\ \"$spk_id\" > data/fbank/tedlium_cuts_test_$i.jsonl
+    gzip data/fbank/tedlium_cuts_test_$i.jsonl
+    i=`expr $i + 1`
+  done
+
+  gzip data/fbank/tedlium_cuts_test.jsonl
+  #cat data/fbank/tedlium_cuts_test.jsonl.gz | grep
+
+fi
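
Note on Stage 9, not part of the patch itself: the per-speaker split greps the
raw JSONL text for the quoted string 'speaker": "<id>"', which works only as
long as no other field on the line happens to contain the same substring. A
minimal sketch of a more targeted split, assuming jq is installed and that
each cut stores its speaker in supervisions[0].speaker (the field the grep
pattern matches inside); the file layout mirrors the stage above:

    # Split the test cuts into one gzipped JSONL file per speaker (sketch).
    gunzip -c data/fbank/tedlium_cuts_test.jsonl.gz > /tmp/tedlium_cuts_test.jsonl
    i=0
    for spk in $dl_dir/tedlium3/legacy/test/sph/*.sph; do
      spk_id=$(basename "$spk" .sph)  # strip directory and .sph suffix
      # Keep only cuts whose first supervision belongs to this speaker.
      jq -c --arg spk "$spk_id" \
        'select(.supervisions[0].speaker == $spk)' \
        /tmp/tedlium_cuts_test.jsonl > data/fbank/tedlium_cuts_test_$i.jsonl
      gzip data/fbank/tedlium_cuts_test_$i.jsonl
      i=$((i + 1))
    done

The structured query also skips cuts with no supervisions instead of matching
them by accident, and quoting "$spk_id" keeps speaker ids with unusual
characters from being word-split by the shell.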