Use a faster way to get the intersection of the train-set utterance IDs and aishell_transcript_v0.8.txt

2025-12-11 06:55:27 +00:00 · 2021-12-03 14:33:10 +08:00 · 2021-12-03 14:33:10 +08:00 · d7f9dacf0d
commit d7f9dacf0d
parent 00b5ac5815
1 changed file with 3 additions and 4 deletions
--- a/egs/aishell/ASR/prepare.sh
+++ b/egs/aishell/ASR/prepare.sh
@@ -130,11 +130,10 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
  # Train a bigram P for MMI training
  if [ ! -f $lang_phone_dir/transcript_words.txt ]; then
    log "Generate data to train phone based bigram P"
    train_uids=$(find data/aishell/data_aishell/wav/train -name "*.wav" | sed 's/\.wav//g' | awk -F '/' '{print $NF}')
    aishell_text=$dl_dir/aishell/data_aishell/transcript/aishell_transcript_v0.8.txt
-    for uid in ${train_uids}; do
+    aishell_train_uid=$dl_dir/aishell/data_aishell/transcript/aishell_train_uid
-	awk -v uid=$uid 'index($1, uid)' $aishell_text | cut -d " " -f 2-
+    find data/aishell/data_aishell/wav/train -name "*.wav" | sed 's/\.wav//g' | awk -F '/' '{print $NF}' > $aishell_train_uid
-    done > $lang_phone_dir/transcript_words.txt
+    awk 'NR==FNR{uid[$1]=$1} NR!=FNR{if($1 in uid) print $0}' $aishell_train_uid $aishell_text | cut -d " " -f 2- > $lang_phone_dir/transcript_words.txt
  fi
  if [ ! -f $lang_phone_dir/transcript_tokens.txt ]; then