From d7f9dacf0d10c6fa603bde293ef98009b85b6ab2 Mon Sep 17 00:00:00 2001
From: PingFeng Luo
Date: Fri, 3 Dec 2021 14:33:10 +0800
Subject: [PATCH] use a faster way to get the intersection of train and aishell_transcript_v0.8.txt

---
 egs/aishell/ASR/prepare.sh | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/egs/aishell/ASR/prepare.sh b/egs/aishell/ASR/prepare.sh
index 3e388be59..1e78d79d9 100755
--- a/egs/aishell/ASR/prepare.sh
+++ b/egs/aishell/ASR/prepare.sh
@@ -130,11 +130,10 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
   # Train a bigram P for MMI training
   if [ ! -f $lang_phone_dir/transcript_words.txt ]; then
     log "Generate data to train phone based bigram P"
-    train_uids=$(find data/aishell/data_aishell/wav/train -name "*.wav" | sed 's/\.wav//g' | awk -F '/' '{print $NF}')
     aishell_text=$dl_dir/aishell/data_aishell/transcript/aishell_transcript_v0.8.txt
-    for uid in ${train_uids}; do
-      awk -v uid=$uid 'index($1, uid)' $aishell_text | cut -d " " -f 2-
-    done > $lang_phone_dir/transcript_words.txt
+    aishell_train_uid=$dl_dir/aishell/data_aishell/transcript/aishell_train_uid
+    find data/aishell/data_aishell/wav/train -name "*.wav" | sed 's/\.wav//g' | awk -F '/' '{print $NF}' > $aishell_train_uid
+    awk 'NR==FNR{uid[$1]=$1} NR!=FNR{if($1 in uid) print $0}' $aishell_train_uid $aishell_text | cut -d " " -f 2- > $lang_phone_dir/transcript_words.txt
   fi
 
   if [ ! -f $lang_phone_dir/transcript_tokens.txt ]; then