Use a faster way to compute the intersection of the training-set utterance IDs and aishell_transcript_v0.8.txt

This commit is contained in:
PingFeng Luo 2021-12-03 14:33:10 +08:00
parent 00b5ac5815
commit d7f9dacf0d

View File

@@ -130,11 +130,10 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
   # Train a bigram P for MMI training
   if [ ! -f $lang_phone_dir/transcript_words.txt ]; then
     log "Generate data to train phone based bigram P"
-    train_uids=$(find data/aishell/data_aishell/wav/train -name "*.wav" | sed 's/\.wav//g' | awk -F '/' '{print $NF}')
     aishell_text=$dl_dir/aishell/data_aishell/transcript/aishell_transcript_v0.8.txt
-    for uid in ${train_uids}; do
-      awk -v uid=$uid 'index($1, uid)' $aishell_text | cut -d " " -f 2-
-    done > $lang_phone_dir/transcript_words.txt
+    aishell_train_uid=$dl_dir/aishell/data_aishell/transcript/aishell_train_uid
+    find data/aishell/data_aishell/wav/train -name "*.wav" | sed 's/\.wav//g' | awk -F '/' '{print $NF}' > $aishell_train_uid
+    awk 'NR==FNR{uid[$1]=$1} NR!=FNR{if($1 in uid) print $0}' $aishell_train_uid $aishell_text | cut -d " " -f 2- > $lang_phone_dir/transcript_words.txt
   fi
   if [ ! -f $lang_phone_dir/transcript_tokens.txt ]; then