mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-27 10:44:19 +00:00
Use a faster way to compute the intersection of the training-set utterance IDs and aishell_transcript_v0.8.txt
This commit is contained in:
parent
00b5ac5815
commit
d7f9dacf0d
@ -130,11 +130,10 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
|
||||
# Train a bigram P for MMI training
|
||||
if [ ! -f "$lang_phone_dir/transcript_words.txt" ]; then
  log "Generate data to train phone based bigram P"

  # Write the word-level transcript of the training split only: keep the
  # lines of the full transcript whose first field matches the name of a
  # training wav file, then drop that first field.
  aishell_text="$dl_dir/aishell/data_aishell/transcript/aishell_transcript_v0.8.txt"
  aishell_train_uid="$dl_dir/aishell/data_aishell/transcript/aishell_train_uid"

  # One ID per line: the basename of every training wav without ".wav".
  # The search is rooted at $dl_dir so the wav files and the transcript are
  # taken from the same download tree.
  find "$dl_dir/aishell/data_aishell/wav/train" -name "*.wav" \
    | awk -F '/' '{ sub(/\.wav$/, "", $NF); print $NF }' > "$aishell_train_uid"

  # Stop here if no IDs were collected: an empty transcript_words.txt would
  # still satisfy the -f check above and this step would never be redone.
  if [ ! -s "$aishell_train_uid" ]; then
    log "No training wav files found under $dl_dir/aishell/data_aishell/wav/train"
    exit 1
  fi

  # Single pass over both files: load the IDs from the first file into a
  # lookup table, then print matching transcript lines from the second.
  # This replaces running one awk process per ID over the whole transcript.
  awk 'NR == FNR { train[$1]; next } $1 in train' \
    "$aishell_train_uid" "$aishell_text" \
    | cut -d " " -f 2- > "$lang_phone_dir/transcript_words.txt"
fi
|
||||
|
||||
if [ ! -f $lang_phone_dir/transcript_tokens.txt ]; then
|
||||
|
Loading…
x
Reference in New Issue
Block a user