mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-27 10:44:19 +00:00
Use a faster way to get the intersection of the training-set utterance IDs and aishell_transcript_v0.8.txt
This commit is contained in:
parent
00b5ac5815
commit
d7f9dacf0d
@ -130,11 +130,10 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
|
|||||||
# Train a bigram P for MMI training
|
# Train a bigram P for MMI training
|
||||||
if [ ! -f $lang_phone_dir/transcript_words.txt ]; then
|
if [ ! -f $lang_phone_dir/transcript_words.txt ]; then
|
||||||
log "Generate data to train phone based bigram P"
|
log "Generate data to train phone based bigram P"
|
||||||
train_uids=$(find data/aishell/data_aishell/wav/train -name "*.wav" | sed 's/\.wav//g' | awk -F '/' '{print $NF}')
|
|
||||||
aishell_text=$dl_dir/aishell/data_aishell/transcript/aishell_transcript_v0.8.txt
|
aishell_text=$dl_dir/aishell/data_aishell/transcript/aishell_transcript_v0.8.txt
|
||||||
for uid in ${train_uids}; do
|
aishell_train_uid=$dl_dir/aishell/data_aishell/transcript/aishell_train_uid
|
||||||
awk -v uid=$uid 'index($1, uid)' $aishell_text | cut -d " " -f 2-
|
find data/aishell/data_aishell/wav/train -name "*.wav" | sed 's/\.wav//g' | awk -F '/' '{print $NF}' > $aishell_train_uid
|
||||||
done > $lang_phone_dir/transcript_words.txt
|
awk 'NR==FNR{uid[$1]=$1} NR!=FNR{if($1 in uid) print $0}' $aishell_train_uid $aishell_text | cut -d " " -f 2- > $lang_phone_dir/transcript_words.txt
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ ! -f $lang_phone_dir/transcript_tokens.txt ]; then
|
if [ ! -f $lang_phone_dir/transcript_tokens.txt ]; then
|
||||||
|
Loading…
x
Reference in New Issue
Block a user