mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-12-09 14:05:33 +00:00
change transcript_words.txt
This commit is contained in:
parent
597d9b60d4
commit
47e49a6663
@ -121,7 +121,12 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
|
||||
--lang-dir $lang_dir \
|
||||
--manifests-dir data/manifests
|
||||
|
||||
cat download/tedlium3/TEDLIUM.152k.dic | grep -v -w "<s>" | grep -v -w "</s>" | grep -v -w "<unk>" | LANG= LC_ALL= sort | sed 's:([0-9])::g' > $lang_dir/lexicon_words.txt
|
||||
cat download/tedlium3/TEDLIUM.152k.dic |
|
||||
grep -v -w "<s>" |
|
||||
grep -v -w "</s>" |
|
||||
grep -v -w "<unk>" |
|
||||
LANG= LC_ALL= sort |
|
||||
sed 's:([0-9])::g' > $lang_dir/lexicon_words.txt
|
||||
|
||||
(echo '<UNK> <UNK>'; ) |
|
||||
cat - $lang_dir/lexicon_words.txt |
|
||||
@ -146,6 +151,9 @@ if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
|
||||
log "Generate data for BPE training"
|
||||
cat data/lang_phone/train.text | cut -d " " -f 2-
|
||||
> $lang_dir/transcript_words.txt
|
||||
sed -i 's/ <unk>//g' $lang_dir/transcript_words.txt
|
||||
sed -i 's/<unk> //g' $lang_dir/transcript_words.txt
|
||||
sed -i 's/<unk>//g' $lang_dir/transcript_words.txt
|
||||
fi
|
||||
|
||||
./local/train_bpe_model.py \
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user