From 47e49a6663d268813357404324a5c59ff5e398b2 Mon Sep 17 00:00:00 2001
From: Mingshuang Luo <37799481+luomingshuang@users.noreply.github.com>
Date: Tue, 15 Feb 2022 12:33:51 +0800
Subject: [PATCH] change transcript_words.txt
---
egs/tedlium3/ASR/prepare.sh | 10 +++++++++-
1 file changed, 9 insertions(+), 1 deletion(-)
diff --git a/egs/tedlium3/ASR/prepare.sh b/egs/tedlium3/ASR/prepare.sh
index 55b0f8d29..053cc3941 100644
--- a/egs/tedlium3/ASR/prepare.sh
+++ b/egs/tedlium3/ASR/prepare.sh
@@ -121,7 +121,12 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
--lang-dir $lang_dir \
--manifests-dir data/manifests
- cat download/tedlium3/TEDLIUM.152k.dic | grep -v -w "<s>" | grep -v -w "</s>" | grep -v -w "<unk>" | LANG= LC_ALL= sort | sed 's:([0-9])::g' > $lang_dir/lexicon_words.txt
+ cat download/tedlium3/TEDLIUM.152k.dic |
+ grep -v -w "<s>" |
+ grep -v -w "</s>" |
+ grep -v -w "<unk>" |
+ LANG= LC_ALL= sort |
+ sed 's:([0-9])::g' > $lang_dir/lexicon_words.txt
(echo '<unk> <unk>'; ) |
cat - $lang_dir/lexicon_words.txt |
@@ -146,6 +151,9 @@ if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
log "Generate data for BPE training"
cat data/lang_phone/train.text | cut -d " " -f 2-
> $lang_dir/transcript_words.txt
+ sed -i 's/ <unk>//g' $lang_dir/transcript_words.txt
+ sed -i 's/<unk> //g' $lang_dir/transcript_words.txt
+ sed -i 's/<unk>//g' $lang_dir/transcript_words.txt
fi
./local/train_bpe_model.py \