From a9df06cef4f991faba95864c0d58ce54bc5df546 Mon Sep 17 00:00:00 2001 From: jinzr Date: Tue, 12 Mar 2024 12:34:27 +0800 Subject: [PATCH] Update prepare.sh --- egs/commonvoice/ASR/prepare.sh | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/egs/commonvoice/ASR/prepare.sh b/egs/commonvoice/ASR/prepare.sh index f01ae5b12..50cd203c7 100755 --- a/egs/commonvoice/ASR/prepare.sh +++ b/egs/commonvoice/ASR/prepare.sh @@ -181,14 +181,14 @@ if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then if [ ! -f $lang_dir/transcript_words.txt ]; then log "Generate data for lang preparation" - file=$( - find "data/${lang}/fbank/cv-${lang}_cuts_train.jsonl.gz" - ) - gunzip -c ${file} | awk -F '"' '{print $30}' > $lang_dir/text - # Ensure space only appears once - sed -i 's/\t/ /g' $lang_dir/text - sed -i 's/[ ][ ]*/ /g' $lang_dir/text + # Prepare text. + # Note: in Linux, you can install jq with the following command: + # 1. wget -O jq https://github.com/stedolan/jq/releases/download/jq-1.6/jq-linux64 + # 2. chmod +x ./jq + # 3. cp jq /usr/bin + gunzip -c data/${lang}/manifests/cv-${lang}_supervisions_train.jsonl.gz \ + | jq '.text' | sed 's/"//g' > $lang_dir/text if [ $lang == "yue" ]; then # Get words.txt and words_no_ids.txt @@ -218,7 +218,13 @@ if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then file=$( find "data/${lang}/fbank/cv-${lang}_cuts_train.jsonl.gz" ) - gunzip -c ${file} | awk -F '"' '{print $30}' > $lang_dir/transcript_words.txt + # Prepare text. + # Note: in Linux, you can install jq with the following command: + # 1. wget -O jq https://github.com/stedolan/jq/releases/download/jq-1.6/jq-linux64 + # 2. chmod +x ./jq + # 3. cp jq /usr/bin + gunzip -c ${file} \ + | jq '.text' | sed 's/"//g' > $lang_dir/transcript_words.txt # Ensure space only appears once sed -i 's/\t/ /g' $lang_dir/transcript_words.txt