Update prepare.sh

This commit is contained in:
jinzr 2024-03-12 12:34:27 +08:00
parent 9820bf92f6
commit a9df06cef4

View File

@@ -181,14 +181,14 @@ if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then
if [ ! -f $lang_dir/transcript_words.txt ]; then if [ ! -f $lang_dir/transcript_words.txt ]; then
log "Generate data for lang preparation" log "Generate data for lang preparation"
file=$(
find "data/${lang}/fbank/cv-${lang}_cuts_train.jsonl.gz"
)
gunzip -c ${file} | awk -F '"' '{print $30}' > $lang_dir/text
# Ensure space only appears once # Prepare text.
sed -i 's/\t/ /g' $lang_dir/text # Note: in Linux, you can install jq with the following command:
sed -i 's/[ ][ ]*/ /g' $lang_dir/text # 1. wget -O jq https://github.com/stedolan/jq/releases/download/jq-1.6/jq-linux64
# 2. chmod +x ./jq
# 3. cp jq /usr/bin
gunzip -c data/${lang}/manifests/cv-${lang}_supervisions_train.jsonl.gz \
| jq '.text' | sed 's/"//g' > $lang_char_dir/text
if [ $lang == "yue" ]; then if [ $lang == "yue" ]; then
# Get words.txt and words_no_ids.txt # Get words.txt and words_no_ids.txt
@@ -218,7 +218,13 @@ if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then
file=$( file=$(
find "data/${lang}/fbank/cv-${lang}_cuts_train.jsonl.gz" find "data/${lang}/fbank/cv-${lang}_cuts_train.jsonl.gz"
) )
gunzip -c ${file} | awk -F '"' '{print $30}' > $lang_dir/transcript_words.txt # Prepare text.
# Note: in Linux, you can install jq with the following command:
# 1. wget -O jq https://github.com/stedolan/jq/releases/download/jq-1.6/jq-linux64
# 2. chmod +x ./jq
# 3. cp jq /usr/bin
gunzip -c ${file} \
| jq '.text' | sed 's/"//g' > $lang_dir/transcript_words.txt
# Ensure space only appears once # Ensure space only appears once
sed -i 's/\t/ /g' $lang_dir/transcript_words.txt sed -i 's/\t/ /g' $lang_dir/transcript_words.txt