fix tal_csasr data pre-processing (#898)

This commit is contained in:
KajiMaCN 2023-02-10 21:28:19 +08:00 committed by GitHub
parent cba6ecc1d1
commit 57604aac34
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -12,9 +12,12 @@ stop_stage=100
# directories and files. If not, they will be downloaded
# by this script automatically.
#
# - $dl_dir/tal_csasr
# - $dl_dir/TALCS_corpus
# You can find three directories: train_set, dev_set, and test_set.
# You can get it from https://ai.100tal.com/dataset
# - dev_set
# - test_set
# - train_set
#
# - $dl_dir/musan
# This directory contains the following directories downloaded from
@ -44,7 +47,9 @@ if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
log "Stage 0: Download data"
# Before you run this script, you must get the TAL_CSASR dataset
# from https://ai.100tal.com/dataset
mv $dl_dir/TALCS_corpus $dl_dir/tal_csasr
if [ ! -d $dl_dir/tal_csasr/TALCS_corpus ]; then
mv $dl_dir/TALCS_corpus $dl_dir/tal_csasr
fi
# If you have pre-downloaded it to /path/to/TALCS_corpus,
# you can create a symlink
@ -116,7 +121,7 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
fi
# Prepare text.
# Note: in Linux, you can install jq with the following command:
# Note: in Linux, you can install jq with the following command:
# 1. wget -O jq https://github.com/stedolan/jq/releases/download/jq-1.6/jq-linux64
# 2. chmod +x ./jq
# 3. cp jq /usr/bin