fix tal_csasr data pre-processing (#898)

This commit is contained in:
KajiMaCN 2023-02-10 21:28:19 +08:00 committed by GitHub
parent cba6ecc1d1
commit 57604aac34
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -12,9 +12,12 @@ stop_stage=100
# directories and files. If not, they will be downloaded # directories and files. If not, they will be downloaded
# by this script automatically. # by this script automatically.
# #
# - $dl_dir/tal_csasr # - $dl_dir/TALCS_corpus
# You can find three directories: train_set, dev_set, and test_set. # You can find three directories: train_set, dev_set, and test_set.
# You can get it from https://ai.100tal.com/dataset # You can get it from https://ai.100tal.com/dataset
# - dev_set
# - test_set
# - train_set
# #
# - $dl_dir/musan # - $dl_dir/musan
# This directory contains the following directories downloaded from # This directory contains the following directories downloaded from
@ -44,7 +47,9 @@ if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
log "Stage 0: Download data" log "Stage 0: Download data"
# Before you run this script, you must get the TAL_CSASR dataset # Before you run this script, you must get the TAL_CSASR dataset
# from https://ai.100tal.com/dataset # from https://ai.100tal.com/dataset
if [ ! -d $dl_dir/tal_csasr/TALCS_corpus ]; then
mv $dl_dir/TALCS_corpus $dl_dir/tal_csasr mv $dl_dir/TALCS_corpus $dl_dir/tal_csasr
fi
# If you have pre-downloaded it to /path/to/TALCS_corpus, # If you have pre-downloaded it to /path/to/TALCS_corpus,
# you can create a symlink # you can create a symlink