#!/usr/bin/env bash
#
# Prepare the data directory of the multi_en ASR recipe.
#
# Nothing is computed here: every stage only symlinks artifacts that the
# sibling recipes (librispeech, gigaspeech, commonvoice, peoples_speech) have
# already produced, and aborts with a hint when an artifact is missing.
#
# Provenance: added as egs/multi_en/ASR/prepare.sh (mode 100755) by commit
# c43d4ced9eedb22ad34c8ee1de9404168a6fb98a, "Add prepare.sh"
# (Yifan Yang, Wed, 14 Jun 2023 18:18:14 +0800). The same commit added the
# symlink egs/multi_en/ASR/shared -> ../../../icefall/shared/, which this
# script needs in order to source shared/parse_options.sh.
#
# Run it from the recipe directory: all paths below are relative to the
# current working directory.

# fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python

set -euo pipefail

# Defaults. They stay above the parse_options.sh line as in the original;
# presumably that script overrides them from the command line (TODO confirm).
# nj is not read anywhere in this script; it is kept so the option set is
# unchanged.
# shellcheck disable=SC2034
nj=16
stage=-1
stop_stage=100

dl_dir=$PWD/download

. shared/parse_options.sh || exit 1

# vocab size for sentence piece models.
# It will generate data/lang_bpe_xxx,
# data/lang_bpe_yyy if the array contains xxx, yyy
vocab_sizes=(
  # 5000
  # 2000
  # 1000
  500
)

# multidataset list.
# LibriSpeech and musan are required.
# The others are optional.
# Bash array elements are separated by whitespace only; a trailing comma
# would become part of the element itself.
multidataset=(
  "gigaspeech"
  "commonvoice"
  "peoples_speech"
)

# Data directories of the sibling recipes, relative to this recipe directory.
# Defined after option parsing so they are plain constants.
readonly librispeech_data=../../librispeech/ASR/data
readonly gigaspeech_data=../../gigaspeech/ASR/data
readonly commonvoice_data=../../commonvoice/ASR/data
readonly peoples_speech_data=../../peoples_speech/ASR/data

# All files generated by this script are saved in "data".
# You can safely remove "data" and rerun this script to regenerate it.
mkdir -p data

#######################################
# Print a timestamped message tagged with the caller's file, line and function.
# Arguments: $* - message text
# Outputs:   one line on stdout
#######################################
log() {
  # This function is from espnet
  local fname=${BASH_SOURCE[1]##*/}
  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}

#######################################
# Tell whether stage number $1 lies inside [stage, stop_stage].
# Globals:   stage, stop_stage (read)
# Arguments: $1 - stage number
# Returns:   0 if the stage should run, non-zero otherwise
#######################################
stage_enabled() {
  [ "$stage" -le "$1" ] && [ "$stop_stage" -ge "$1" ]
}

#######################################
# Tell whether a dataset is listed in multidataset.
# Elements are compared one by one for equality. Joining the array inside
# [[ ... =~ ... ]] would instead do a substring match on the joined string.
# Globals:   multidataset (read)
# Arguments: $1 - dataset name
# Returns:   0 if listed, 1 otherwise
#######################################
has_dataset() {
  local wanted=$1
  local name
  for name in "${multidataset[@]}"; do
    if [[ "$name" == "$wanted" ]]; then
      return 0
    fi
  done
  return 1
}

#######################################
# Symlink one or more paths into a directory, creating the directory first.
# Each source is resolved with realpath so the link target is absolute.
# Arguments: $1 - destination directory
#            $2.. - source paths, relative to the current directory
# Outputs:   ln -v output on stdout
# Returns:   non-zero (aborting under set -e) if realpath or ln fails
#######################################
link_into() {
  local dest=$1
  shift
  local src target
  mkdir -p "$dest"
  for src in "$@"; do
    # Assigned on its own line so that a realpath failure is not masked.
    target=$(realpath "$src")
    ln -svf "$target" "$dest"
  done
}

log "dl_dir: $dl_dir"

log "Dataset: musan"
if stage_enabled 1; then
  log "Stage 1: Soft link fbank of musan"
  if [ -e "$librispeech_data/fbank/.musan.done" ]; then
    link_into data/fbank \
      "$librispeech_data/fbank/musan_feats" \
      "$librispeech_data/fbank/musan_cuts.jsonl.gz"
  else
    log "Abort! Please run ../../librispeech/ASR/prepare.sh --stage 4 --stop-stage 4"
    exit 1
  fi
fi

log "Dataset: LibriSpeech"
if stage_enabled 2; then
  log "Stage 2: Soft link fbank of librispeech"
  if [ -e "$librispeech_data/fbank/.librispeech.done" ]; then
    link_into data/fbank \
      "$librispeech_data/fbank/librispeech_cuts_train-all-shuf.jsonl.gz"
  else
    log "Abort! Please run ../../librispeech/ASR/prepare.sh --stage 3 --stop-stage 3"
    exit 1
  fi
fi

if stage_enabled 3; then
  log "Stage 3: Soft link phone based lang"
  if [ -e "$librispeech_data/lang_phone/L_disambig.pt" ]; then
    link_into data "$librispeech_data/lang_phone"
  else
    log "Abort! Please run ../../librispeech/ASR/prepare.sh --stage 5 --stop-stage 5"
    exit 1
  fi
fi

if stage_enabled 4; then
  log "Stage 4: Soft link BPE based lang"
  for vocab_size in "${vocab_sizes[@]}"; do
    lang_dir=$librispeech_data/lang_bpe_${vocab_size}
    if [ -e "$lang_dir/L_disambig.pt" ]; then
      link_into data "$lang_dir"
    else
      log "Abort! Please run ../../librispeech/ASR/prepare.sh --stage 6 --stop-stage 6"
      exit 1
    fi
  done
fi

if stage_enabled 5; then
  log "Stage 5: Soft link G"
  mkdir -p data/lm
  # Same order as before: the 3-gram is checked and linked before the 4-gram.
  for g_file in G_3_gram.fst.txt G_4_gram.fst.txt; do
    if [ -f "$librispeech_data/lm/$g_file" ]; then
      link_into data/lm "$librispeech_data/lm/$g_file"
    else
      log "Abort! Please run ../../librispeech/ASR/prepare.sh --stage 8 --stop-stage 8"
      exit 1
    fi
  done
fi

if stage_enabled 6; then
  log "Stage 6: Prepare the other datasets"

  # GigaSpeech
  if has_dataset gigaspeech && [ ! -f data/fbank/.gigaspeech.done ]; then
    log "Dataset: GigaSpeech"
    if [ -f "$gigaspeech_data/fbank/XL_split/.split_completed" ]; then
      link_into data/fbank "$gigaspeech_data/fbank/XL_split"
    else
      log "Abort! Please run ../../gigaspeech/ASR/prepare.sh --stage 5 --stop-stage 6"
      exit 1
    fi
    touch data/fbank/.gigaspeech.done
  fi

  # CommonVoice
  if has_dataset commonvoice && [ ! -f data/fbank/.commonvoice.done ]; then
    log "Dataset: CommonVoice"
    if [ -f "$commonvoice_data/en/fbank/.cv-en_train.done" ]; then
      link_into data/fbank \
        "$commonvoice_data/en/fbank/cv-en_train_split_1000" \
        "$commonvoice_data/en/fbank/cv-en_cuts_train.jsonl.gz"
    else
      log "Abort! Please run ../../commonvoice/ASR/prepare.sh --stage 5 --stop-stage 6"
      exit 1
    fi
    touch data/fbank/.commonvoice.done
  fi

  # People's Speech
  if has_dataset peoples_speech && [ ! -f data/fbank/.peoples_speech.done ]; then
    log "Dataset: People's Speech"
    if [ -f "$peoples_speech_data/fbank/.peoples_speech_train.done" ]; then
      link_into data/fbank "$peoples_speech_data/fbank/peoples_speech_train_split"
    else
      log "Abort! Please run ../../peoples_speech/ASR/prepare.sh --stage 5 --stop-stage 6"
      exit 1
    fi
    touch data/fbank/.peoples_speech.done
  fi
fi