From c43d4ced9eedb22ad34c8ee1de9404168a6fb98a Mon Sep 17 00:00:00 2001
From: Yifan Yang <yifanyeung@qq.com>
Date: Wed, 14 Jun 2023 18:18:14 +0800
Subject: [PATCH] Add prepare.sh

---
 egs/multi_en/ASR/prepare.sh | 170 ++++++++++++++++++++++++++++++++++++
 egs/multi_en/ASR/shared     |   1 +
 2 files changed, 171 insertions(+)
 create mode 100755 egs/multi_en/ASR/prepare.sh
 create mode 120000 egs/multi_en/ASR/shared

diff --git a/egs/multi_en/ASR/prepare.sh b/egs/multi_en/ASR/prepare.sh
new file mode 100755
index 000000000..65969a913
--- /dev/null
+++ b/egs/multi_en/ASR/prepare.sh
@@ -0,0 +1,170 @@
+#!/usr/bin/env bash
+
+# Work around the segmentation fault reported in
+# https://github.com/k2-fsa/icefall/issues/674
+export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION="python"
+
+set -euo pipefail
+
+# Defaults; NOTE(review): presumably overridable via shared/parse_options.sh.
+nj=16
+stage=-1
+stop_stage=100
+dl_dir="${PWD}/download"
+source shared/parse_options.sh || exit 1
+
+# Vocabulary sizes of the sentencepiece (BPE) models.
+# For every size xxx listed here, stage 4 soft-links data/lang_bpe_xxx
+# from the LibriSpeech recipe.
+vocab_sizes=(
+  # 5000
+  # 2000
+  # 1000
+  500
+)
+
+# Optional datasets handled in stage 6 (LibriSpeech and musan are always
+# required). Bash array elements are separated by whitespace only; a trailing
+# comma would become part of the element itself, so none is used here.
+multidataset=(
+  "gigaspeech"
+  "commonvoice"
+  "peoples_speech"
+)
+
+# All files generated by this script are saved in "data".
+# You can safely remove "data" and rerun this script to regenerate it.
+mkdir -p data
+
+log() {
+  # Timestamped logger adapted from espnet; prints "<time> (<file>:<line>:<function>) <message>".
+  local src=${BASH_SOURCE[1]##*/} line=${BASH_LINENO[0]} caller=${FUNCNAME[1]}
+  printf '%b\n' "$(date '+%Y-%m-%d %H:%M:%S') (${src}:${line}:${caller}) $*"
+}
+
+log "dl_dir: $dl_dir"
+
+# Stage 1: soft-link the musan fbank data of the LibriSpeech recipe into data/fbank.
+log "Dataset: musan"
+if [ "$stage" -le 1 ] && [ "$stop_stage" -ge 1 ]; then
+  log "Stage 1: Soft link fbank of musan"
+  mkdir -p data/fbank
+  if [ -e ../../librispeech/ASR/data/fbank/.musan.done ]; then
+    # "$(realpath ...)" is quoted so a checkout path with spaces is not word-split.
+    ln -svf "$(realpath ../../librispeech/ASR/data/fbank/musan_feats)" data/fbank/
+    ln -svf "$(realpath ../../librispeech/ASR/data/fbank/musan_cuts.jsonl.gz)" data/fbank/
+  else
+    log "Abort! Please run ../../librispeech/ASR/prepare.sh --stage 4 --stop-stage 4"
+    exit 1
+  fi
+fi
+
+# Stage 2: soft-link librispeech_cuts_train-all-shuf.jsonl.gz from the LibriSpeech recipe.
+log "Dataset: LibriSpeech"
+if [ "$stage" -le 2 ] && [ "$stop_stage" -ge 2 ]; then
+  log "Stage 2: Soft link fbank of librispeech"
+  mkdir -p data/fbank
+  if [ -e ../../librispeech/ASR/data/fbank/.librispeech.done ]; then
+    # "$(realpath ...)" is quoted so a checkout path with spaces is not word-split.
+    ln -svf "$(realpath ../../librispeech/ASR/data/fbank/librispeech_cuts_train-all-shuf.jsonl.gz)" data/fbank/
+  else
+    log "Abort! Please run ../../librispeech/ASR/prepare.sh --stage 3 --stop-stage 3"
+    exit 1
+  fi
+fi
+
+# Stage 3: soft-link the phone-based lang directory of the LibriSpeech recipe into data/.
+if [ "$stage" -le 3 ] && [ "$stop_stage" -ge 3 ]; then
+  log "Stage 3: Soft link phone based lang"
+  if [ -e ../../librispeech/ASR/data/lang_phone/L_disambig.pt ]; then
+    # "$(realpath ...)" is quoted so a checkout path with spaces is not word-split.
+    ln -svf "$(realpath ../../librispeech/ASR/data/lang_phone)" data/
+  else
+    log "Abort! Please run ../../librispeech/ASR/prepare.sh --stage 5 --stop-stage 5"
+    exit 1
+  fi
+fi
+
+# Stage 4: soft-link one LibriSpeech BPE lang directory per entry of vocab_sizes into data/.
+if [ "$stage" -le 4 ] && [ "$stop_stage" -ge 4 ]; then
+  log "Stage 4: Soft link BPE based lang"
+  for vocab_size in "${vocab_sizes[@]}"; do
+    lang_dir=../../librispeech/ASR/data/lang_bpe_${vocab_size}
+    if [ -e "$lang_dir/L_disambig.pt" ]; then
+      ln -svf "$(realpath "$lang_dir")" data/
+    else
+      log "Abort! Please run ../../librispeech/ASR/prepare.sh --stage 6 --stop-stage 6"
+      exit 1
+    fi
+  done
+fi
+
+# Stage 5: soft-link G_3_gram.fst.txt and G_4_gram.fst.txt from the LibriSpeech
+# recipe into data/lm.
+if [ "$stage" -le 5 ] && [ "$stop_stage" -ge 5 ]; then
+  log "Stage 5: Soft link G"
+
+  mkdir -p data/lm
+  lm_src=../../librispeech/ASR/data/lm
+
+  # Both files are handled identically, so loop instead of duplicating the
+  # check; "$(realpath ...)" is quoted so paths with spaces are not word-split.
+  for g_fst in G_3_gram.fst.txt G_4_gram.fst.txt; do
+    if [ -f "$lm_src/$g_fst" ]; then
+      ln -svf "$(realpath "$lm_src/$g_fst")" data/lm/
+    else
+      # Abort on the first missing file and tell the user what to run.
+      log "Abort! Please run ../../librispeech/ASR/prepare.sh --stage 8 --stop-stage 8"
+      exit 1
+    fi
+  done
+fi
+
+# Stage 6: soft-link the fbank data of every optional dataset named in
+# multidataset. A data/fbank/.<name>.done marker makes reruns skip a dataset.
+if [ "$stage" -le 6 ] && [ "$stop_stage" -ge 6 ]; then
+  log "Stage 6: Prepare the other datasets"
+  # Stages 1 and 2 create data/fbank, but this stage may be run on its own.
+  mkdir -p data/fbank
+
+  # GigaSpeech
+  # "${multidataset[*]}" joins the array into one string for a substring test.
+  if [[ "${multidataset[*]}" == *gigaspeech* ]] && [ ! -f data/fbank/.gigaspeech.done ]; then
+    log "Dataset: GigaSpeech"
+    if [ -f ../../gigaspeech/ASR/data/fbank/XL_split/.split_completed ]; then
+      ln -svf "$(realpath ../../gigaspeech/ASR/data/fbank/XL_split)" data/fbank/
+    else
+      log "Abort! Please run ../../gigaspeech/ASR/prepare.sh --stage 5 --stop-stage 6"
+      exit 1
+    fi
+
+    touch data/fbank/.gigaspeech.done
+  fi
+
+  # CommonVoice
+  if [[ "${multidataset[*]}" == *commonvoice* ]] && [ ! -f data/fbank/.commonvoice.done ]; then
+    log "Dataset: CommonVoice"
+    if [ -f ../../commonvoice/ASR/data/en/fbank/.cv-en_train.done ]; then
+      ln -svf "$(realpath ../../commonvoice/ASR/data/en/fbank/cv-en_train_split_1000)" data/fbank/
+      ln -svf "$(realpath ../../commonvoice/ASR/data/en/fbank/cv-en_cuts_train.jsonl.gz)" data/fbank/
+    else
+      log "Abort! Please run ../../commonvoice/ASR/prepare.sh --stage 5 --stop-stage 6"
+      exit 1
+    fi
+
+    touch data/fbank/.commonvoice.done
+  fi
+
+  # People's Speech
+  if [[ "${multidataset[*]}" == *peoples_speech* ]] && [ ! -f data/fbank/.peoples_speech.done ]; then
+    log "Dataset: People's Speech"
+    if [ -f ../../peoples_speech/ASR/data/fbank/.peoples_speech_train.done ]; then
+      ln -svf "$(realpath ../../peoples_speech/ASR/data/fbank/peoples_speech_train_split)" data/fbank/
+    else
+      log "Abort! Please run ../../peoples_speech/ASR/prepare.sh --stage 5 --stop-stage 6"
+      exit 1
+    fi
+
+    touch data/fbank/.peoples_speech.done
+  fi
+fi
diff --git a/egs/multi_en/ASR/shared b/egs/multi_en/ASR/shared
new file mode 120000
index 000000000..4c5e91438
--- /dev/null
+++ b/egs/multi_en/ASR/shared
@@ -0,0 +1 @@
+../../../icefall/shared/
\ No newline at end of file