#!/usr/bin/env bash
#
# Data-preparation pipeline for a bilingual (Japanese ReazonSpeech +
# MLS English) ASR recipe. Each stage soft-links precomputed features
# from sibling recipes or builds the char/byte-BPE lang directories.
# All outputs are written under ./data; remove it to regenerate.
#
# Usage: ./prepare.sh [--stage N] [--stop-stage M]

# fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python

set -euo pipefail

stage=-1
stop_stage=100

dl_dir=$PWD/download

. shared/parse_options.sh || exit 1

vocab_sizes=(
  2000
)

# All files generated by this script are saved in "data".
# You can safely remove "data" and rerun this script to regenerate it.
mkdir -p data
mkdir -p data/lang
mkdir -p data/manifests

# Log a message prefixed with timestamp and caller location (file:line:function).
log() {
  # This function is from espnet
  local fname=${BASH_SOURCE[1]##*/}
  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}

log "dl_dir: $dl_dir"

log "Dataset: musan"
if [ "$stage" -le 1 ] && [ "$stop_stage" -ge 1 ]; then
  log "Stage 1: Soft link fbank of musan"
  if [ -e ../../librispeech/ASR/data/fbank/.musan.done ]; then
    cd data/manifests
    mkdir -p musan
    cd musan
    ln -svf "$(realpath ../../../../../librispeech/ASR/data/fbank/musan_feats)" .
    ln -svf "$(realpath ../../../../../librispeech/ASR/data/fbank/musan_cuts.jsonl.gz)" .
    cd ../../..
  else
    log "Abort! Please run ../../librispeech/ASR/prepare.sh --stage 4 --stop-stage 4"
    exit 1
  fi
fi

log "Dataset: MLS English"
if [ "$stage" -le 2 ] && [ "$stop_stage" -ge 2 ]; then
  log "Stage 2: Soft link manifests (including fbank) of MLS English"
  if [ -e ../../mls_english/ASR/data/manifests/.mls_english-validated.done ]; then
    cd data/manifests
    mkdir -p mls_english
    cd mls_english
    # Intentionally unquoted: the globs expand to multiple paths,
    # and we want one symlink per matched file.
    # shellcheck disable=SC2046
    ln -svf $(realpath ../../../../../mls_english/ASR/data/manifests/mls_eng_cuts*) .
    # shellcheck disable=SC2046
    ln -svf $(realpath ../../../../../mls_english/ASR/data/manifests/feats*) .
    cd ../../..
  else
    log "Abort! Please run ../../mls_english/ASR/prepare.sh --stage 1 --stop-stage 1"
    exit 1
  fi
fi

log "Dataset: ReazonSpeech"
if [ "$stage" -le 3 ] && [ "$stop_stage" -ge 3 ]; then
  log "Stage 3: Soft link fbank of ReazonSpeech"
  if [ -e ../../reazonspeech/ASR/data/manifests/.reazonspeech.done ]; then
    cd data/manifests
    mkdir -p reazonspeech
    cd reazonspeech
    # Intentionally unquoted: the globs expand to multiple paths.
    # shellcheck disable=SC2046
    ln -svf $(realpath ../../../../../reazonspeech/ASR/data/manifests/reazonspeech_cuts*) .
    # shellcheck disable=SC2046
    ln -svf $(realpath ../../../../../reazonspeech/ASR/data/manifests/feats*) .
    cd ../../..
  else
    log "Abort! Please run ../../reazonspeech/ASR/prepare.sh --stage 0 --stop-stage 2"
    exit 1
  fi
fi

if [ "$stage" -le 4 ] && [ "$stop_stage" -ge 4 ]; then
  lang_char_dir=data/lang_char
  log "Stage 4: Prepare char-based lang for ReazonSpeech"
  mkdir -p $lang_char_dir

  # Prepare text: extract supervision texts and tokenize to characters.
  if [ ! -f $lang_char_dir/text ]; then
    gunzip -c ../../reazonspeech/ASR/data/manifests/reazonspeech_supervisions_train.jsonl.gz \
      | jq '.text' | sed 's/"//g' \
      | ./local/text2token.py -t "char" > $lang_char_dir/text
  fi

  # Japanese word segmentation
  if [ ! -f $lang_char_dir/text_words_segmentation ]; then
    python3 ./local/text2segments.py \
      --input-file $lang_char_dir/text \
      --output-file $lang_char_dir/text_words_segmentation
  fi

  # One word per line, deduplicated, empty lines dropped.
  # (sort -u already deduplicates; a trailing "uniq" is redundant.)
  sed 's/ /\n/g' $lang_char_dir/text_words_segmentation \
    | sort -u | sed '/^$/d' > $lang_char_dir/words_no_ids.txt

  if [ ! -f $lang_char_dir/words.txt ]; then
    python3 ./local/prepare_words.py \
      --input-file $lang_char_dir/words_no_ids.txt \
      --output-file $lang_char_dir/words.txt
  fi

  if [ ! -f $lang_char_dir/L_disambig.pt ]; then
    python3 ./local/prepare_char.py --lang-dir $lang_char_dir
  fi
fi

if [ "$stage" -le 5 ] && [ "$stop_stage" -ge 5 ]; then
  log "Stage 5: Prepare Byte BPE based lang in data/lang"
  lang_dir=data/lang

  # Check if required char-based lang data exists
  if [ ! -d ../../reazonspeech/ASR/data/lang_char ] && [ ! -d ./data/lang_char ]; then
    log "Abort! Please run ../../reazonspeech/ASR/prepare.sh --stage 3 --stop-stage 3"
    exit 1
  fi

  # Check if BPE data from MLS English exists
  if [ ! -d ../../mls_english/ASR/data/lang/bpe_2000 ] || [ ! -f ../../mls_english/ASR/data/lang/transcript.txt ]; then
    log "Abort! Please ensure ../../mls_english/ASR/data/lang/bpe_2000 and ../../mls_english/ASR/data/lang/transcript.txt exist."
    log "Please run ../../mls_english/ASR/prepare.sh --stage 3 --stop-stage 3 if you haven't already."
    exit 1
  fi

  # Create the target lang directory if it doesn't exist
  mkdir -p $lang_dir

  # Combine Japanese char-level text and English BPE transcript
  cat data/lang_char/text ../../mls_english/ASR/data/lang/transcript.txt \
    > $lang_dir/text

  for vocab_size in "${vocab_sizes[@]}"; do
    bbpe_dir=$lang_dir/bbpe_${vocab_size}
    mkdir -p $bbpe_dir

    if [ ! -f $bbpe_dir/transcript_chars.txt ]; then
      ./local/prepare_for_bpe_model.py \
        --lang-dir $bbpe_dir \
        --text $lang_dir/text
    fi

    if [ ! -f $bbpe_dir/text_words_segmentation ]; then
      python3 ./local/text2segments.py \
        --input-file ./data/lang_char/text \
        --output-file $bbpe_dir/text_words_segmentation
      cat ../../mls_english/ASR/data/lang/transcript.txt \
        >> $bbpe_dir/text_words_segmentation
    fi

    if [ ! -f $bbpe_dir/words_no_ids.txt ]; then
      # One word per line, deduplicated, empty lines dropped.
      sed 's/ /\n/g' $bbpe_dir/text_words_segmentation \
        | sort -u | sed '/^$/d' > $bbpe_dir/words_no_ids.txt
    fi

    if [ ! -f $bbpe_dir/words.txt ]; then
      python3 ./local/prepare_words.py \
        --input-file $bbpe_dir/words_no_ids.txt \
        --output-file $bbpe_dir/words.txt
    fi

    if [ ! -f $bbpe_dir/bbpe.model ]; then
      ./local/train_bbpe_model.py \
        --lang-dir $lang_dir \
        --vocab-size $vocab_size \
        --transcript $lang_dir/text \
        --output-model $bbpe_dir/bbpe.model \
        --input-sentence-size 2000000 # Example: limit to 2 million sentences
    fi

    if [ ! -f $bbpe_dir/L_disambig.pt ]; then
      ./local/prepare_lang_bbpe.py --lang-dir $bbpe_dir --vocab-size $vocab_size

      log "Validating $bbpe_dir/lexicon.txt"
      ln -svf "$(realpath ../../multi_zh_en/ASR/local/validate_bpe_lexicon.py)" local/
      ./local/validate_bpe_lexicon.py \
        --lexicon $bbpe_dir/lexicon.txt \
        --bpe-model $bbpe_dir/bbpe.model
    fi

    # Remove top-level files (if they were created)
    rm -f $lang_dir/lexicon.txt $lang_dir/L_disambig.pt
  done

  # Optional symlink so downstream tooling expecting bpe_2000 still works.
  if [ -d $lang_dir/bbpe_2000 ] && [ ! -e $lang_dir/bpe_2000 ]; then
    ln -s bbpe_2000 $lang_dir/bpe_2000
  fi
fi

if [ "$stage" -le 6 ] && [ "$stop_stage" -ge 6 ]; then
  log "Stage 6: Update cutset paths"
  python local/utils/update_cutset_paths.py
fi

log "prepare.sh: PREPARATION DONE"