From 94810ced457a6e5ae4b45e092eff62827d994a21 Mon Sep 17 00:00:00 2001
From: luomingshuang <739314837@qq.com>
Date: Wed, 2 Mar 2022 15:12:10 +0800
Subject: [PATCH] add md files and prepare.sh
---
egs/tedlium3/ASR/README.md | 18 +++
egs/tedlium3/ASR/RESULTS.md | 68 ++++++++++
egs/tedlium3/ASR/prepare.sh | 243 ++++++++++++++++++++++++++++++++++++
3 files changed, 329 insertions(+)
create mode 100644 egs/tedlium3/ASR/README.md
create mode 100644 egs/tedlium3/ASR/RESULTS.md
create mode 100644 egs/tedlium3/ASR/prepare.sh
diff --git a/egs/tedlium3/ASR/README.md b/egs/tedlium3/ASR/README.md
new file mode 100644
index 000000000..57bd9458b
--- /dev/null
+++ b/egs/tedlium3/ASR/README.md
@@ -0,0 +1,18 @@
+
+# Introduction
+
+This recipe includes several ASR models trained on the TedLium3 corpus.
+
+# Transducers
+
+There are various folders in this directory whose names contain `transducer`.
+The following table lists the differences among them.
+
+| | Encoder | Decoder |
+|------------------------|-----------|--------------------|
+| `transducer_stateless` | Conformer | Embedding + Conv1d |
+
+
+The decoder in `transducer_stateless` is modified from the paper
+[Rnn-Transducer with Stateless Prediction Network](https://ieeexplore.ieee.org/document/9054419/).
+We place an additional Conv1d layer right after the input embedding layer.
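+
+As a rough illustration (a minimal sketch with assumed names and
+dimensions, not the actual icefall implementation), the decoder
+amounts to:
+
+```python
+import torch
+import torch.nn as nn
+
+
+class StatelessDecoder(nn.Module):
+    """An embedding layer followed by a Conv1d; no recurrent state."""
+
+    def __init__(self, vocab_size: int, embed_dim: int, context_size: int = 2):
+        super().__init__()
+        self.context_size = context_size
+        self.embedding = nn.Embedding(vocab_size, embed_dim)
+        # kernel_size=2 means the output at position u depends only on
+        # the two most recent labels, replacing an LSTM prediction network.
+        self.conv = nn.Conv1d(embed_dim, embed_dim, kernel_size=context_size)
+
+    def forward(self, y: torch.Tensor) -> torch.Tensor:
+        # y: (N, U) previously emitted labels
+        emb = self.embedding(y).permute(0, 2, 1)                   # (N, C, U)
+        emb = nn.functional.pad(emb, (self.context_size - 1, 0))   # causal pad
+        return self.conv(emb).permute(0, 2, 1)                     # (N, U, C)
+```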
diff --git a/egs/tedlium3/ASR/RESULTS.md b/egs/tedlium3/ASR/RESULTS.md
new file mode 100644
index 000000000..5b1beb6de
--- /dev/null
+++ b/egs/tedlium3/ASR/RESULTS.md
@@ -0,0 +1,68 @@
+## Results
+
+### TedLium3 BPE training results (Transducer)
+
+#### Conformer encoder + embedding decoder
+
+Using the code from this commit: https://github.com/k2-fsa/icefall/pull/183/commits/536ad2252e2d406f24a681743d98bd5f90801b97.
+
+Conformer encoder + non-recurrent decoder. The decoder
+contains only an embedding layer and a Conv1d (with kernel size 2).
+
+The WERs are
+
+| | dev | test | comment |
+|------------------------------------|------------|------------|------------------------------------------|
+| greedy search | 7.19 | 6.57 | --epoch 29, --avg 16, --max-duration 100 |
+| beam search (beam size 4) | 7.12 | 6.37 | --epoch 29, --avg 16, --max-duration 100 |
+| modified beam search (beam size 4) | 7.00 | 6.19 | --epoch 29, --avg 16, --max-duration 100 |
+
+The training command for reproducing is given below:
+
+```bash
+export CUDA_VISIBLE_DEVICES="0,1,2,3"
+
+./transducer_stateless/train.py \
+ --world-size 4 \
+ --num-epochs 30 \
+ --start-epoch 0 \
+ --exp-dir transducer_stateless/exp \
+  --max-duration 200
+```
+
+The tensorboard training log can be found at
+https://tensorboard.dev/experiment/DnRwoZF8RRyod4kkfG5q5Q/#scalars
+
+The decoding command is:
+```bash
+epoch=29
+avg=16
+
+## greedy search
+./transducer_stateless/decode.py \
+ --epoch $epoch \
+ --avg $avg \
+ --exp-dir transducer_stateless/exp \
+ --bpe-model ./data/lang_bpe_500/bpe.model \
+ --max-duration 100
+
+## beam search
+./transducer_stateless/decode.py \
+ --epoch $epoch \
+ --avg $avg \
+ --exp-dir transducer_stateless/exp \
+ --bpe-model ./data/lang_bpe_500/bpe.model \
+ --max-duration 100 \
+ --decoding-method beam_search \
+ --beam-size 4
+
+## modified beam search
+./transducer_stateless/decode.py \
+ --epoch $epoch \
+ --avg $avg \
+ --exp-dir transducer_stateless/exp \
+ --bpe-model ./data/lang_bpe_500/bpe.model \
+ --max-duration 100 \
+  --decoding-method modified_beam_search \
+ --beam-size 4
+```
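+
+For intuition, here is a hypothetical, simplified sketch of transducer
+greedy search (emitting at most one non-blank symbol per frame).
+`model.decoder`, `model.joiner`, and the shapes noted below are
+assumptions for illustration, not the exact icefall API:
+
+```python
+import torch
+
+
+def greedy_search(model, encoder_out, blank_id=0, context_size=2):
+    """encoder_out: (T, C) encoder frames for a single utterance."""
+    hyp = [blank_id] * context_size  # priming context for the stateless decoder
+    for t in range(encoder_out.size(0)):
+        y = torch.tensor([hyp[-context_size:]])       # (1, context_size)
+        decoder_out = model.decoder(y)[:, -1]         # (1, C)
+        logits = model.joiner(encoder_out[t : t + 1], decoder_out)  # (1, vocab)
+        token = logits.argmax(dim=-1).item()
+        if token != blank_id:  # emit a symbol; otherwise advance one frame
+            hyp.append(token)
+    return hyp[context_size:]
+```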
diff --git a/egs/tedlium3/ASR/prepare.sh b/egs/tedlium3/ASR/prepare.sh
new file mode 100644
index 000000000..9a643139f
--- /dev/null
+++ b/egs/tedlium3/ASR/prepare.sh
@@ -0,0 +1,243 @@
+#!/usr/bin/env bash
+
+set -eou pipefail
+
+nj=15
+stage=-1
+stop_stage=100
+
+# We assume dl_dir (download dir) contains the following
+# directories and files. If not, they will be downloaded
+# by this script automatically.
+#
+# - $dl_dir/tedlium3
+#   You can find data, doc, legacy, LM, etc., inside it.
+# You can download them from https://www.openslr.org/51
+#
+# - $dl_dir/lm
+#   This directory contains the language model (LM) files downloaded from
+#   https://huggingface.co/luomingshuang/tedlium3_lm. See
+#   https://github.com/luomingshuang/Train_LM_with_kaldilm for how these
+#   LM files were generated.
+#
+# - lm_3_gram.arpa
+# - lm_4_gram.arpa
+#
+# - $dl_dir/musan
+# This directory contains the following directories downloaded from
+# http://www.openslr.org/17/
+#
+# - music
+# - noise
+# - speech
+dl_dir=$PWD/download
+
+. shared/parse_options.sh || exit 1
+
+# Vocabulary sizes for the sentence piece models.
+# data/lang_bpe_xxx and data/lang_bpe_yyy will be generated
+# if the array contains xxx and yyy.
+vocab_sizes=(
+ 5000
+ 2000
+ 1000
+ 500
+)
+
+# All files generated by this script are saved in "data".
+# You can safely remove "data" and rerun this script to regenerate it.
+mkdir -p data
+
+log() {
+ # This function is from espnet
+ local fname=${BASH_SOURCE[1]##*/}
+ echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+
+log "dl_dir: $dl_dir"
+
+if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
+ log "Stage -1: Download LM"
+  # We assume that you have installed git-lfs; if not, you can install it
+  # via: `sudo apt-get install git-lfs && git-lfs install`
+ [ ! -e $dl_dir/lm ] && mkdir -p $dl_dir/lm
+ git clone https://huggingface.co/luomingshuang/tedlium3_lm $dl_dir/lm
+  (cd $dl_dir/lm && git lfs pull)
+
+  # If you want to download the Tedlium 4-gram language models yourself,
+  # use the following commands:
+ #wget --continue http://kaldi-asr.org/models/5/4gram_small.arpa.gz -P $dl_dir/lm/ || exit 1
+ #wget --continue http://kaldi-asr.org/models/5/4gram_big.arpa.gz -P $dl_dir/lm/ || exit 1
+fi
+
+if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
+ log "Stage 0: Download data"
+
+  # If you have pre-downloaded it to /path/to/tedlium3,
+ # you can create a symlink
+ #
+ # ln -sfv /path/to/tedlium3 $dl_dir/tedlium3
+ #
+  if [ ! -d $dl_dir/tedlium3 ]; then
+    lhotse download tedlium $dl_dir
+    # lhotse extracts the corpus to TEDLIUM_release-3; rename it to
+    # match the directory name used by later stages.
+    mv $dl_dir/TEDLIUM_release-3 $dl_dir/tedlium3
+  fi
+
+ # If you have pre-downloaded it to /path/to/musan,
+ # you can create a symlink
+ #
+  # ln -sfv /path/to/musan $dl_dir/musan
+
+ if [ ! -d $dl_dir/musan ]; then
+ lhotse download musan $dl_dir
+ fi
+fi
+
+if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
+ log "Stage 1: Prepare tedlium3 manifest"
+ # We assume that you have downloaded the tedlium3 corpus
+ # to $dl_dir/tedlium3
+ mkdir -p data/manifests
+ lhotse prepare tedlium $dl_dir/tedlium3 data/manifests
+fi
+
+if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
+ log "Stage 2: Prepare musan manifest"
+  # We assume that you have downloaded the musan corpus
+  # to $dl_dir/musan
+ mkdir -p data/manifests
+ lhotse prepare musan $dl_dir/musan data/manifests
+fi
+
+if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
+ log "Stage 3: Compute fbank for tedlium3"
+ mkdir -p data/fbank
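+  # ./local/compute_fbank_tedlium.py reads the manifests generated in
+  # stage 1 and (roughly speaking) uses lhotse's
+  # CutSet.compute_and_store_features() to write filter-bank features
+  # to data/fbank; see that script for the exact details.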
+ ./local/compute_fbank_tedlium.py
+fi
+
+if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
+ log "Stage 4: Compute fbank for musan"
+ mkdir -p data/fbank
+ ./local/compute_fbank_musan.py
+fi
+
+if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
+ log "Stage 5: Prepare phone based lang"
+ lang_dir=data/lang_phone
+ mkdir -p $lang_dir
+
+ if [ ! -f $lang_dir/train.text ]; then
+ ./local/prepare_transcripts.py \
+ --lang-dir $lang_dir \
+ --manifests-dir data/manifests
+
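+    # The TEDLIUM dictionary marks alternative pronunciations with a
+    # trailing (2), (3), ...; the sed command below strips those markers.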
+    cat download/tedlium3/TEDLIUM.152k.dic |
+      grep -v -w "<s>" |
+      grep -v -w "</s>" |
+      grep -v -w "<unk>" |
+      LANG= LC_ALL= sort |
+      sed 's:([0-9])::g' > $lang_dir/lexicon_words.txt
+
+    (echo '<unk> <unk>'; ) |
+      cat - $lang_dir/lexicon_words.txt |
+      sort | uniq > $lang_dir/lexicon.txt
+  fi
+
+  if [ ! -f $lang_dir/L_disambig.pt ]; then
+    ./local/prepare_lang.py --lang-dir $lang_dir
+  fi
+fi
+
+if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
+ log "Stage 6: Prepare BPE based lang"
+
+ for vocab_size in ${vocab_sizes[@]}; do
+ lang_dir=data/lang_bpe_${vocab_size}
+ mkdir -p $lang_dir
+ # We reuse words.txt from phone based lexicon
+ # so that the two can share G.pt later.
+ cp data/lang_phone/words.txt $lang_dir
+
+ if [ ! -f $lang_dir/transcript_words.txt ]; then
+ log "Generate data for BPE training"
+      cat data/lang_phone/train.text | cut -d " " -f 2- \
+        > $lang_dir/transcript_words.txt
+      # Remove the <unk> tokens from transcript_words.txt
+      sed -i 's/ <unk>//g' $lang_dir/transcript_words.txt
+      sed -i 's/<unk> //g' $lang_dir/transcript_words.txt
+      sed -i 's/<unk>//g' $lang_dir/transcript_words.txt
+ fi
+
+ ./local/train_bpe_model.py \
+ --lang-dir $lang_dir \
+ --vocab-size $vocab_size \
+ --transcript $lang_dir/transcript_words.txt
+
+ if [ ! -f $lang_dir/L_disambig.pt ]; then
+ ./local/prepare_lang_bpe.py --lang-dir $lang_dir
+ fi
+ done
+fi
+
+if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
+ log "Stage 7: Prepare bigram P"
+
+ for vocab_size in ${vocab_sizes[@]}; do
+ lang_dir=data/lang_bpe_${vocab_size}
+
+ if [ ! -f $lang_dir/transcript_tokens.txt ]; then
+ ./local/convert_transcript_words_to_tokens.py \
+ --lexicon $lang_dir/lexicon.txt \
+ --transcript $lang_dir/transcript_words.txt \
+ --oov "" \
+ > $lang_dir/transcript_tokens.txt
+ fi
+
+ if [ ! -f $lang_dir/P.arpa ]; then
+ ./shared/make_kn_lm.py \
+ -ngram-order 2 \
+ -text $lang_dir/transcript_tokens.txt \
+ -lm $lang_dir/P.arpa
+ fi
+
+ if [ ! -f $lang_dir/P.fst.txt ]; then
+ python3 -m kaldilm \
+ --read-symbol-table="$lang_dir/tokens.txt" \
+ --disambig-symbol='#0' \
+ --max-order=2 \
+ $lang_dir/P.arpa > $lang_dir/P.fst.txt
+ fi
+ done
+fi
+
+if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then
+ log "Stage 8: Prepare G"
+  # We assume you have installed kaldilm; if not, please install it
+  # via: pip install kaldilm
+
+ mkdir -p data/lm
+ if [ ! -f data/lm/G_3_gram.fst.txt ]; then
+ # It is used in building HLG
+ python3 -m kaldilm \
+ --read-symbol-table="data/lang_phone/words.txt" \
+ --disambig-symbol='#0' \
+ --max-order=3 \
+      $dl_dir/lm/lm_3_gram.arpa > data/lm/G_3_gram.fst.txt
+ fi
+
+ if [ ! -f data/lm/G_4_gram.fst.txt ]; then
+ # It is used for LM rescoring
+ python3 -m kaldilm \
+ --read-symbol-table="data/lang_phone/words.txt" \
+ --disambig-symbol='#0' \
+ --max-order=4 \
+      $dl_dir/lm/lm_4_gram.arpa > data/lm/G_4_gram.fst.txt
+ fi
+fi
+if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then
+ log "Stage 9: Compile HLG"
+ ./local/compile_hlg.py --lang-dir data/lang_phone
+
+ for vocab_size in ${vocab_sizes[@]}; do
+ lang_dir=data/lang_bpe_${vocab_size}
+ ./local/compile_hlg.py --lang-dir $lang_dir
+ done
+fi