diff --git a/egs/multi_zh-hans/ASR/README.md b/egs/multi_zh-hans/ASR/README.md
index 1f0892a89..537816a5d 100644
--- a/egs/multi_zh-hans/ASR/README.md
+++ b/egs/multi_zh-hans/ASR/README.md
@@ -27,7 +27,7 @@ This recipe includes scripts for training Zipformer model using multiple Chinese
 |MagicData|755|https://www.openslr.org/68/|
 |AliMeeting|100|https://openslr.org/119/|
 |WeNetSpeech|10,000|https://github.com/wenet-e2e/WenetSpeech|
-|KeSpeech|1,542|https://openreview.net/forum?id=b3Zoeq2sCLq|
+|KeSpeech|1,542|https://github.com/KeSpeech/KeSpeech|
 
 # Included Test Sets
 
diff --git a/egs/multi_zh-hans/ASR/local/compute_fbank_magicdata.py b/egs/multi_zh-hans/ASR/local/compute_fbank_magicdata.py
index a0ea24d57..5649d3815 100755
--- a/egs/multi_zh-hans/ASR/local/compute_fbank_magicdata.py
+++ b/egs/multi_zh-hans/ASR/local/compute_fbank_magicdata.py
@@ -80,7 +80,7 @@ def compute_fbank_magicdata(num_mel_bins: int = 80, speed_perturb: bool = False)
         )
         if "train" in partition and speed_perturb:
             cut_set = (
-                (cut_set + cut_set.perturb_speed(0.9) + cut_set.perturb_speed(1.1))
+                cut_set + cut_set.perturb_speed(0.9) + cut_set.perturb_speed(1.1)
             )
         cut_set = cut_set.compute_and_store_features(
             extractor=extractor,
@@ -117,6 +117,6 @@ if __name__ == "__main__":
     logging.basicConfig(format=formatter, level=logging.INFO)
 
     args = get_args()
-    compute_fbank_thchs30(
+    compute_fbank_magicdata(
         num_mel_bins=args.num_mel_bins, speed_perturb=args.speed_perturb
     )
diff --git a/egs/multi_zh-hans/ASR/local/compute_fbank_primewords.py b/egs/multi_zh-hans/ASR/local/compute_fbank_primewords.py
index 32dd1d81a..13fdb036e 100755
--- a/egs/multi_zh-hans/ASR/local/compute_fbank_primewords.py
+++ b/egs/multi_zh-hans/ASR/local/compute_fbank_primewords.py
@@ -117,6 +117,6 @@ if __name__ == "__main__":
     logging.basicConfig(format=formatter, level=logging.INFO)
 
     args = get_args()
-    compute_fbank_thchs30(
+    compute_fbank_primewords(
         num_mel_bins=args.num_mel_bins, speed_perturb=args.speed_perturb
     )
diff --git a/egs/multi_zh-hans/ASR/local/compute_fbank_stcmds.py b/egs/multi_zh-hans/ASR/local/compute_fbank_stcmds.py
index 34442e787..730806954 100755
--- a/egs/multi_zh-hans/ASR/local/compute_fbank_stcmds.py
+++ b/egs/multi_zh-hans/ASR/local/compute_fbank_stcmds.py
@@ -80,7 +80,7 @@ def compute_fbank_stcmds(num_mel_bins: int = 80, speed_perturb: bool = False):
         )
         if "train" in partition and speed_perturb:
             cut_set = (
-                (cut_set + cut_set.perturb_speed(0.9) + cut_set.perturb_speed(1.1))
+                cut_set + cut_set.perturb_speed(0.9) + cut_set.perturb_speed(1.1)
             )
         cut_set = cut_set.compute_and_store_features(
             extractor=extractor,
@@ -116,6 +116,6 @@ if __name__ == "__main__":
     logging.basicConfig(format=formatter, level=logging.INFO)
 
     args = get_args()
-    compute_fbank_thchs30(
+    compute_fbank_stcmds(
         num_mel_bins=args.num_mel_bins, speed_perturb=args.speed_perturb
     )
diff --git a/egs/multi_zh-hans/ASR/local/prepare_for_bpe_model.py b/egs/multi_zh-hans/ASR/local/prepare_for_bpe_model.py
index 1d6934b61..328bb4809 100755
--- a/egs/multi_zh-hans/ASR/local/prepare_for_bpe_model.py
+++ b/egs/multi_zh-hans/ASR/local/prepare_for_bpe_model.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-# Copyright 2021 Xiaomi Corp. (authors: Zengrui Jin)
+# Copyright 2023 Xiaomi Corp. (authors: Zengrui Jin)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
@@ -15,10 +15,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+# This script tokenizes the training transcript by CJK characters
+# and saves the result to transcript_chars.txt, which is used
+# to train the BPE model later.
+
 import argparse
 from pathlib import Path
 
 from tqdm.auto import tqdm
+
 from icefall.utils import tokenize_by_CJK_char
 
 
@@ -52,11 +57,8 @@ def main():
     with open(text, "r", encoding="utf-8") as fin:
         text_lines = fin.readlines()
 
-    tokenized_lines = []
-    for line in tqdm(text_lines, desc="Tokenizing training transcript"):
-        tokenized_lines.append(f"{tokenize_by_CJK_char(line)}\n")
     with open(transcript_path, "w+", encoding="utf-8") as fout:
-        fout.writelines(tokenized_lines)
+        fout.writelines([f"{tokenize_by_CJK_char(line)}\n" for line in text_lines])
 
 
 if __name__ == "__main__":
diff --git a/egs/multi_zh-hans/ASR/local/preprocess_kespeech.py b/egs/multi_zh-hans/ASR/local/preprocess_kespeech.py
index 5d871a5c6..c434ead7e 100755
--- a/egs/multi_zh-hans/ASR/local/preprocess_kespeech.py
+++ b/egs/multi_zh-hans/ASR/local/preprocess_kespeech.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3
 # Copyright 2021 Johns Hopkins University (Piotr Żelasko)
 # Copyright 2021 Xiaomi Corp. (Fangjun Kuang)
+# Copyright 2023 Xiaomi Corp. (Zengrui Jin)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
@@ -32,7 +33,6 @@ from icefall import setup_logger
 
 def normalize_text(
     utt: str,
-    # punct_pattern=re.compile(r"<(COMMA|PERIOD|QUESTIONMARK|EXCLAMATIONPOINT)>"),
     punct_pattern=re.compile(r"<(PERIOD|QUESTIONMARK|EXCLAMATIONPOINT)>"),
     whitespace_pattern=re.compile(r"\s\s+"),
 ) -> str:
diff --git a/egs/multi_zh-hans/ASR/local/train_bpe_model.py b/egs/multi_zh-hans/ASR/local/train_bpe_model.py
index b651fc290..976ea0ba8 100755
--- a/egs/multi_zh-hans/ASR/local/train_bpe_model.py
+++ b/egs/multi_zh-hans/ASR/local/train_bpe_model.py
@@ -1,5 +1,6 @@
 #!/usr/bin/env python3
 # Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang)
+# Copyright 2023 Xiaomi Corp. (authors: Zengrui Jin)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
diff --git a/egs/multi_zh-hans/ASR/prepare.sh b/egs/multi_zh-hans/ASR/prepare.sh
index ccc1e5ea4..8bd52b599 100755
--- a/egs/multi_zh-hans/ASR/prepare.sh
+++ b/egs/multi_zh-hans/ASR/prepare.sh
@@ -5,7 +5,6 @@ export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
 
 set -eou pipefail
 
-nj=16
 stage=-1
 stop_stage=100
 num_splits=100
@@ -256,11 +255,12 @@
   log "Stage 12: Prepare KeSpeech"
   if [ ! -d $dl_dir/KeSpeech ]; then
     log "Abort! Please download KeSpeech first."
+    log "KeSpeech download link: https://github.com/KeSpeech/KeSpeech"
   fi
 
   if [ ! -f data/manifests/.kespeech.done ]; then
     mkdir -p data/manifests
-    lhotse prepare kespeech -j $nj $dl_dir/KeSpeech data/manifests/kespeech
+    lhotse prepare kespeech -j 16 $dl_dir/KeSpeech data/manifests/kespeech
     touch data/manifests/.kespeech.done
   fi
 
@@ -303,7 +303,7 @@
 fi
 
 if [ $stage -le 13 ] && [ $stop_stage -ge 13 ]; then
-  log "Stage 13: BPE model training"
+  log "Stage 13: BPE model training (note that we use transcripts of wenetspeech only for BPE training)"
   ./local/prepare_for_bpe_model.py --lang-dir ./data/lang_char --text ./data/lang_char/text
 
   for vocab_size in ${vocab_sizes[@]}; do
@@ -348,7 +348,7 @@
 fi
 
 if [ $stage -le 14 ] && [ $stop_stage -ge 14 ]; then
-  log "Stage 14: Prepare G"
+  log "Stage 14: Prepare G (note that we use ngram lm of wenetspeech only for G preparation)"
 
   if [ -d ../../wenetspeech/ASR/data/lang_char/ ]; then
     cd data
diff --git a/egs/multi_zh-hans/ASR/zipformer/asr_datamodule.py b/egs/multi_zh-hans/ASR/zipformer/asr_datamodule.py
index 3518eee3f..b1b7bff93 100644
--- a/egs/multi_zh-hans/ASR/zipformer/asr_datamodule.py
+++ b/egs/multi_zh-hans/ASR/zipformer/asr_datamodule.py
@@ -322,7 +322,7 @@ class AsrDataModule:
             sampler=train_sampler,
             batch_size=None,
             num_workers=self.args.num_workers,
-            persistent_workers=False,
+            persistent_workers=True,
             worker_init_fn=worker_init_fn,
         )
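
A note on the speed-perturbation hunks above: the outer parentheses that were removed were redundant, because concatenating lhotse CutSets with "+" already yields a single CutSet. A minimal sketch of the tripling idiom, assuming a hypothetical manifest path:

# Sketch of the 3x speed-perturbation idiom from the compute_fbank_* scripts.
from lhotse import CutSet

cut_set = CutSet.from_file("data/fbank/cuts_train.jsonl.gz")  # hypothetical path
cut_set = cut_set + cut_set.perturb_speed(0.9) + cut_set.perturb_speed(1.1)
# The result holds the original cuts plus 0.9x and 1.1x speed variants,
# roughly tripling the training audio before feature extraction.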
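
The prepare_for_bpe_model.py hunks collapse the tokenization loop into a list comprehension around tokenize_by_CJK_char. A hedged sketch of the intended behavior; the sample strings and the expected output are assumptions, not icefall test cases:

# tokenize_by_CJK_char splits CJK characters into individual space-separated
# tokens while leaving non-CJK words intact (behavior inferred from its use
# in this patch; the example strings below are illustrative assumptions).
from icefall.utils import tokenize_by_CJK_char

lines = ["你好hello世界\n", "这是中文\n"]
tokenized = [f"{tokenize_by_CJK_char(line)}\n" for line in lines]
# expected (assumed): ["你 好 hello 世 界\n", "这 是 中 文\n"]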
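
For the asr_datamodule.py hunk: persistent_workers=True keeps DataLoader worker processes alive across epochs instead of re-forking them each epoch, which avoids repeated dataset and sampler re-initialization. A standalone PyTorch sketch; the toy dataset is illustrative, not from the recipe:

import torch
from torch.utils.data import DataLoader, Dataset

class ToyDataset(Dataset):  # illustrative stand-in for the ASR dataset
    def __len__(self):
        return 8

    def __getitem__(self, idx):
        return torch.tensor(idx)

dl = DataLoader(
    ToyDataset(),
    batch_size=2,
    num_workers=2,
    persistent_workers=True,  # requires num_workers > 0; workers survive epochs
)
for epoch in range(2):
    for batch in dl:  # the second epoch reuses the same worker processes
        pass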