minor fixes

zr_jin 2023-09-02 17:41:57 +08:00
parent 28a361795f
commit 9d25e6e7f0
9 changed files with 20 additions and 17 deletions

View File

@@ -27,7 +27,7 @@ This recipe includes scripts for training Zipformer model using multiple Chinese
 |MagicData|755|https://www.openslr.org/68/|
 |AliMeeting|100|https://openslr.org/119/|
 |WeNetSpeech|10,000|https://github.com/wenet-e2e/WenetSpeech|
-|KeSpeech|1,542|https://openreview.net/forum?id=b3Zoeq2sCLq|
+|KeSpeech|1,542|https://github.com/KeSpeech/KeSpeech|
 
 # Included Test Sets

View File

@@ -80,7 +80,7 @@ def compute_fbank_magicdata(num_mel_bins: int = 80, speed_perturb: bool = False)
             )
             if "train" in partition and speed_perturb:
                 cut_set = (
-                    (cut_set + cut_set.perturb_speed(0.9) + cut_set.perturb_speed(1.1))
+                    cut_set + cut_set.perturb_speed(0.9) + cut_set.perturb_speed(1.1)
                 )
             cut_set = cut_set.compute_and_store_features(
                 extractor=extractor,
@@ -117,6 +117,6 @@ if __name__ == "__main__":
     logging.basicConfig(format=formatter, level=logging.INFO)
     args = get_args()
-    compute_fbank_thchs30(
+    compute_fbank_magicdata(
         num_mel_bins=args.num_mel_bins, speed_perturb=args.speed_perturb
     )
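The two hunks above hold the actual fixes in this script: the redundant outer parentheses around the perturbed CutSet sum are removed, and the `__main__` block now calls `compute_fbank_magicdata()` instead of the copy-pasted `compute_fbank_thchs30()`. For context, a minimal sketch of the lhotse speed-perturbation idiom this hunk touches, assuming an existing cut manifest (the path below is a placeholder, not taken from the recipe):

```python
# Sketch of the speed-perturbation idiom from the hunk above (lhotse).
# The manifest path is hypothetical.
from lhotse import CutSet

cut_set = CutSet.from_file("data/manifests/magicdata_cuts_train.jsonl.gz")

# perturb_speed() lazily resamples each cut at the given factor; "+"
# concatenates CutSets, so the training set grows to roughly 3x its size.
cut_set = cut_set + cut_set.perturb_speed(0.9) + cut_set.perturb_speed(1.1)
```

The same two fixes repeat below for the Primewords and ST-CMDS scripts.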

View File

@@ -117,6 +117,6 @@ if __name__ == "__main__":
     logging.basicConfig(format=formatter, level=logging.INFO)
     args = get_args()
-    compute_fbank_thchs30(
+    compute_fbank_primewords(
         num_mel_bins=args.num_mel_bins, speed_perturb=args.speed_perturb
     )

View File

@@ -80,7 +80,7 @@ def compute_fbank_stcmds(num_mel_bins: int = 80, speed_perturb: bool = False):
             )
             if "train" in partition and speed_perturb:
                 cut_set = (
-                    (cut_set + cut_set.perturb_speed(0.9) + cut_set.perturb_speed(1.1))
+                    cut_set + cut_set.perturb_speed(0.9) + cut_set.perturb_speed(1.1)
                 )
             cut_set = cut_set.compute_and_store_features(
                 extractor=extractor,
@@ -116,6 +116,6 @@ if __name__ == "__main__":
     logging.basicConfig(format=formatter, level=logging.INFO)
     args = get_args()
-    compute_fbank_thchs30(
+    compute_fbank_stcmds(
         num_mel_bins=args.num_mel_bins, speed_perturb=args.speed_perturb
     )

View File

@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-# Copyright 2021 Xiaomi Corp. (authors: Zengrui Jin)
+# Copyright 2023 Xiaomi Corp. (authors: Zengrui Jin)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
@@ -15,10 +15,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+# This script tokenizes the training transcript by CJK characters
+# and saves the result to transcript_chars.txt, which is used
+# to train the BPE model later.
+
 import argparse
 from pathlib import Path
 
 from tqdm.auto import tqdm
 
 from icefall.utils import tokenize_by_CJK_char
@@ -52,11 +57,8 @@ def main():
     with open(text, "r", encoding="utf-8") as fin:
         text_lines = fin.readlines()
-        tokenized_lines = []
-        for line in tqdm(text_lines, desc="Tokenizing training transcript"):
-            tokenized_lines.append(f"{tokenize_by_CJK_char(line)}\n")
     with open(transcript_path, "w+", encoding="utf-8") as fout:
-        fout.writelines(tokenized_lines)
+        fout.writelines([f"{tokenize_by_CJK_char(line)}\n" for line in text_lines])
 
 
 if __name__ == "__main__":
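Net effect in this file: the copyright year is updated, a header comment documents the script's purpose, and the tqdm progress loop is collapsed into a single list comprehension over the input lines. A quick sketch of what `tokenize_by_CJK_char` (an icefall utility) produces; the sample string is illustrative:

```python
# Illustration of the icefall tokenizer used above; requires icefall on
# PYTHONPATH. The input string is made up for this example.
from icefall.utils import tokenize_by_CJK_char

line = "你好世界 hello world"
print(tokenize_by_CJK_char(line))
# Expected: "你 好 世 界 hello world" -- each CJK character becomes its
# own token while non-CJK words stay intact.
```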

View File

@@ -1,6 +1,7 @@
 #!/usr/bin/env python3
 # Copyright 2021 Johns Hopkins University (Piotr Żelasko)
 # Copyright 2021 Xiaomi Corp. (Fangjun Kuang)
+# Copyright 2023 Xiaomi Corp. (Zengrui Jin)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
@@ -32,7 +33,6 @@ from icefall import setup_logger
 def normalize_text(
     utt: str,
-    # punct_pattern=re.compile(r"<(COMMA|PERIOD|QUESTIONMARK|EXCLAMATIONPOINT)>"),
     punct_pattern=re.compile(r"<(PERIOD|QUESTIONMARK|EXCLAMATIONPOINT)>"),
     whitespace_pattern=re.compile(r"\s\s+"),
 ) -> str:
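This hunk only deletes a stale commented-out default (the variant that also stripped `<COMMA>` tags); the active pattern is unchanged. The function body is not part of the hunk, so the sketch below of how such defaults are typically applied is an assumption, not the file's actual implementation:

```python
import re

def normalize_text(
    utt: str,
    punct_pattern=re.compile(r"<(PERIOD|QUESTIONMARK|EXCLAMATIONPOINT)>"),
    whitespace_pattern=re.compile(r"\s\s+"),
) -> str:
    # Assumed body: drop the tagged punctuation tokens, then collapse
    # the double spaces they leave behind.
    return whitespace_pattern.sub(" ", punct_pattern.sub("", utt)).strip()

print(normalize_text("HELLO <COMMA> WORLD <PERIOD>"))
# -> "HELLO <COMMA> WORLD": <COMMA> tags are kept by the active pattern.
```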

View File

@@ -1,5 +1,6 @@
 #!/usr/bin/env python3
 # Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang)
+# Copyright 2023 Xiaomi Corp. (authors: Zengrui Jin)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #

View File

@@ -5,7 +5,6 @@ export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
 set -eou pipefail
 
-nj=16
 stage=-1
 stop_stage=100
 num_splits=100
@@ -256,11 +255,12 @@ if [ $stage -le 12 ] && [ $stop_stage -ge 12 ]; then
   log "Stage 12: Prepare KeSpeech"
   if [ ! -d $dl_dir/KeSpeech ]; then
     log "Abort! Please download KeSpeech first."
+    log "KeSpeech download link: https://github.com/KeSpeech/KeSpeech"
   fi
 
   if [ ! -f data/manifests/.kespeech.done ]; then
     mkdir -p data/manifests
-    lhotse prepare kespeech -j $nj $dl_dir/KeSpeech data/manifests/kespeech
+    lhotse prepare kespeech -j 16 $dl_dir/KeSpeech data/manifests/kespeech
     touch data/manifests/.kespeech.done
   fi
@@ -303,7 +303,7 @@ if [ $stage -le 12 ] && [ $stop_stage -ge 12 ]; then
 fi
 
 if [ $stage -le 13 ] && [ $stop_stage -ge 13 ]; then
-  log "Stage 13: BPE model training"
+  log "Stage 13: BPE model training (note that we use transcripts of wenetspeech only for BPE training)"
   ./local/prepare_for_bpe_model.py --lang-dir ./data/lang_char --text ./data/lang_char/text
 
   for vocab_size in ${vocab_sizes[@]}; do
@@ -348,7 +348,7 @@ if [ $stage -le 13 ] && [ $stop_stage -ge 13 ]; then
 fi
 
 if [ $stage -le 14 ] && [ $stop_stage -ge 14 ]; then
-  log "Stage 14: Prepare G"
+  log "Stage 14: Prepare G (note that we use ngram lm of wenetspeech only for G preparation)"
 
   if [ -d ../../wenetspeech/ASR/data/lang_char/ ]; then
     cd data
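Stage 13 first tokenizes the WenetSpeech-only transcript with `local/prepare_for_bpe_model.py` (the script shown earlier), then trains one BPE model per vocabulary size. A hedged sketch of the underlying sentencepiece call; the paths, model type, and options are illustrative, not the recipe's exact flags:

```python
# Sketch of the BPE training that Stage 13 drives, via sentencepiece.
# All paths and options here are assumptions for illustration.
import sentencepiece as spm

spm.SentencePieceTrainer.train(
    input="data/lang_char/transcript_chars.txt",  # output of prepare_for_bpe_model.py
    model_prefix="data/lang_bpe_2000/bpe",        # hypothetical output prefix
    vocab_size=2000,
    model_type="bpe",
    character_coverage=0.98,  # <1.0 lets very rare CJK chars map to <unk>
)
```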

View File

@@ -322,7 +322,7 @@ class AsrDataModule:
             sampler=train_sampler,
             batch_size=None,
             num_workers=self.args.num_workers,
-            persistent_workers=False,
+            persistent_workers=True,
             worker_init_fn=worker_init_fn,
         )
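The final hunk flips `persistent_workers` on for the training DataLoader. With `persistent_workers=True`, PyTorch keeps worker processes alive across epochs instead of re-forking them each time the loader is re-iterated, which avoids repeated dataset and sampler re-initialization. A minimal, self-contained illustration with a toy dataset (the recipe's actual dataset and sampler are not reproduced here):

```python
# Toy illustration of persistent_workers; ToyDataset is a stand-in for
# the recipe's dataset, not part of icefall.
import torch
from torch.utils.data import DataLoader, Dataset

class ToyDataset(Dataset):
    def __len__(self):
        return 8

    def __getitem__(self, idx):
        return torch.tensor(idx)

dl = DataLoader(
    ToyDataset(),
    batch_size=2,
    num_workers=2,
    persistent_workers=True,  # workers survive between epochs
)

for epoch in range(2):  # no worker respawn between these epochs
    for batch in dl:
        pass
```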