Commit more scripts for wenetspeech kws recipe

2025-12-11 06:55:27 +00:00 · 2024-02-02 12:18:06 +08:00 · 2024-02-02 12:18:06 +08:00 · 8b65f4138b
commit 8b65f4138b
parent 4b3356307a
10 changed files with 2353 additions and 147 deletions
--- a/egs/gigaspeech/KWS/zipformer/finetune.py
+++ b/egs/gigaspeech/KWS/zipformer/finetune.py
@ -158,6 +158,13 @@ def get_parser():
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        "--bpe-model",
        type=str,
        default="data/lang_bpe_500/bpe.model",
        help="Path to the BPE model",
    )
    add_training_arguments(parser)
    add_model_arguments(parser)
    add_finetune_arguments(parser)
--- a/egs/wenetspeech/ASR/pruned_transducer_stateless2/asr_datamodule.py
+++ b/egs/wenetspeech/ASR/pruned_transducer_stateless2/asr_datamodule.py
@ -407,10 +407,3 @@ class WenetSpeechAsrDataModule:
    def test_meeting_cuts(self) -> List[CutSet]:
        logging.info("About to get TEST_MEETING cuts")
        return load_manifest_lazy(self.args.manifest_dir / "cuts_TEST_MEETING.jsonl.gz")
    @lru_cache()
    def test_open_commands_cuts(self) -> CutSet:
        logging.info("About to get open commands cuts")
        return load_manifest_lazy(
            self.args.manifest_dir / "open-commands-cn_cuts_test.jsonl.gz"
        )
--- a/egs/wenetspeech/KWS/shared
+++ b/egs/wenetspeech/KWS/shared
@ -0,0 +1 @@
 ../../../icefall/shared
--- a/egs/wenetspeech/KWS/zipformer/asr_datamodule.py
+++ b/egs/wenetspeech/KWS/zipformer/asr_datamodule.py
@ -1,4 +1,5 @@
 # Copyright      2021  Piotr Żelasko
 # Copyright      2024  Xiaomi Corporation     (Author: Wei Kang)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
@ -409,8 +410,50 @@ class WenetSpeechAsrDataModule:
        return load_manifest_lazy(self.args.manifest_dir / "cuts_TEST_MEETING.jsonl.gz")
    @lru_cache()
-    def test_open_commands_cuts(self) -> CutSet:
+    def cn_speech_commands_small_cuts(self) -> CutSet:
-        logging.info("About to get open commands cuts")
+        logging.info("About to get cn speech commands small cuts")
        return load_manifest_lazy(
-            self.args.manifest_dir / "open-commands-cn_cuts_test.jsonl.gz"
+            self.args.manifest_dir / "cn_speech_commands_cuts_small.jsonl.gz"
        )
    @lru_cache()
    def cn_speech_commands_large_cuts(self) -> CutSet:
        """Load ``cn_speech_commands_cuts_large.jsonl.gz`` from the manifest dir.

        The manifest is opened with ``load_manifest_lazy`` and the result is
        memoized by ``lru_cache``.
        """
        logging.info("About to get cn speech commands large cuts")
        manifest_path = (
            self.args.manifest_dir / "cn_speech_commands_cuts_large.jsonl.gz"
        )
        return load_manifest_lazy(manifest_path)
    @lru_cache()
    def nihaowenwen_dev_cuts(self) -> CutSet:
        """Load ``nihaowenwen_cuts_dev.jsonl.gz`` from the manifest dir.

        The manifest is opened with ``load_manifest_lazy`` and the result is
        memoized by ``lru_cache``.
        """
        logging.info("About to get nihaowenwen dev cuts")
        manifest_path = self.args.manifest_dir / "nihaowenwen_cuts_dev.jsonl.gz"
        return load_manifest_lazy(manifest_path)
    @lru_cache()
    def nihaowenwen_test_cuts(self) -> CutSet:
        """Load ``nihaowenwen_cuts_test.jsonl.gz`` from the manifest dir.

        The manifest is opened with ``load_manifest_lazy`` and the result is
        memoized by ``lru_cache``.
        """
        logging.info("About to get nihaowenwen test cuts")
        manifest_path = self.args.manifest_dir / "nihaowenwen_cuts_test.jsonl.gz"
        return load_manifest_lazy(manifest_path)
    @lru_cache()
    def nihaowenwen_train_cuts(self) -> CutSet:
        """Load ``nihaowenwen_cuts_train.jsonl.gz`` from the manifest dir.

        The manifest is opened with ``load_manifest_lazy`` and the result is
        memoized by ``lru_cache``.
        """
        logging.info("About to get nihaowenwen train cuts")
        manifest_path = self.args.manifest_dir / "nihaowenwen_cuts_train.jsonl.gz"
        return load_manifest_lazy(manifest_path)
    @lru_cache()
    def xiaoyun_clean_cuts(self) -> CutSet:
        """Load ``xiaoyun_cuts_clean.jsonl.gz`` from the manifest dir.

        The manifest is opened with ``load_manifest_lazy`` and the result is
        memoized by ``lru_cache``.
        """
        logging.info("About to get xiaoyun clean cuts")
        manifest_path = self.args.manifest_dir / "xiaoyun_cuts_clean.jsonl.gz"
        return load_manifest_lazy(manifest_path)
    @lru_cache()
    def xiaoyun_noisy_cuts(self) -> CutSet:
        """Load ``xiaoyun_cuts_noisy.jsonl.gz`` from the manifest dir.

        The manifest is opened with ``load_manifest_lazy`` and the result is
        memoized by ``lru_cache``.
        """
        logging.info("About to get xiaoyun noisy cuts")
        manifest_path = self.args.manifest_dir / "xiaoyun_cuts_noisy.jsonl.gz"
        return load_manifest_lazy(manifest_path)
--- a/egs/wenetspeech/KWS/zipformer/decode_pinyin.py
+++ b/egs/wenetspeech/KWS/zipformer/decode_pinyin.py
@ -178,16 +178,47 @@ def get_parser():
    )
    parser.add_argument(
-        "--keyword-file",
+        "--keywords-file",
        type=str,
        help="File contains keywords.",
    )
    parser.add_argument(
-        "--keyword-score",
+        "--test-set",
        type=str,
        default="small",
        help="small or large",
    )
    parser.add_argument(
        "--keywords-score",
        type=float,
-        default=0.75,
+        default=1.5,
-        help="The threshold (probability) to boost the keyword.",
+        help="""
        The default boosting score (token level) for keywords. it will boost the
        paths that match keywords to make them survive beam search.
        """,
    )
    parser.add_argument(
        "--keywords-threshold",
        type=float,
        default=0.35,
        help="The default threshold (probability) to trigger the keyword.",
    )
    parser.add_argument(
        "--keywords-version",
        type=str,
        default="",
        help="The keywords configuration version, just to save results to different files.",
    )
    parser.add_argument(
        "--num-tailing-blanks",
        type=int,
        default=1,
        help="The number of tailing blanks should have after hitting one keyword.",
    )
    add_model_arguments(parser)
@ -261,7 +292,7 @@ def decode_one_batch(
        model=model,
        encoder_out=encoder_out,
        encoder_out_lens=encoder_out_lens,
-        context_graph=kws_graph,
+        keywords_graph=kws_graph,
        beam=params.beam_size,
        num_tailing_blanks=8,
    )
@ -288,6 +319,7 @@ def decode_dataset(
    lexicon: Lexicon,
    kws_graph: ContextGraph,
    keywords: Set[str],
    test_only_keywords: bool,
 ) -> Dict[str, List[Tuple[List[str], List[str]]]]:
    """Decode dataset.
@ -342,32 +374,62 @@ def decode_dataset(
            hyp_words = [x[0] for x in hyp_words]
            this_batch.append((cut_id, ref_words, list("".join(hyp_words))))
            hyp_set = set(hyp_words)
-            hyp_str = " | ".join(hyp_words)
+            if len(hyp_words) > 1:
                logging.warning(
                    f"Cut {cut_id} triggers more than one keywords : {hyp_words},"
                    f"please check the transcript to see if it really has more "
                    f"than one keywords, if so consider splitting this audio and"
                    f"keep only one keyword for each audio."
                )
            hyp_str = " | ".join(
                hyp_words
            )  # The triggered keywords for this utterance.
            TP = False
            FP = False
            for x in hyp_set:
-                assert x in keywords, x
+                assert x in keywords, x  # can only trigger keywords
-                if x in ref_text and x in keywords:
+                if (test_only_keywords and x == ref_text) or (
-                    metric["all"].TP += 1
+                    not test_only_keywords and x in ref_text
                ):
                    TP = True
                    metric[x].TP += 1
                    metric[x].TP_list.append(f"({ref_text} -> {x})")
-                if x not in ref_text and x in keywords:
+                if (test_only_keywords and x != ref_text) or (
-                    metric["all"].FP += 1
+                    not test_only_keywords and x not in ref_text
                ):
                    FP = True
                    metric[x].FP += 1
-                    metric[x].FP_list.append(f"({ref_text} -> {x}/{cut_id})")
+                    metric[x].FP_list.append(f"({ref_text} -> {x})")
            if TP:
                metric["all"].TP += 1
            if FP:
                metric["all"].FP += 1
            TN = True  # all keywords are true negative then the summery is true negative.
            FN = False
            for x in keywords:
                if x not in ref_text and x not in hyp_set:
                    metric["all"].TN += 1
                    metric[x].TN += 1
                    continue
-                if x in ref_text:
+                TN = False
                if (test_only_keywords and x == ref_text) or (
                    not test_only_keywords and x in ref_text
                ):
                    fn = True
                    for y in hyp_set:
-                        if y in ref_text:
+                        if (test_only_keywords and y == ref_text) or (
                            not test_only_keywords and y in ref_text
                        ):
                            fn = False
                            break
-                    if fn and ref_text.endswith(x):
+                    if fn:
-                        metric["all"].FN += 1
+                        FN = True
                        metric[x].FN += 1
-                        metric[x].FN_list.append(f"({ref_text} -> {hyp_str}/{cut_id})")
+                        metric[x].FN_list.append(f"({ref_text} -> {hyp_str})")
            if TN:
                metric["all"].TN += 1
            if FN:
                metric["all"].FN += 1
        results.extend(this_batch)
@ -399,16 +461,17 @@ def save_results(
    metric_filename = params.res_dir / f"metric-{test_set_name}-{params.suffix}.txt"
    print_s = ""
    with open(metric_filename, "w") as of:
        width = 10
        for key, item in sorted(
            metric.items(), key=lambda x: (x[1].FP, x[1].FN), reverse=True
        ):
            acc = (item.TP + item.TN) / (item.TP + item.TN + item.FP + item.FN)
-            precision = (item.TP + 1) / (item.TP + item.FP + 1)
+            precision = (
-            recall = (item.TP + 1) / (item.TP + item.FN + 1)
+                0.0 if (item.TP + item.FP) == 0 else item.TP / (item.TP + item.FP)
-            fpr = (item.FP + 1) / (item.FP + item.TN + 1)
+            )
            recall = 0.0 if (item.TP + item.FN) == 0 else item.TP / (item.TP + item.FN)
            fpr = 0.0 if (item.FP + item.TN) == 0 else item.FP / (item.FP + item.TN)
            s = f"{key}:\n"
            s += f"\t{'TP':{width}}{'FP':{width}}{'FN':{width}}{'TN':{width}}\n"
            s += f"\t{str(item.TP):{width}}{str(item.FP):{width}}{str(item.FN):{width}}{str(item.TN):{width}}\n"
@ -417,12 +480,14 @@ def save_results(
            s += f"\tRecall(PPR): {recall:.3f}\n"
            s += f"\tFPR: {fpr:.3f}\n"
            s += f"\tF1: {2 * precision * recall / (precision + recall):.3f}\n"
-            s += f"\tTP list: {' # '.join(item.TP_list)}\n"
+            if key != "all":
-            s += f"\tFP list: {' # '.join(item.FP_list)}\n"
+                s += f"\tTP list: {' # '.join(item.TP_list)}\n"
-            s += f"\tFN list: {' # '.join(item.FN_list)}\n"
+                s += f"\tFP list: {' # '.join(item.FP_list)}\n"
                s += f"\tFN list: {' # '.join(item.FN_list)}\n"
            of.write(s + "\n")
            if key == "all":
                logging.info(s)
        of.write(f"\n\n{params.keywords_config}")
    logging.info("Wrote metric stats to {}".format(metric_filename))
@ -439,6 +504,7 @@ def main():
    params.res_dir = params.exp_dir / "kws"
    params.suffix = params.test_set
    if params.iter > 0:
        params.suffix = f"iter-{params.iter}-avg-{params.avg}"
    else:
@ -454,9 +520,12 @@ def main():
        params.suffix += f"-chunk-{params.chunk_size}"
        params.suffix += f"-left-context-{params.left_context_frames}"
-    if params.use_averaged_model:
+    params.suffix += f"-score-{params.keywords_score}"
-        params.suffix += "-use-averaged-model"
+    params.suffix += f"-threshold-{params.keywords_threshold}"
-    params.suffix += f"-keyword-score-{params.keyword_score}"
+    params.suffix += f"-tailing-blanks-{params.num_tailing_blanks}"
    if params.blank_penalty != 0:
        params.suffix += f"-blank-penalty-{params.blank_penalty}"
    params.suffix += f"-version-{params.keywords_version}"
    setup_logger(f"{params.res_dir}/log-decode-{params.suffix}")
    logging.info("Decoding started")
@ -473,18 +542,30 @@ def main():
    logging.info(params)
-    keywords = []
+    phrases = []
-    keywords_id = []
+    token_ids = []
-    with open(params.keyword_file, "r") as f:
+    keywords_scores = []
    keywords_thresholds = []
    keywords_config = []
    with open(params.keywords_file, "r") as f:
        for line in f.readlines():
            keywords_config.append(line)
            score = 0
-            kws = line.strip().upper().split()
+            threshold = 0
-            if kws[-1][0] == ":":
+            keyword = []
-                score = float(kws[-1][1:])
+            words = line.strip().upper().split()
-                kws = kws[0:-1]
+            for word in words:
                word = word.strip()
                if word[0] == ":":
                    score = float(word[1:])
                    continue
                if word[0] == "#":
                    threshold = float(word[1:])
                    continue
                keyword.append(word)
            keyword = "".join(keyword)
            tmp_ids = []
-            kws = "".join(kws)
+            kws_py = text_to_pinyin(keyword, mode=params.pinyin_type)
            kws_py = text_to_pinyin(kws, mode=params.pinyin_type)
            for k in kws_py:
                if k in lexicon.token_table:
                    tmp_ids.append(lexicon.token_table[k])
@ -493,11 +574,23 @@ def main():
                    tmp_ids = []
                    break
            if tmp_ids:
-                logging.info(f"Adding keyword : {kws}")
+                logging.info(f"Adding keyword : {keyword}")
-                keywords.append(kws)
+                phrases.append(keyword)
-                keywords_id.append((tmp_ids, score, kws))
+                token_ids.append(tmp_ids)
-    kws_graph = ContextGraph(context_score=params.keyword_score)
+                keywords_scores.append(score)
-    kws_graph.build(keywords_id)
+                keywords_thresholds.append(threshold)
    params.keywords_config = "".join(keywords_config)
    kws_graph = ContextGraph(
        context_score=params.keywords_score, ac_threshold=params.keywords_threshold
    )
    kws_graph.build(
        token_ids=token_ids,
        phrases=phrases,
        scores=keywords_scores,
        ac_thresholds=keywords_thresholds,
    )
    keywords = set(phrases)
    logging.info("About to create model")
    model = get_model(params)
@ -597,21 +690,51 @@ def main():
            )
        return T > 0
    def select_keywords(c: Cut):
        text = c.supervisions[0].text.strip()
        return text in keywords
    commands_cuts = wenetspeech.test_open_commands_cuts()
    commands_cuts = commands_cuts.filter(select_keywords)
    commands_cuts = commands_cuts.filter(remove_short_utt)
    commands_dl = wenetspeech.test_dataloaders(commands_cuts)
    test_net_cuts = wenetspeech.test_net_cuts()
    test_net_cuts = test_net_cuts.filter(remove_short_utt)
    test_net_dl = wenetspeech.test_dataloaders(test_net_cuts)
-    test_sets = ["COMMANDS"]  # , "TEST_NET"]
+    cn_commands_small_cuts = wenetspeech.cn_speech_commands_small_cuts()
-    test_dls = [commands_dl]  # , test_net_dl]
+    cn_commands_small_cuts = cn_commands_small_cuts.filter(remove_short_utt)
    cn_commands_small_dl = wenetspeech.test_dataloaders(cn_commands_small_cuts)
    cn_commands_large_cuts = wenetspeech.cn_speech_commands_large_cuts()
    cn_commands_large_cuts = cn_commands_large_cuts.filter(remove_short_utt)
    cn_commands_large_dl = wenetspeech.test_dataloaders(cn_commands_large_cuts)
    nihaowenwen_test_cuts = wenetspeech.nihaowenwen_test_cuts()
    nihaowenwen_test_cuts = nihaowenwen_test_cuts.filter(remove_short_utt)
    nihaowenwen_test_dl = wenetspeech.test_dataloaders(nihaowenwen_test_cuts)
    xiaoyun_clean_cuts = wenetspeech.xiaoyun_clean_cuts()
    xiaoyun_clean_cuts = xiaoyun_clean_cuts.filter(remove_short_utt)
    xiaoyun_clean_dl = wenetspeech.test_dataloaders(xiaoyun_clean_cuts)
    xiaoyun_noisy_cuts = wenetspeech.xiaoyun_noisy_cuts()
    xiaoyun_noisy_cuts = xiaoyun_noisy_cuts.filter(remove_short_utt)
    xiaoyun_noisy_dl = wenetspeech.test_dataloaders(xiaoyun_noisy_cuts)
    test_sets = []
    test_dls = []
    if params.test_set == "large":
        test_sets.append("cn_commands_large")
        test_dls.append(cn_commands_large_dl)
    else:
        assert params.test_set == "small", params.test_set
        test_sets += [
            "cn_commands_small",
            "nihaowenwen",
            "xiaoyun_clean",
            "xiaoyun_noisy",
            "test_net",
        ]
        test_dls += [
            cn_commands_small_dl,
            nihaowenwen_test_dl,
            xiaoyun_clean_dl,
            xiaoyun_noisy_dl,
            test_net_dl,
        ]
    for test_set, test_dl in zip(test_sets, test_dls):
        results, metric = decode_dataset(
@ -620,7 +743,8 @@ def main():
            model=model,
            lexicon=lexicon,
            kws_graph=kws_graph,
-            keywords=set(keywords),
+            keywords=keywords,
            test_only_keywords="test_net" not in test_set,
        )
        save_results(
--- a/egs/wenetspeech/KWS/zipformer/export.py
+++ b/egs/wenetspeech/KWS/zipformer/export.py
@ -0,0 +1,526 @@
 #!/usr/bin/env python3
 #
 # Copyright 2021-2023 Xiaomi Corporation (Author: Fangjun Kuang,
 #                                                 Zengwei Yao,
 #                                                 Wei Kang)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # This script converts several saved checkpoints
 # to a single one using model averaging.
 """
 Usage:
 Note: This is an example for the librispeech dataset; if you are using a different
 dataset, you should change the argument values according to your dataset.
 (1) Export to torchscript model using torch.jit.script()
 - For non-streaming model:
 ./zipformer/export.py \
  --exp-dir ./zipformer/exp \
  --tokens data/lang_bpe_500/tokens.txt \
  --epoch 30 \
  --avg 9 \
  --jit 1
 It will generate a file `jit_script.pt` in the given `exp_dir`. You can later
 load it by `torch.jit.load("jit_script.pt")`.
 Check ./jit_pretrained.py for its usage.
 Check https://github.com/k2-fsa/sherpa
 for how to use the exported models outside of icefall.
 - For streaming model:
 ./zipformer/export.py \
  --exp-dir ./zipformer/exp \
  --causal 1 \
  --chunk-size 16 \
  --left-context-frames 128 \
  --tokens data/lang_bpe_500/tokens.txt \
  --epoch 30 \
  --avg 9 \
  --jit 1
 It will generate a file `jit_script_chunk_16_left_128.pt` in the given `exp_dir`.
 You can later load it by `torch.jit.load("jit_script_chunk_16_left_128.pt")`.
 Check ./jit_pretrained_streaming.py for its usage.
 Check https://github.com/k2-fsa/sherpa
 for how to use the exported models outside of icefall.
 (2) Export `model.state_dict()`
 - For non-streaming model:
 ./zipformer/export.py \
  --exp-dir ./zipformer/exp \
  --tokens data/lang_bpe_500/tokens.txt \
  --epoch 30 \
  --avg 9
 - For streaming model:
 ./zipformer/export.py \
  --exp-dir ./zipformer/exp \
  --causal 1 \
  --tokens data/lang_bpe_500/tokens.txt \
  --epoch 30 \
  --avg 9
 It will generate a file `pretrained.pt` in the given `exp_dir`. You can later
 load it by `icefall.checkpoint.load_checkpoint()`.
 - For non-streaming model:
 To use the generated file with `zipformer/decode.py`,
 you can do:
    cd /path/to/exp_dir
    ln -s pretrained.pt epoch-9999.pt
    cd /path/to/egs/librispeech/ASR
    ./zipformer/decode.py \
        --exp-dir ./zipformer/exp \
        --epoch 9999 \
        --avg 1 \
        --max-duration 600 \
        --decoding-method greedy_search \
        --bpe-model data/lang_bpe_500/bpe.model
 - For streaming model:
 To use the generated file with `zipformer/decode.py` and `zipformer/streaming_decode.py`, you can do:
    cd /path/to/exp_dir
    ln -s pretrained.pt epoch-9999.pt
    cd /path/to/egs/librispeech/ASR
    # simulated streaming decoding
    ./zipformer/decode.py \
        --exp-dir ./zipformer/exp \
        --epoch 9999 \
        --avg 1 \
        --max-duration 600 \
        --causal 1 \
        --chunk-size 16 \
        --left-context-frames 128 \
        --decoding-method greedy_search \
        --bpe-model data/lang_bpe_500/bpe.model
    # chunk-wise streaming decoding
    ./zipformer/streaming_decode.py \
        --exp-dir ./zipformer/exp \
        --epoch 9999 \
        --avg 1 \
        --max-duration 600 \
        --causal 1 \
        --chunk-size 16 \
        --left-context-frames 128 \
        --decoding-method greedy_search \
        --bpe-model data/lang_bpe_500/bpe.model
 Check ./pretrained.py for its usage.
 Note: If you don't want to train a model from scratch, we have
 provided one for you. You can get it at
 - non-streaming model:
 https://huggingface.co/Zengwei/icefall-asr-librispeech-zipformer-2023-05-15
 - streaming model:
 https://huggingface.co/Zengwei/icefall-asr-librispeech-streaming-zipformer-2023-05-17
 with the following commands:
    sudo apt-get install git-lfs
    git lfs install
    git clone https://huggingface.co/Zengwei/icefall-asr-librispeech-zipformer-2023-05-15
    git clone https://huggingface.co/Zengwei/icefall-asr-librispeech-streaming-zipformer-2023-05-17
    # You will find the pre-trained models in exp dir
 """
 import argparse
 import logging
 from pathlib import Path
 from typing import List, Tuple
 import k2
 import torch
 from scaling_converter import convert_scaled_to_non_scaled
 from torch import Tensor, nn
 from train import add_model_arguments, get_model, get_params
 from icefall.checkpoint import (
    average_checkpoints,
    average_checkpoints_with_averaged_model,
    find_checkpoints,
    load_checkpoint,
 )
 from icefall.utils import make_pad_mask, num_tokens, str2bool
 def get_parser():
    """Build the command-line parser for the export script.

    Registers the checkpoint-selection options (--epoch, --iter, --avg,
    --use-averaged-model), the experiment directory (--exp-dir), the token
    table path (--tokens), the --jit switch and --context-size, and finally
    appends the model options through ``add_model_arguments``.

    Returns:
      The configured ``argparse.ArgumentParser``; defaults are shown in
      ``--help`` because ``ArgumentDefaultsHelpFormatter`` is used.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    # --- Checkpoint selection -------------------------------------------
    parser.add_argument(
        "--epoch",
        type=int,
        default=30,
        help="""It specifies the checkpoint to use for decoding.
        Note: Epoch counts from 1.
        You can specify --avg to use more checkpoints for model averaging.""",
    )
    parser.add_argument(
        "--iter",
        type=int,
        default=0,
        help="""If positive, --epoch is ignored and it
        will use the checkpoint exp_dir/checkpoint-iter.pt.
        You can specify --avg to use more checkpoints for model averaging.
        """,
    )
    parser.add_argument(
        "--avg",
        type=int,
        default=9,
        help="Number of checkpoints to average. Automatically select "
        "consecutive checkpoints before the checkpoint specified by "
        "'--epoch' and '--iter'",
    )
    parser.add_argument(
        "--use-averaged-model",
        type=str2bool,
        default=True,
        help="Whether to load averaged model. Currently it only supports "
        "using --epoch. If True, it would decode with the averaged model "
        "over the epoch range from `epoch-avg` (excluded) to `epoch`."
        "Actually only the models with epoch number of `epoch-avg` and "
        "`epoch` are loaded for averaging. ",
    )
    # --- Input / output locations ---------------------------------------
    parser.add_argument(
        "--exp-dir",
        type=str,
        default="zipformer/exp",
        help="""It specifies the directory where all training related
        files, e.g., checkpoints, log, etc, are saved
        """,
    )
    parser.add_argument(
        "--tokens",
        type=str,
        default="data/lang_bpe_500/tokens.txt",
        help="Path to the tokens.txt",
    )
    # --- Export format ---------------------------------------------------
    parser.add_argument(
        "--jit",
        type=str2bool,
        default=False,
        help="""True to save a model after applying torch.jit.script.
        It will generate a file named jit_script.pt.
        Check ./jit_pretrained.py for how to use it.
        """,
    )
    parser.add_argument(
        "--context-size",
        type=int,
        default=2,
        help="The context size in the decoder. 1 means bigram; 2 means tri-gram",
    )
    # Model-architecture options are shared with train.py.
    add_model_arguments(parser)
    return parser
 class EncoderModel(nn.Module):
    """A wrapper for encoder and encoder_embed

    Bundles the two modules so that a single ``forward`` call runs the
    embedding front-end followed by the encoder (used as ``model.encoder``
    for the non-causal TorchScript export in ``main``).
    """
    def __init__(self, encoder: nn.Module, encoder_embed: nn.Module) -> None:
        super().__init__()
        self.encoder = encoder
        self.encoder_embed = encoder_embed
    def forward(
        self, features: Tensor, feature_lengths: Tensor
    ) -> Tuple[Tensor, Tensor]:
        """
        Args:
            features: (N, T, C)
            feature_lengths: (N,)
        Returns:
            A tuple ``(encoder_out, encoder_out_lens)``; ``encoder_out`` is
            permuted back to batch-first layout before being returned.
        """
        x, x_lens = self.encoder_embed(features, feature_lengths)
        # The padding mask is built from the lengths returned by encoder_embed.
        src_key_padding_mask = make_pad_mask(x_lens)
        x = x.permute(1, 0, 2)  # (N, T, C) -> (T, N, C)
        encoder_out, encoder_out_lens = self.encoder(x, x_lens, src_key_padding_mask)
        encoder_out = encoder_out.permute(1, 0, 2)  # (T, N, C) ->(N, T, C)
        return encoder_out, encoder_out_lens
 class StreamingEncoderModel(nn.Module):
    """A wrapper for encoder and encoder_embed

    Streaming counterpart of the non-streaming wrapper: ``forward`` calls the
    ``streaming_forward`` methods of both wrapped modules and threads a flat
    list of cached state tensors through successive chunks.
    """
    def __init__(self, encoder: nn.Module, encoder_embed: nn.Module) -> None:
        super().__init__()
        # Only a single (chunk_size, left_context_frames) configuration is
        # supported for export.
        assert len(encoder.chunk_size) == 1, encoder.chunk_size
        assert len(encoder.left_context_frames) == 1, encoder.left_context_frames
        self.chunk_size = encoder.chunk_size[0]
        self.left_context_len = encoder.left_context_frames[0]
        # The encoder_embed subsample features (T - 7) // 2
        # The ConvNeXt module needs (7 - 1) // 2 = 3 frames of right padding after subsampling
        # NOTE(review): pad_length is not read anywhere else in this class;
        # presumably it is consumed by the code that loads the exported model.
        self.pad_length = 7 + 2 * 3
        self.encoder = encoder
        self.encoder_embed = encoder_embed
    def forward(
        self, features: Tensor, feature_lengths: Tensor, states: List[Tensor]
    ) -> Tuple[Tensor, Tensor, List[Tensor]]:
        """Streaming forward for encoder_embed and encoder.
        Args:
            features: (N, T, C)
            feature_lengths: (N,)
            states: a list of Tensors
        Returns encoder outputs, output lengths, and updated states.
        """
        chunk_size = self.chunk_size
        left_context_len = self.left_context_len
        # states[-2] holds the cached left padding of encoder_embed.
        cached_embed_left_pad = states[-2]
        x, x_lens, new_cached_embed_left_pad = self.encoder_embed.streaming_forward(
            x=features,
            x_lens=feature_lengths,
            cached_left_pad=cached_embed_left_pad,
        )
        # Each call must yield exactly one chunk after the embedding.
        assert x.size(1) == chunk_size, (x.size(1), chunk_size)
        src_key_padding_mask = make_pad_mask(x_lens)
        # processed_mask is used to mask out initial states
        processed_mask = torch.arange(left_context_len, device=x.device).expand(
            x.size(0), left_context_len
        )
        processed_lens = states[-1]  # (batch,)
        # (batch, left_context_size)
        processed_mask = (processed_lens.unsqueeze(1) <= processed_mask).flip(1)
        # Update processed lengths
        new_processed_lens = processed_lens + x_lens
        # (batch, left_context_size + chunk_size)
        src_key_padding_mask = torch.cat([processed_mask, src_key_padding_mask], dim=1)
        x = x.permute(1, 0, 2)  # (N, T, C) -> (T, N, C)
        # Everything except the last two entries belongs to the encoder.
        encoder_states = states[:-2]
        (
            encoder_out,
            encoder_out_lens,
            new_encoder_states,
        ) = self.encoder.streaming_forward(
            x=x,
            x_lens=x_lens,
            states=encoder_states,
            src_key_padding_mask=src_key_padding_mask,
        )
        encoder_out = encoder_out.permute(1, 0, 2)  # (T, N, C) ->(N, T, C)
        # Re-assemble the state list in the same layout as the input `states`.
        new_states = new_encoder_states + [
            new_cached_embed_left_pad,
            new_processed_lens,
        ]
        return encoder_out, encoder_out_lens, new_states
    @torch.jit.export
    def get_init_states(
        self,
        batch_size: int = 1,
        device: torch.device = torch.device("cpu"),
    ) -> List[torch.Tensor]:
        """
        Returns a list of cached tensors of all encoder layers. For layer-i, states[i*6:(i+1)*6]
        is (cached_key, cached_nonlin_attn, cached_val1, cached_val2, cached_conv1, cached_conv2).
        states[-2] is the cached left padding for ConvNeXt module,
        of shape (batch_size, num_channels, left_pad, num_freqs)
        states[-1] is processed_lens of shape (batch,), which records the number
        of processed frames (at 50hz frame rate, after encoder_embed) for each sample in batch.
        """
        states = self.encoder.get_init_states(batch_size, device)
        embed_states = self.encoder_embed.get_init_states(batch_size, device)
        states.append(embed_states)
        # No frames have been processed yet, so the counters start at zero.
        processed_lens = torch.zeros(batch_size, dtype=torch.int32, device=device)
        states.append(processed_lens)
        return states
@torch.no_grad()
 def main():
    """Load (and optionally average) checkpoints, then export the model.

    Steps, in order:
      1. Parse the command line and merge it into the training params.
      2. Read the token table to set ``blank_id`` and ``vocab_size``.
      3. Build the model and load weights from ``exp_dir`` according to
         --iter / --epoch / --avg / --use-averaged-model.
      4. If --jit is true, wrap the encoder, script the model with
         ``torch.jit.script`` and save it in ``exp_dir``; otherwise save
         ``{"model": state_dict}`` as ``exp_dir/pretrained.pt``.

    Raises:
      ValueError: if --iter is used and no, or fewer than required,
        checkpoints are found.
    """
    args = get_parser().parse_args()
    args.exp_dir = Path(args.exp_dir)
    params = get_params()
    params.update(vars(args))
    # Export always runs on CPU.
    device = torch.device("cpu")
    # if torch.cuda.is_available():
    #     device = torch.device("cuda", 0)
    logging.info(f"device: {device}")
    token_table = k2.SymbolTable.from_file(params.tokens)
    params.blank_id = token_table["<blk>"]
    params.vocab_size = num_tokens(token_table) + 1
    logging.info(params)
    logging.info("About to create model")
    model = get_model(params)
    if not params.use_averaged_model:
        # Plain averaging of the parameters stored in the selected checkpoints.
        if params.iter > 0:
            # NOTE(review): the negative iteration presumably selects checkpoints
            # at or before --iter, newest first — confirm against find_checkpoints.
            filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
                : params.avg
            ]
            if len(filenames) == 0:
                raise ValueError(
                    f"No checkpoints found for"
                    f" --iter {params.iter}, --avg {params.avg}"
                )
            elif len(filenames) < params.avg:
                raise ValueError(
                    f"Not enough checkpoints ({len(filenames)}) found for"
                    f" --iter {params.iter}, --avg {params.avg}"
                )
            logging.info(f"averaging {filenames}")
            model.load_state_dict(average_checkpoints(filenames, device=device))
        elif params.avg == 1:
            load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
        else:
            start = params.epoch - params.avg + 1
            filenames = []
            for i in range(start, params.epoch + 1):
                # Epochs count from 1, so skip non-positive indices.
                if i >= 1:
                    filenames.append(f"{params.exp_dir}/epoch-{i}.pt")
            logging.info(f"averaging {filenames}")
            model.load_state_dict(average_checkpoints(filenames, device=device))
    else:
        # Averaged-model path: only the two end-point checkpoint files are
        # passed to average_checkpoints_with_averaged_model.
        if params.iter > 0:
            # One extra checkpoint is needed because the start point is excluded.
            filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
                : params.avg + 1
            ]
            if len(filenames) == 0:
                raise ValueError(
                    f"No checkpoints found for"
                    f" --iter {params.iter}, --avg {params.avg}"
                )
            elif len(filenames) < params.avg + 1:
                raise ValueError(
                    f"Not enough checkpoints ({len(filenames)}) found for"
                    f" --iter {params.iter}, --avg {params.avg}"
                )
            filename_start = filenames[-1]
            filename_end = filenames[0]
            logging.info(
                "Calculating the averaged model over iteration checkpoints"
                f" from {filename_start} (excluded) to {filename_end}"
            )
            model.load_state_dict(
                average_checkpoints_with_averaged_model(
                    filename_start=filename_start,
                    filename_end=filename_end,
                    device=device,
                )
            )
        elif params.avg == 1:
            load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
        else:
            assert params.avg > 0, params.avg
            start = params.epoch - params.avg
            assert start >= 1, start
            filename_start = f"{params.exp_dir}/epoch-{start}.pt"
            filename_end = f"{params.exp_dir}/epoch-{params.epoch}.pt"
            logging.info(
                f"Calculating the averaged model over epoch range from "
                f"{start} (excluded) to {params.epoch}"
            )
            model.load_state_dict(
                average_checkpoints_with_averaged_model(
                    filename_start=filename_start,
                    filename_end=filename_end,
                    device=device,
                )
            )
    model.eval()
    if params.jit is True:
        convert_scaled_to_non_scaled(model, inplace=True)
        # We won't use the forward() method of the model in C++, so just ignore
        # it here.
        # Otherwise, one of its arguments is a ragged tensor and is not
        # torch scriptable.
        model.__class__.forward = torch.jit.ignore(model.__class__.forward)
        # Wrap encoder and encoder_embed as a module
        if params.causal:
            model.encoder = StreamingEncoderModel(model.encoder, model.encoder_embed)
            chunk_size = model.encoder.chunk_size
            left_context_len = model.encoder.left_context_len
            # The streaming configuration is encoded in the output file name.
            filename = f"jit_script_chunk_{chunk_size}_left_{left_context_len}.pt"
        else:
            model.encoder = EncoderModel(model.encoder, model.encoder_embed)
            filename = "jit_script.pt"
        logging.info("Using torch.jit.script")
        model = torch.jit.script(model)
        model.save(str(params.exp_dir / filename))
        # Note: only the bare file name (relative to exp_dir) is logged here.
        logging.info(f"Saved to {filename}")
    else:
        logging.info("Not using torchscript. Export model.state_dict()")
        # Save it using a format so that it can be loaded
        # by :func:`load_checkpoint`
        filename = params.exp_dir / "pretrained.pt"
        torch.save({"model": model.state_dict()}, str(filename))
        logging.info(f"Saved to {filename}")
 if __name__ == "__main__":
    # Configure root logging (timestamp, level, file:line, message) at INFO
    # level before running the export.
    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    logging.basicConfig(format=formatter, level=logging.INFO)
    main()
--- a/egs/wenetspeech/KWS/zipformer/train_pinyin.py
+++ b/egs/wenetspeech/KWS/zipformer/train_pinyin.py
@ -3,6 +3,7 @@
 #                                                       Wei Kang,
 #                                                       Mingshuang Luo,
 #                                                       Zengwei Yao,
 #                                                       Yifan Yang,
 #                                                       Daniel Povey)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
@ -23,29 +24,44 @@ Usage:
 export CUDA_VISIBLE_DEVICES="0,1,2,3"
-# For non-streaming model training:
+# For non-streaming model finetuning:
-./zipformer/train.py \
+./zipformer/finetune.py \
  --world-size 4 \
-  --num-epochs 30 \
+  --num-epochs 10 \
  --start-epoch 1 \
  --use-fp16 1 \
  --exp-dir zipformer/exp \
  --max-duration 1000
-# For streaming model training:
+# For non-streaming model finetuning with mux (original dataset):
-./zipformer/train.py \
+./zipformer/finetune.py \
  --world-size 4 \
-  --num-epochs 30 \
+  --num-epochs 10 \
  --start-epoch 1 \
  --use-mux 1 \
  --use-fp16 1 \
  --exp-dir zipformer/exp \
  --max-duration 1000
 # For streaming model finetuning:
 ./zipformer/finetune.py \
  --world-size 4 \
  --num-epochs 10 \
  --start-epoch 1 \
  --use-fp16 1 \
  --exp-dir zipformer/exp \
  --causal 1 \
  --max-duration 1000
-It supports training with:
+# For streaming model finetuning with mux (original dataset):
-  - transducer loss (default), with `--use-transducer True --use-ctc False`
+./zipformer/finetune.py \
-  - ctc loss (not recommended), with `--use-transducer False --use-ctc True`
+  --world-size 4 \
-  - transducer loss & ctc loss, with `--use-transducer True --use-ctc True`
+  --num-epochs 10 \
  --start-epoch 1 \
  --use-fp16 1 \
  --exp-dir zipformer/exp \
  --causal 1 \
  --max-duration 1000
 """
@ -55,7 +71,7 @@ import logging
 import warnings
 from pathlib import Path
 from shutil import copyfile
-from typing import Any, Dict, Optional, Tuple, Union
+from typing import Any, Dict, List, Optional, Tuple, Union
 import k2
 import optim
@ -63,12 +79,10 @@ import torch
 import torch.multiprocessing as mp
 import torch.nn as nn
 from asr_datamodule import WenetSpeechAsrDataModule
-from lhotse.cut import Cut
+from lhotse.cut import Cut, CutSet
 from lhotse.dataset.sampling.base import CutSampler
 from lhotse.utils import fix_random_seed
 from model import AsrModel
 from optim import Eden, ScaledAdam
 from scaling import ScheduledFloat
 from torch import Tensor
 from torch.cuda.amp import GradScaler
 from torch.nn.parallel import DistributedDataParallel as DDP
@ -76,7 +90,7 @@ from torch.utils.tensorboard import SummaryWriter
 from icefall import diagnostics
 from icefall.char_graph_compiler import CharCtcTrainingGraphCompiler
-from icefall.checkpoint import remove_checkpoints
+from icefall.checkpoint import load_checkpoint, remove_checkpoints
 from icefall.checkpoint import save_checkpoint as save_checkpoint_impl
 from icefall.checkpoint import (
    save_checkpoint_with_global_batch_idx,
@ -109,9 +123,50 @@ from train import (
    set_batch_count,
 )
 LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
 def add_finetune_arguments(parser: argparse.ArgumentParser):
    """Register the fine-tuning specific command line options on ``parser``.

    The options added are ``--use-mux``, ``--init-modules``,
    ``--finetune-ckpt`` and ``--continue-finetune``.

    Args:
      parser:
        The argument parser that is extended in place.
    """
    # NOTE: argparse expands every help string with the ``%`` operator (that
    # is how ``%(default)s`` works), so a literal percent sign must be written
    # as ``%%``; an unescaped "5% of" makes ``--help`` raise a TypeError.
    parser.add_argument(
        "--use-mux",
        type=str2bool,
        default=False,
        help="""
        Whether to adapt. If true, we will mix 10%% of the new data
        with 90%% of the original data to fine-tune.
        """,
    )
    parser.add_argument(
        "--init-modules",
        type=str,
        default=None,
        help="""
        Modules to be initialized. It matches all parameters starting with
        a specific key. The keys are given as a comma-separated list. If None,
        all modules will be initialised. For example, if you only want to
        initialise all parameters starting with "encoder", use "encoder";
        if you want to initialise parameters starting with encoder or joiner,
        use "encoder,joiner".
        """,
    )
    parser.add_argument(
        "--finetune-ckpt",
        type=str,
        default=None,
        help="Fine-tuning from which checkpoint (a path to a .pt file)",
    )
    parser.add_argument(
        "--continue-finetune",
        type=str2bool,
        default=False,
        help="Continue finetuning or finetune from pre-trained model",
    )
 def get_parser():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
@ -148,10 +203,58 @@ def get_parser():
    add_training_arguments(parser)
    add_model_arguments(parser)
    add_finetune_arguments(parser)
    return parser
 def load_model_params(
    ckpt: str,
    model: nn.Module,
    init_modules: Optional[List[str]] = None,
    strict: bool = True,
 ):
    """Initialise ``model`` from the parameters stored in a checkpoint file.

    Checkpoints whose parameter names carry the ``module.`` prefix (as
    written when a DDP-wrapped model is saved) are accepted both when loading
    the whole model and when loading only selected sub-modules.

    Args:
      ckpt:
        Path to the checkpoint. It is read with ``torch.load`` and its
        ``"model"`` entry is used as the source state dict.
      model:
        The model whose parameters are overwritten in place.
      init_modules:
        Parameter-name prefixes (e.g. ``["encoder", "joiner"]``) selecting
        the sub-modules to initialise. If None or empty, the whole model is
        loaded from the checkpoint.
      strict:
        Forwarded to ``model.load_state_dict``.
    Returns:
      None.
    """
    logging.info(f"Loading checkpoint from {ckpt}")
    checkpoint = torch.load(ckpt, map_location="cpu")
    src_state_dict = checkpoint["model"]

    # Normalise DDP checkpoints once, so that both code paths below can work
    # with plain (un-prefixed) parameter names.
    ddp_prefix = "module."
    if next(iter(src_state_dict), "").startswith(ddp_prefix):
        logging.info("Loading checkpoint saved by DDP")
        src_state_dict = {
            (k[len(ddp_prefix) :] if k.startswith(ddp_prefix) else k): v
            for k, v in src_state_dict.items()
        }

    # if module list is empty, load the whole model from ckpt
    if not init_modules:
        model.load_state_dict(src_state_dict, strict=strict)
        return None

    dst_state_dict = model.state_dict()
    for module in init_modules:
        prefix = module.strip() + "."
        logging.info(f"Loading parameters starting with prefix {module}")
        src_keys = {k for k in src_state_dict if k.startswith(prefix)}
        dst_keys = {k for k in dst_state_dict if k.startswith(prefix)}
        # two sets should match exactly
        assert src_keys == dst_keys, (
            f"Parameters under prefix {prefix!r} differ between checkpoint and "
            f"model: {sorted(src_keys ^ dst_keys)}"
        )
        if not src_keys:
            # Most likely a typo in the module name; nothing will be loaded.
            logging.warning(f"No parameters found starting with prefix {prefix}")
        for key in src_keys:
            dst_state_dict[key] = src_state_dict[key]
    model.load_state_dict(dst_state_dict, strict=strict)
    return None
 def compute_loss(
    params: AttributeDict,
    model: Union[nn.Module, DDP],
@ -160,7 +263,7 @@ def compute_loss(
    is_training: bool,
 ) -> Tuple[Tensor, MetricsTracker]:
    """
-    Compute CTC loss given the model and its inputs.
+    Compute loss given the model and its inputs.
    Args:
      params:
@ -191,10 +294,10 @@ def compute_loss(
    texts = batch["supervisions"]["text"]
    y = graph_compiler.texts_to_ids(texts, sep="/")
-    y = k2.RaggedTensor(y).to(device)
+    y = k2.RaggedTensor(y)
    with torch.set_grad_enabled(is_training):
-        simple_loss, pruned_loss, _ = model(
+        simple_loss, pruned_loss, ctc_loss = model(
            x=feature,
            x_lens=feature_lens,
            y=y,
@ -203,21 +306,26 @@ def compute_loss(
            lm_scale=params.lm_scale,
        )
-        s = params.simple_loss_scale
+        loss = 0.0
        # take down the scale on the simple loss from 1.0 at the start
        # to params.simple_loss scale by warm_step.
        simple_loss_scale = (
            s
            if batch_idx_train >= warm_step
            else 1.0 - (batch_idx_train / warm_step) * (1.0 - s)
        )
        pruned_loss_scale = (
            1.0
            if batch_idx_train >= warm_step
            else 0.1 + 0.9 * (batch_idx_train / warm_step)
        )
-        loss = simple_loss_scale * simple_loss + pruned_loss_scale * pruned_loss
+        if params.use_transducer:
            s = params.simple_loss_scale
            # take down the scale on the simple loss from 1.0 at the start
            # to params.simple_loss scale by warm_step.
            simple_loss_scale = (
                s
                if batch_idx_train >= warm_step
                else 1.0 - (batch_idx_train / warm_step) * (1.0 - s)
            )
            pruned_loss_scale = (
                1.0
                if batch_idx_train >= warm_step
                else 0.1 + 0.9 * (batch_idx_train / warm_step)
            )
            loss += simple_loss_scale * simple_loss + pruned_loss_scale * pruned_loss
        if params.use_ctc:
            loss += params.ctc_loss_scale * ctc_loss
    assert loss.requires_grad == is_training
@ -228,8 +336,11 @@ def compute_loss(
    # Note: We use reduction=sum while computing the loss.
    info["loss"] = loss.detach().cpu().item()
-    info["simple_loss"] = simple_loss.detach().cpu().item()
+    if params.use_transducer:
-    info["pruned_loss"] = pruned_loss.detach().cpu().item()
+        info["simple_loss"] = simple_loss.detach().cpu().item()
        info["pruned_loss"] = pruned_loss.detach().cpu().item()
    if params.use_ctc:
        info["ctc_loss"] = ctc_loss.detach().cpu().item()
    return loss, info
@ -317,8 +428,6 @@ def train_one_epoch(
    tot_loss = MetricsTracker()
    cur_batch_idx = params.get("cur_batch_idx", 0)
    saved_bad_model = False
    def save_bad_model(suffix: str = ""):
@ -336,10 +445,7 @@ def train_one_epoch(
    for batch_idx, batch in enumerate(train_dl):
        if batch_idx % 10 == 0:
-            set_batch_count(model, get_adjusted_batch_count(params))
+            set_batch_count(model, get_adjusted_batch_count(params) + 100000)
        if batch_idx < cur_batch_idx:
            continue
        cur_batch_idx = batch_idx
        params.batch_idx_train += 1
        batch_size = len(batch["supervisions"]["text"])
@ -359,6 +465,7 @@ def train_one_epoch(
            # NOTE: We use reduction==sum and loss is computed over utterances
            # in the batch and there is no normalization to it so far.
            scaler.scale(loss).backward()
            scheduler.step_batch(params.batch_idx_train)
            scaler.step(optimizer)
@ -387,7 +494,6 @@ def train_one_epoch(
            params.batch_idx_train > 0
            and params.batch_idx_train % params.save_every_n == 0
        ):
            params.cur_batch_idx = batch_idx
            save_checkpoint_with_global_batch_idx(
                out_dir=params.exp_dir,
                global_batch_idx=params.batch_idx_train,
@ -400,7 +506,6 @@ def train_one_epoch(
                scaler=scaler,
                rank=rank,
            )
            del params.cur_batch_idx
            remove_checkpoints(
                out_dir=params.exp_dir,
                topk=params.keep_last_k,
@ -532,14 +637,20 @@ def run(rank, world_size, args):
    assert params.save_every_n >= params.average_period
    model_avg: Optional[nn.Module] = None
    if rank == 0:
        # model_avg is only used with rank 0
        model_avg = copy.deepcopy(model).to(torch.float64)
-    assert params.start_epoch > 0, params.start_epoch
+    if params.continue_finetune:
-    checkpoints = load_checkpoint_if_available(
+        assert params.start_epoch > 0, params.start_epoch
-        params=params, model=model, model_avg=model_avg
+        checkpoints = load_checkpoint_if_available(
-    )
+            params=params, model=model, model_avg=model_avg
        )
    else:
        modules = params.init_modules.split(",") if params.init_modules else None
        checkpoints = load_model_params(
            ckpt=params.finetune_ckpt, model=model, init_modules=modules
        )
        if rank == 0:
            # model_avg is only used with rank 0
            model_avg = copy.deepcopy(model).to(torch.float64)
    model.to(device)
    if world_size > 1:
@ -552,7 +663,7 @@ def run(rank, world_size, args):
        clipping_scale=2.0,
    )
-    scheduler = Eden(optimizer, params.lr_batches, params.lr_epochs)
+    scheduler = Eden(optimizer, params.lr_batches, params.lr_epochs, warmup_start=1.0)
    if checkpoints and "optimizer" in checkpoints:
        logging.info("Loading optimizer state dict")
@ -568,33 +679,31 @@ def run(rank, world_size, args):
    if params.print_diagnostics:
        opts = diagnostics.TensorDiagnosticOptions(
-            2**22
+            512
        )  # allow 4 megabytes per sub-module
        diagnostic = diagnostics.attach_diagnostics(model, opts)
    if params.inf_check:
        register_inf_check_hooks(model)
    def remove_short_utt(c: Cut):
        if c.duration > 15:
            return False
        # In ./zipformer.py, the conv module uses the following expression
        # for subsampling
        T = ((c.num_frames - 7) // 2 + 1) // 2
        return T > 0
    wenetspeech = WenetSpeechAsrDataModule(args)
-    train_cuts = wenetspeech.train_cuts()
+    if params.use_mux:
-
+        train_cuts = CutSet.mux(
-    def remove_short_and_long_utt(c: Cut):
+            wenetspeech.train_cuts(),
-        # Keep only utterances with duration between 1 second and 20 seconds
+            wenetspeech.nihaowenwen_train_cuts(),
-        #
+            weights=[0.9, 0.1],
-        # Caution: There is a reason to select 20.0 here. Please see
+        )
-        # ../local/display_manifest_statistics.py
+    else:
-        #
+        train_cuts = wenetspeech.nihaowenwen_train_cuts()
        # You should use ../local/display_manifest_statistics.py to get
        # an utterance duration distribution for your dataset to select
        # the threshold
        if c.duration < 1.0 or c.duration > 15.0:
            # logging.warning(
            #     f"Exclude cut with ID {c.id} from training. Duration: {c.duration}"
            # )
            return False
        return True
    def encode_text(c: Cut):
        # Text normalize for each sample
@ -605,7 +714,7 @@ def run(rank, world_size, args):
        c.supervisions[0].text = text
        return c
-    train_cuts = train_cuts.filter(remove_short_and_long_utt)
+    train_cuts = train_cuts.filter(remove_short_utt)
    train_cuts = train_cuts.map(encode_text)
    if params.start_batch > 0 and checkpoints and "sampler" in checkpoints:
@ -619,19 +728,19 @@ def run(rank, world_size, args):
        train_cuts, sampler_state_dict=sampler_state_dict
    )
-    valid_cuts = wenetspeech.valid_cuts()
+    valid_cuts = wenetspeech.nihaowenwen_dev_cuts()
    valid_cuts = valid_cuts.filter(remove_short_utt)
    valid_cuts = valid_cuts.map(encode_text)
    valid_dl = wenetspeech.valid_dataloaders(valid_cuts)
-    if not params.print_diagnostics:
+    if not params.print_diagnostics and params.scan_for_oom_batches:
-        # scan_pessimistic_batches_for_oom(
+        scan_pessimistic_batches_for_oom(
-        #    model=model,
+            model=model,
-        #    train_dl=train_dl,
+            train_dl=train_dl,
-        #    optimizer=optimizer,
+            optimizer=optimizer,
-        #    graph_compiler=graph_compiler,
+            graph_compiler=graph_compiler,
-        #    params=params,
+            params=params,
-        # )
+        )
        pass
    scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0)
    if checkpoints and "grad_scaler" in checkpoints:
@ -689,7 +798,6 @@ def main():
    parser = get_parser()
    WenetSpeechAsrDataModule.add_arguments(parser)
    args = parser.parse_args()
    args.lang_dir = Path(args.lang_dir)
    args.exp_dir = Path(args.exp_dir)
    world_size = args.world_size
@ -701,4 +809,6 @@ def main():
 if __name__ == "__main__":
    torch.set_num_threads(1)
    torch.set_num_interop_threads(1)
    main()
--- a/egs/wenetspeech/KWS/zipformer/scaling_converter.py
+++ b/egs/wenetspeech/KWS/zipformer/scaling_converter.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/zipformer/scaling_converter.py
--- a/egs/wenetspeech/KWS/zipformer/train.py
+++ b/egs/wenetspeech/KWS/zipformer/train.py
--- a/icefall/utils.py
+++ b/icefall/utils.py
@ -1609,9 +1609,9 @@ def text_to_pinyin(
        The input Chinese text.
      mode:
        The style of the output pinyin, should be:
-          full_with_tone : zhong1 guo2
+          full_with_tone : zhōng guó
          full_no_tone : zhong guo
-          partial_with_tone : zh ong1 g uo2
+          partial_with_tone : zh ōng g uó
          partial_no_tone : zh ong g uo
      errors:
        How to handle the characters (latin) that has no pinyin.
		`@ -0,0 +1 @@`
							`../../../librispeech/ASR/zipformer/scaling_converter.py`