Merge branch 'k2-fsa:master' into dev/k2ssl

2025-12-11 06:55:27 +00:00 · 2025-02-04 14:22:16 +08:00 · 2025-02-04 14:22:16 +08:00 · 279d34b7f4
commit 279d34b7f4
parent cf5fd1a2e0 da597ad782
54 changed files with 7047 additions and 104 deletions
--- a/egs/multi_ja_en/ASR/README.md
+++ b/egs/multi_ja_en/ASR/README.md
@ -0,0 +1,17 @@
 # Introduction
 A bilingual Japanese-English ASR model trained with ReazonSpeech, created by the ReazonSpeech developers.
 **ReazonSpeech** is an open-source dataset that contains a diverse set of natural Japanese speech, collected from terrestrial television streams. It contains more than 35,000 hours of audio.
 # Included Training Sets
 1. LibriSpeech (English)
 2. ReazonSpeech (Japanese)
 |Dataset| Number of hours| URL|
 |---|---:|---|
 |**TOTAL**|35,960|---|
 |LibriSpeech|960|https://www.openslr.org/12/|
 |ReazonSpeech (all) |35,000|https://huggingface.co/datasets/reazon-research/reazonspeech|
--- a/egs/multi_ja_en/ASR/RESULTS.md
+++ b/egs/multi_ja_en/ASR/RESULTS.md
@ -0,0 +1,52 @@
 ## Results
 ### Zipformer
 #### Non-streaming
 The training command is:
 ```shell
 ./zipformer/train.py \
  --bilingual 1 \
  --world-size 4 \
  --num-epochs 30 \
  --start-epoch 1 \
  --use-fp16 1 \
  --exp-dir zipformer/exp \
  --max-duration 600
 ```
 The decoding command is:
 ```shell
 ./zipformer/decode.py \
    --epoch 28 \
    --avg 15 \
    --exp-dir ./zipformer/exp \
    --max-duration 600 \
    --decoding-method greedy_search
 ```
 To export the model with onnx:
 ```shell
 ./zipformer/export-onnx.py   --tokens data/lang_bbpe_2000/tokens.txt   --use-averaged-model 0   --epoch 35   --avg 1   --exp-dir zipformer/exp   --num-encoder-layers "2,2,3,4,3,2"   --downsampling-factor "1,2,4,8,4,2"   --feedforward-dim "512,768,1024,1536,1024,768"   --num-heads "4,4,4,8,4,4"   --encoder-dim "192,256,384,512,384,256"   --query-head-dim 32   --value-head-dim 12   --pos-head-dim 4   --pos-dim 48   --encoder-unmasked-dim "192,192,256,256,256,192"   --cnn-module-kernel "31,31,15,15,15,31"   --decoder-dim 512   --joiner-dim 512   --causal False   --chunk-size "16,32,64,-1"   --left-context-frames "64,128,256,-1"   --fp16 True
 ```
 Word Error Rates (WERs) listed below:
 |       Datasets       | ReazonSpeech |  ReazonSpeech |     LibriSpeech    |    LibriSpeech    |
 |----------------------|--------------|---------------|--------------------|-------------------|
 |   Zipformer WER (%)  |     dev      |     test      |     test-clean     |    test-other     |
 |     greedy_search    |     5.9      |     4.07      |        3.46        |       8.35        |
 | modified_beam_search |    4.87      |     3.61      |        3.28        |       8.07        |
 Character Error Rates (CERs) for Japanese listed below:
 |   Decoding Method    | In-Distribution CER | JSUT | CommonVoice | TEDx  |
 | :------------------: | :-----------------: | :--: | :---------: | :---: | 
 |    greedy search     |        12.56        | 6.93 |    9.75     | 9.67  | 
 | modified beam search |        11.59        | 6.97 |    9.55     | 9.51  | 
 The pre-trained model can be found here: https://huggingface.co/reazon-research/reazonspeech-k2-v2-ja-en/tree/main
--- a/egs/multi_ja_en/ASR/local/compute_fbank_reazonspeech.py
+++ b/egs/multi_ja_en/ASR/local/compute_fbank_reazonspeech.py
@ -0,0 +1,146 @@
 #!/usr/bin/env python3
 # Copyright    2023  The University of Electro-Communications  (Author: Teo Wen Shen)  # noqa
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import argparse
 import logging
 import os
 from pathlib import Path
 from typing import List, Tuple
 import torch
 # fmt: off
 from lhotse import (  # See the following for why LilcomChunkyWriter is preferred; https://github.com/k2-fsa/icefall/pull/404; https://github.com/lhotse-speech/lhotse/pull/527
    CutSet,
    Fbank,
    FbankConfig,
    LilcomChunkyWriter,
    RecordingSet,
    SupervisionSet,
 )
 # fmt: on
 # Torch's multithreaded behavior needs to be disabled, otherwise
 # it wastes a lot of CPU and slows things down.
 # Do this at module level, outside of main(), so that it also takes effect
 # when main() is not invoked (e.g. when spawning subprocesses).
 torch.set_num_threads(1)
 torch.set_num_interop_threads(1)
 # NOTE(review): RNG_SEED and concat_params are not referenced anywhere in the
 # visible part of this script -- confirm whether they are still needed.
 RNG_SEED = 42
 concat_params = {"gap": 1.0, "maxlen": 10.0}
 def make_cutset_blueprints(
    manifest_dir: Path,
 ) -> List[Tuple[str, CutSet]]:
    """Build one CutSet per ReazonSpeech split from its manifests.
    Args:
      manifest_dir:
        Directory holding ``reazonspeech_recordings_{part}.jsonl.gz`` and
        ``reazonspeech_supervisions_{part}.jsonl.gz`` for every split.
    Returns:
      A list of ``(part, cut_set)`` tuples in the order test, dev, train.
    """
    cut_sets = []
    # The three splits differ only by name, so build them in a single loop
    # instead of three copy-pasted stanzas. The order is kept as test, dev,
    # train.
    for part in ("test", "dev", "train"):
        logging.info(f"Creating {part} cuts.")
        recordings = RecordingSet.from_file(
            manifest_dir / f"reazonspeech_recordings_{part}.jsonl.gz"
        )
        supervisions = SupervisionSet.from_file(
            manifest_dir / f"reazonspeech_supervisions_{part}.jsonl.gz"
        )
        cut_sets.append(
            (
                part,
                CutSet.from_manifests(
                    recordings=recordings,
                    supervisions=supervisions,
                ),
            )
        )
    return cut_sets
 def get_args():
    """Parse the command-line arguments of this script.
    Returns:
      The parsed namespace; ``manifest_dir`` is a ``Path``.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    # The manifest directory is joined with file names right after parsing,
    # so a missing value must be rejected here with a clear usage error
    # rather than surface later as a TypeError on ``None / str``.
    parser.add_argument(
        "-m",
        "--manifest-dir",
        type=Path,
        required=True,
        help="Directory containing the ReazonSpeech manifests.",
    )
    return parser.parse_args()
 def main():
    """Compute 80-bin fbank features for every ReazonSpeech split.

    Features and the resulting cut manifests are written into the manifest
    directory. Once all splits are done, a ``.reazonspeech-fbank.done``
    marker is created there; if the marker already exists, nothing is
    recomputed.
    """
    args = get_args()
    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    logging.basicConfig(format=formatter, level=logging.INFO)
    done_marker = args.manifest_dir / ".reazonspeech-fbank.done"
    if done_marker.exists():
        logging.info(
            "Previous fbank computed for ReazonSpeech found. "
            f"Delete {done_marker} to allow recomputing fbank."
        )
        return
    extractor = Fbank(FbankConfig(num_mel_bins=80))
    # os.cpu_count() returns None when the count cannot be determined;
    # fall back to a single job instead of failing inside min().
    num_jobs = min(16, os.cpu_count() or 1)
    for part, cut_set in make_cutset_blueprints(args.manifest_dir):
        logging.info(f"Processing {part}")
        cut_set = cut_set.compute_and_store_features(
            extractor=extractor,
            num_jobs=num_jobs,
            storage_path=(args.manifest_dir / f"feats_{part}").as_posix(),
            storage_type=LilcomChunkyWriter,
        )
        cut_set.to_file(args.manifest_dir / f"reazonspeech_cuts_{part}.jsonl.gz")
    logging.info("All fbank computed for ReazonSpeech.")
    done_marker.touch()
 if __name__ == "__main__":
    main()
--- a/egs/multi_ja_en/ASR/local/display_manifest_statistics.py
+++ b/egs/multi_ja_en/ASR/local/display_manifest_statistics.py
@ -0,0 +1,58 @@
 #!/usr/bin/env python3
 # Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang)
 #              2022  The University of Electro-Communications (author: Teo Wen Shen)  # noqa
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import argparse
 from pathlib import Path
 from lhotse import CutSet, load_manifest
 ARGPARSE_DESCRIPTION = """
 This file displays duration statistics of utterances in a manifest.
 You can use the displayed value to choose minimum/maximum duration
 to remove short and long utterances during the training.
 See the function `remove_short_and_long_utt()` in
 pruned_transducer_stateless5/train.py for usage.
 """
 def get_parser():
    """Parse this script's command-line options and return the namespace."""
    cli = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=ARGPARSE_DESCRIPTION,
    )
    cli.add_argument(
        "--manifest-dir",
        help="Path to cutset manifests",
        type=Path,
    )
    parsed = cli.parse_args()
    return parsed
 def main():
    """Print duration statistics for the train and dev cut manifests."""
    options = get_parser()
    for split in ("train", "dev"):
        manifest = options.manifest_dir / f"reazonspeech_cuts_{split}.jsonl.gz"
        cut_set: CutSet = load_manifest(manifest)
        print("\n---------------------------------\n")
        print(f"{manifest.name}:")
        cut_set.describe()
 if __name__ == "__main__":
    main()
--- a/egs/multi_ja_en/ASR/local/prepare_char.py
+++ b/egs/multi_ja_en/ASR/local/prepare_char.py
@ -0,0 +1 @@
 ../../../aishell/ASR/local/prepare_char.py
--- a/egs/multi_ja_en/ASR/local/prepare_for_bpe_model.py
+++ b/egs/multi_ja_en/ASR/local/prepare_for_bpe_model.py
@ -0,0 +1,66 @@
 #!/usr/bin/env python3
 # Copyright    2023  Xiaomi Corp.        (authors: Zengrui Jin)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # This script tokenizes the training transcript by CJK characters
 # and saves the result to transcript_chars.txt, which is used
 # to train the BPE model later.
 import argparse
 import re
 from pathlib import Path
 from tqdm.auto import tqdm
 from icefall.utils import tokenize_by_ja_char
 def get_args():
    """Read ``--lang-dir`` and ``--text`` from the command line."""
    cli = argparse.ArgumentParser()
    lang_dir_help = """Output directory.
        The generated transcript_chars.txt is saved to this directory.
        """
    cli.add_argument("--lang-dir", type=str, help=lang_dir_help)
    cli.add_argument("--text", type=str, help="Training transcript.")
    namespace = cli.parse_args()
    return namespace
 def main():
    """Write the character-tokenized transcript to ``transcript_chars.txt``.

    Each line of ``--text`` is passed through ``tokenize_by_ja_char`` and the
    result is written as one line of ``<lang-dir>/transcript_chars.txt``.
    """
    args = get_args()
    lang_dir, text = Path(args.lang_dir), Path(args.text)
    assert lang_dir.exists() and text.exists(), f"{lang_dir} or {text} does not exist!"
    transcript_path = lang_dir / "transcript_chars.txt"
    with open(text, "r", encoding="utf-8") as fin, open(
        transcript_path, "w+", encoding="utf-8"
    ) as fout:
        for raw_line in tqdm(fin):
            tokenized = tokenize_by_ja_char(raw_line)
            fout.write(tokenized + "\n")
 if __name__ == "__main__":
    main()
--- a/egs/multi_ja_en/ASR/local/prepare_lang.py
+++ b/egs/multi_ja_en/ASR/local/prepare_lang.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/local/prepare_lang.py
--- a/egs/multi_ja_en/ASR/local/prepare_lang_bbpe.py
+++ b/egs/multi_ja_en/ASR/local/prepare_lang_bbpe.py
@ -0,0 +1,268 @@
 #!/usr/bin/env python3
 # Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang
 #                                                  Wei Kang)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
 This script takes as input `lang_dir`, which should contain::
    - lang_dir/bbpe.model,
    - lang_dir/words.txt
 and generates the following files in the directory `lang_dir`:
    - lexicon.txt
    - lexicon_disambig.txt
    - L.pt
    - L_disambig.pt
    - tokens.txt
 """
 import argparse
 import re
 from pathlib import Path
 from typing import Dict, List, Tuple
 import k2
 import sentencepiece as spm
 import torch
 from prepare_lang import (
    Lexicon,
    add_disambig_symbols,
    add_self_loops,
    write_lexicon,
    write_mapping,
 )
 from icefall.byte_utils import byte_encode
 from icefall.utils import str2bool, tokenize_by_ja_char
 def lexicon_to_fst_no_sil(
    lexicon: Lexicon,
    token2id: Dict[str, int],
    word2id: Dict[str, int],
    need_self_loops: bool = False,
 ) -> k2.Fsa:
    """Build the lexicon FST (k2 format) without optional silence.
    Args:
      lexicon:
        Pairs of ``(word, pieces)``; every word needs at least one piece.
      token2id:
        Maps each token to its integer ID; ``<blk>`` must map to 0.
      word2id:
        Maps each word to its integer ID; ``<eps>`` must map to 0.
      need_self_loops:
        When True, the arcs are passed through ``add_self_loops`` with
        ``token2id["#0"]`` as the input label and ``word2id["#0"]`` as the
        output label.
    Returns:
      A ``k2.Fsa`` (transducer) built from the lexicon.
    """
    # The blank symbol <blk> is defined in local/train_bpe_model.py
    assert token2id["<blk>"] == 0
    assert word2id["<eps>"] == 0
    hub = 0  # every word starts and ends in this state
    epsilon = 0
    fresh_state = 1  # next state number that has not been handed out yet
    transitions = []
    for word, pieces in lexicon:
        assert len(pieces) > 0, f"{word} has no pronunciations"
        word_id = word2id[word]
        piece_ids = [token2id[p] for p in pieces]
        source = hub
        final_pos = len(piece_ids) - 1
        for pos, piece_id in enumerate(piece_ids):
            # Only the first arc of a word emits the word label.
            out_label = word_id if pos == 0 else epsilon
            if pos == final_pos:
                # The last piece returns to the hub state.
                transitions.append([source, hub, piece_id, out_label, 0])
            else:
                transitions.append([source, fresh_state, piece_id, out_label, 0])
                source = fresh_state
                fresh_state += 1
    if need_self_loops:
        transitions = add_self_loops(
            transitions,
            disambig_token=token2id["#0"],
            disambig_word=word2id["#0"],
        )
    terminal = fresh_state
    transitions.append([hub, terminal, -1, -1, 0])
    transitions.append([terminal])
    ordered = sorted(transitions, key=lambda t: t[0])
    fsa_text = "\n".join(" ".join(str(field) for field in t) for t in ordered)
    return k2.Fsa.from_str(fsa_text, acceptor=False)
 def generate_lexicon(
    model_file: str, words: List[str], oov: str
 ) -> Tuple[Lexicon, Dict[str, int]]:
    """Build a word-piece lexicon from a sentencepiece model.
    Args:
      model_file:
        Path to a sentencepiece model.
      words:
        The words that should receive a lexicon entry.
      oov:
        The out-of-vocabulary word; it gets its own entry at the end.
    Returns:
      A tuple ``(lexicon, token2id)``:
        - lexicon: a list of ``(word, pieces)`` pairs, one per input word,
          followed by the entry for ``oov``.
        - token2id: a dict mapping every piece of the model to its ID.
    """
    processor = spm.SentencePieceProcessor()
    processor.load(str(model_file))
    # Encode to piece IDs first and map them back to piece strings
    # afterwards, to avoid OOV tokens.
    byte_level_words = [byte_encode(tokenize_by_ja_char(w)) for w in words]
    ids_per_word: List[List[int]] = processor.encode(byte_level_words, out_type=int)
    pieces_per_word: List[List[str]] = [
        processor.id_to_piece(ids) for ids in ids_per_word
    ]
    lexicon = list(zip(words, pieces_per_word))
    lexicon.append((oov, ["▁", processor.id_to_piece(processor.unk_id())]))
    token2id: Dict[str, int] = {
        processor.id_to_piece(idx): idx for idx in range(processor.vocab_size())
    }
    return lexicon, token2id
 def get_args():
    """Parse the command-line arguments of this script.
    Returns:
      The parsed namespace with ``lang_dir``, ``oov`` and ``debug``.
    """
    parser = argparse.ArgumentParser()
    # The model file this script loads is named bbpe.model, so the help text
    # names that file (it used to say bpe.model).
    parser.add_argument(
        "--lang-dir",
        type=str,
        help="""Input and output directory.
        It should contain the bbpe.model and words.txt
        """,
    )
    parser.add_argument(
        "--oov",
        type=str,
        default="<UNK>",
        help="The out of vocabulary word in lexicon.",
    )
    parser.add_argument(
        "--debug",
        type=str2bool,
        default=False,
        help="""True for debugging, which will generate
        a visualization of the lexicon FST.
        Caution: If your lexicon contains hundreds of thousands
        of lines, please set it to False!
        See "test/test_bpe_lexicon.py" for usage.
        """,
    )
    return parser.parse_args()
 def main():
    """Generate tokens.txt, both lexicons and the L / L_disambig FSTs.

    Reads ``bbpe.model`` and ``words.txt`` from ``--lang-dir`` and writes all
    outputs back into the same directory. With ``--debug`` the two FSTs are
    additionally drawn as SVG files.
    """
    args = get_args()
    lang_dir = Path(args.lang_dir)
    model_file = lang_dir / "bbpe.model"
    word_sym_table = k2.SymbolTable.from_file(lang_dir / "words.txt")
    words = word_sym_table.symbols
    # Drop the special symbols from the word list before generating entries.
    for special in ["<eps>", "!SIL", "<SPOKEN_NOISE>", args.oov, "#0", "<s>", "</s>"]:
        if special in words:
            words.remove(special)
    lexicon, token_sym_table = generate_lexicon(model_file, words, args.oov)
    lexicon_disambig, max_disambig = add_disambig_symbols(lexicon)
    # Disambiguation tokens #0..#max_disambig take the IDs that follow the
    # largest ID already present in the token table.
    first_disambig_id = max(token_sym_table.values()) + 1
    for offset in range(max_disambig + 1):
        symbol = f"#{offset}"
        assert symbol not in token_sym_table
        token_sym_table[symbol] = first_disambig_id + offset
    for extra_word in ("#0", "<s>", "</s>"):
        word_sym_table.add(extra_word)
    write_mapping(lang_dir / "tokens.txt", token_sym_table)
    write_lexicon(lang_dir / "lexicon.txt", lexicon)
    write_lexicon(lang_dir / "lexicon_disambig.txt", lexicon_disambig)
    L = lexicon_to_fst_no_sil(
        lexicon,
        token2id=token_sym_table,
        word2id=word_sym_table,
    )
    L_disambig = lexicon_to_fst_no_sil(
        lexicon_disambig,
        token2id=token_sym_table,
        word2id=word_sym_table,
        need_self_loops=True,
    )
    torch.save(L.as_dict(), lang_dir / "L.pt")
    torch.save(L_disambig.as_dict(), lang_dir / "L_disambig.pt")
    if not args.debug:
        return
    labels_sym = k2.SymbolTable.from_file(lang_dir / "tokens.txt")
    aux_labels_sym = k2.SymbolTable.from_file(lang_dir / "words.txt")
    for fsa, stem in ((L, "L"), (L_disambig, "L_disambig")):
        fsa.labels_sym = labels_sym
        fsa.aux_labels_sym = aux_labels_sym
        fsa.draw(str(lang_dir / f"{stem}.svg"), title=f"{stem}.pt")
 if __name__ == "__main__":
    main()
--- a/egs/multi_ja_en/ASR/local/prepare_lang_char.py
+++ b/egs/multi_ja_en/ASR/local/prepare_lang_char.py
@ -0,0 +1,75 @@
 #!/usr/bin/env python3
 # Copyright    2022  The University of Electro-Communications  (Author: Teo Wen Shen)  # noqa
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import argparse
 import logging
 from pathlib import Path
 from lhotse import CutSet
 def get_args():
    """Parse the command-line arguments of this script.
    Returns:
      The parsed namespace with ``train_cut`` and ``lang_dir`` as ``Path``.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "train_cut", metavar="train-cut", type=Path, help="Path to the train cut"
    )
    # The formatter appends the real default to the help text. The previous
    # wording promised a "lang_char_{trans-mode}" fallback that is not
    # implemented here, so the help now only states what the option is.
    parser.add_argument(
        "--lang-dir",
        type=Path,
        default=Path("data/lang_char"),
        help="Name of lang dir.",
    )
    return parser.parse_args()
 def main():
    """Collect the character vocabulary of the training cuts.

    Every character of every supervision text becomes a token. The result is
    written to ``<lang-dir>/tokens.txt`` as ``token<TAB>id`` lines, with
    ``<blk>`` first and ``<unk>``, ``<sos/eos>`` last, and
    ``<lang-dir>/lang_type`` is set to ``char``.
    """
    args = get_args()
    logging.basicConfig(
        format=("%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"),
        level=logging.INFO,
    )
    sysdef_string = {"<blk>", "<unk>", "<sos/eos>", " "}
    seen_chars = set()
    logging.info(f"Creating vocabulary from {args.train_cut}.")
    train_cut: CutSet = CutSet.from_file(args.train_cut)
    for cut in train_cut:
        for sup in cut.supervisions:
            # Updating a set with a string adds its individual characters.
            seen_chars.update(sup.text)
    tokens = ["<blk>"] + sorted(seen_chars - sysdef_string) + ["<unk>", "<sos/eos>"]
    args.lang_dir.mkdir(parents=True, exist_ok=True)
    # Write with an explicit UTF-8 encoding so the output does not depend on
    # the locale's default encoding.
    (args.lang_dir / "tokens.txt").write_text(
        "\n".join(f"{t}\t{i}" for i, t in enumerate(tokens)),
        encoding="utf-8",
    )
    (args.lang_dir / "lang_type").write_text("char", encoding="utf-8")
    logging.info("Done.")
 if __name__ == "__main__":
    main()
--- a/egs/multi_ja_en/ASR/local/prepare_words.py
+++ b/egs/multi_ja_en/ASR/local/prepare_words.py
@ -0,0 +1 @@
 ../../../aishell2/ASR/local/prepare_words.py
--- a/egs/multi_ja_en/ASR/local/text2segments.py
+++ b/egs/multi_ja_en/ASR/local/text2segments.py
@ -0,0 +1,95 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 # Copyright    2021  Xiaomi Corp.        (authors: Mingshuang Luo)
 #              2022  Xiaomi Corp.        (authors: Weiji Zhuang)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
 This script takes as input "text", which refers to the transcript file:
    - text
 and generates the output file with word segmentation implemented using MeCab:
    - text_words_segmentation
 """
 import argparse
 from multiprocessing import Pool
 import MeCab
 from tqdm import tqdm
 def get_parser():
    """Create the argument parser for the word-segmentation script."""
    cli = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Japanese Word Segmentation for text",
    )
    # (flags, keyword arguments) for each option, in declaration order.
    option_specs = (
        (
            ("--num-process", "-n"),
            {"default": 20, "type": int, "help": "the number of processes"},
        ),
        (
            ("--input-file", "-i"),
            {
                "default": "data/lang_char/text",
                "type": str,
                "help": "the input text file",
            },
        ),
        (
            ("--output-file", "-o"),
            {
                "default": "data/lang_char/text_words_segmentation",
                "type": str,
                "help": "the text implemented with word segmentation using MeCab",
            },
        ),
    )
    for flags, spec in option_specs:
        cli.add_argument(*flags, **spec)
    return cli
 def cut(lines):
    """Segment one line of text into words with MeCab.
    Args:
      lines:
        The text to segment, or None.
    Returns:
      The list of words produced by MeCab's ``-Owakati`` output, or None
      when ``lines`` is None.
    """
    if lines is None:
        return None
    # Build the tagger once per process and reuse it; the original created a
    # new MeCab.Tagger for every single line.
    tagger = getattr(cut, "_tagger", None)
    if tagger is None:
        tagger = MeCab.Tagger("-Owakati")  # '-Owakati' selects word segmentation
        cut._tagger = tagger
    return tagger.parse(lines).strip().split()
 def main():
    """Segment every line of the input file and write one result per line."""
    args = get_parser().parse_args()
    worker_count = args.num_process
    source_path = args.input_file
    target_path = args.output_file
    with open(source_path, "r", encoding="utf-8") as reader:
        lines = reader.readlines()
    with Pool(processes=worker_count) as pool:
        progress = tqdm(pool.imap(cut, lines), total=len(lines))
        segmented = list(progress)
    with open(target_path, "w", encoding="utf-8") as writer:
        for words in segmented:
            writer.write(f"{' '.join(words)}\n")
 if __name__ == "__main__":
    main()
--- a/egs/multi_ja_en/ASR/local/text2token.py
+++ b/egs/multi_ja_en/ASR/local/text2token.py
@ -0,0 +1,177 @@
 #!/usr/bin/env python3
 # Copyright    2017  Johns Hopkins University   (authors: Shinji Watanabe)
 #              2022  Xiaomi Corp.               (authors: Mingshuang Luo)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import argparse
 import codecs
 import re
 import sys
 from typing import List
 from romkan import to_roma  # Replace with python-romkan v0.2.1
 is_python2 = sys.version_info[0] == 2
 def exist_or_not(i, match_pos):
    """Find the first span in ``match_pos`` that contains position ``i``.
    Args:
      i:
        The position to look up.
      match_pos:
        A sequence of ``[start, end)`` spans.
    Returns:
      ``(start, end)`` of the first span with ``start <= i < end``, or
      ``(None, None)`` when no span contains ``i``.
    """
    hit = next((span for span in match_pos if span[0] <= i < span[1]), None)
    if hit is None:
        return None, None
    return hit[0], hit[1]
 def get_parser():
    """Create the argument parser for the text-to-token converter."""
    cli = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="convert raw text to tokenized text",
    )
    cli.add_argument(
        "--nchar",
        "-n",
        type=int,
        default=1,
        help="number of characters to split, i.e., \
                        aabb -> a a b b with -n 1 and aa bb with -n 2",
    )
    cli.add_argument(
        "--skip-ncols",
        "-s",
        type=int,
        default=0,
        help="skip first n columns",
    )
    cli.add_argument(
        "--space",
        type=str,
        default="<space>",
        help="space symbol",
    )
    cli.add_argument(
        "--non-lang-syms",
        "-l",
        type=str,
        default=None,
        help="list of non-linguistic symbols, e.g., <NOISE> etc.",
    )
    cli.add_argument(
        "text",
        nargs="?",
        type=str,
        default=False,
        help="input text",
    )
    cli.add_argument(
        "--trans_type",
        "-t",
        choices=["char", "romaji"],
        type=str,
        default="char",
        help="Transcript type. char/romaji",
    )
    return cli
 def token2id(
    texts, token_table, token_type: str = "romaji", oov: str = "<unk>"
 ) -> List[List[int]]:
    """Convert each text into a list of token IDs.
    Args:
      texts:
        The input texts, it refers to the Japanese text here.
      token_table:
        The token table is built based on "data/lang_xxx/token.txt"
      token_type:
        The type of token: "romaji" converts every character with
        ``to_roma`` before the lookup, "char" looks the characters up as
        they are.
      oov:
        Out of vocabulary token. When a word(token) in the transcript
        does not exist in the token list, it is replaced with `oov`.
    Returns:
      The list of ids for the input texts, one inner list per text.
    Raises:
      ValueError:
        If ``texts`` is None or ``token_type`` is not supported.
    """
    if texts is None:
        raise ValueError("texts can't be None!")
    # Previously every token_type other than "romaji" silently produced an
    # empty result; unsupported types are now reported instead.
    if token_type not in ("romaji", "char"):
        raise ValueError(f"Unsupported token_type: {token_type}")
    oov_id = token_table[oov]
    ids: List[List[int]] = []
    for text in texts:
        tokens = list(str(text))
        if token_type == "romaji":
            tokens = [to_roma(c) for c in tokens]
        sub_ids = [token_table[t] if t in token_table else oov_id for t in tokens]
        ids.append(sub_ids)
    return ids
 def main():
    """Tokenize every input line and print the result to stdout.

    Input comes from the ``text`` argument or, when it is omitted, from
    stdin. For each line the first ``--skip-ncols`` columns are echoed
    unchanged; the remainder is split into characters, keeping every symbol
    listed in ``--non-lang-syms`` as one unit, optionally converted with
    ``to_roma`` (``--trans_type romaji``), and printed as one string.
    """
    parser = get_parser()
    args = parser.parse_args()
    rs = []
    if args.non_lang_syms is not None:
        with codecs.open(args.non_lang_syms, "r", encoding="utf-8") as f:
            nls = [x.rstrip() for x in f.readlines()]
            # An empty pattern matches without consuming any input, which
            # would make the position scan below loop forever, so blank
            # lines in the symbol file are skipped.
            rs = [re.compile(re.escape(x)) for x in nls if x]
    if args.text:
        f = codecs.open(args.text, encoding="utf-8")
    else:
        f = codecs.getreader("utf-8")(sys.stdin if is_python2 else sys.stdin.buffer)
    sys.stdout = codecs.getwriter("utf-8")(
        sys.stdout if is_python2 else sys.stdout.buffer
    )
    n = args.nchar
    try:
        line = f.readline()
        while line:
            x = line.split()
            print(" ".join(x[: args.skip_ncols]), end=" ")
            a = " ".join(x[args.skip_ncols :])  # noqa E203
            # get all matched positions
            match_pos = []
            for r in rs:
                i = 0
                while i >= 0:
                    m = r.search(a, i)
                    if m:
                        match_pos.append([m.start(), m.end()])
                        i = m.end()
                    else:
                        break
            if len(match_pos) > 0:
                # Rebuild the line as a list in which every matched symbol
                # is one element and everything else is a single character.
                chars = []
                i = 0
                while i < len(a):
                    start_pos, end_pos = exist_or_not(i, match_pos)
                    if start_pos is not None:
                        chars.append(a[start_pos:end_pos])
                        i = end_pos
                    else:
                        chars.append(a[i])
                        i += 1
                a = chars
            if args.trans_type == "romaji":
                # Iterate over `a` itself: for a string this yields its
                # characters, for the list built above its elements. The
                # former list(str(a)) walked the repr of that list instead.
                # NOTE(review): matched symbols are passed to to_roma too --
                # confirm that it leaves them unchanged.
                a = [to_roma(c) for c in a]
            a = [a[j : j + n] for j in range(0, len(a), n)]  # noqa E203
            a_chars = "".join("".join(z) for z in a)
            print(a_chars)
            line = f.readline()
    finally:
        # Only close what was opened here; stdin is left alone.
        if args.text:
            f.close()
 if __name__ == "__main__":
    main()
--- a/egs/multi_ja_en/ASR/local/train_bbpe_model.py
+++ b/egs/multi_ja_en/ASR/local/train_bbpe_model.py
@ -0,0 +1,114 @@
 #!/usr/bin/env python3
 # Copyright    2023  Xiaomi Corp.        (authors: Wei Kang)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # You can install sentencepiece via:
 #
 #  pip install sentencepiece
 #
 # Due to an issue reported in
 # https://github.com/google/sentencepiece/pull/642#issuecomment-857972030
 #
 # Please install a version >=0.1.96
 import argparse
 import re
 import shutil
 import tempfile
 from pathlib import Path
 import sentencepiece as spm
 from icefall import byte_encode
 from icefall.utils import tokenize_by_ja_char
 def get_args():
    """Parse the command-line options of the byte-level BPE training script.

    All three options are mandatory so that a missing one is reported by
    argparse instead of surfacing later as an error on a ``None`` value.

    Returns:
      An ``argparse.Namespace`` with ``lang_dir`` (str), ``transcript`` (str)
      and ``vocab_size`` (int).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--lang-dir",
        type=str,
        required=True,
        help="""Input and output directory.
        The generated bbpe.model is saved to this directory.
        """,
    )
    parser.add_argument(
        "--transcript",
        type=str,
        required=True,
        help="Training transcript.",
    )
    parser.add_argument(
        "--vocab-size",
        type=int,
        required=True,
        help="Vocabulary size for BPE training",
    )
    return parser.parse_args()
 def _convert_to_bchar(in_path: str, out_path: str):
    """Write a byte-encoded copy of a transcript.

    Every line of ``in_path`` is passed through ``tokenize_by_ja_char`` and
    then ``byte_encode`` (both imported from icefall) and written to
    ``out_path`` followed by a newline.

    Args:
      in_path:
        Path of the transcript to read.
      out_path:
        Path of the file to write; it is overwritten.
    """
    # Both files are opened as UTF-8 explicitly so the result does not depend
    # on the locale, and both handles are closed when the block exits.
    with open(in_path, "r", encoding="utf-8") as fin, open(
        out_path, "w", encoding="utf-8"
    ) as fout:
        for line in fin:
            fout.write(byte_encode(tokenize_by_ja_char(line)) + "\n")
 def main():
    """Train a unigram SentencePiece model on the byte-encoded transcript.

    The model is written to ``<lang_dir>/unigram_<vocab_size>.model`` and then
    copied to ``<lang_dir>/bbpe.model``.  If the unigram model file already
    exists the function prints a message and returns without training or
    copying.
    """
    args = get_args()
    vocab_size = args.vocab_size
    lang_dir = Path(args.lang_dir)

    model_type = "unigram"
    model_prefix = f"{lang_dir}/{model_type}_{vocab_size}"
    model_file = Path(model_prefix + ".model")
    if model_file.is_file():
        print(f"{model_file} exists - skipping")
        return

    character_coverage = 1.0
    input_sentence_size = 100000000

    user_defined_symbols = ["<blk>", "<sos/eos>"]
    unk_id = len(user_defined_symbols)
    # Note: unk_id is fixed to 2.
    # If you change it, you should also change other
    # places that are using it.

    # The byte-encoded training text lives in a temporary file that is removed
    # as soon as training has finished.
    with tempfile.NamedTemporaryFile() as temp:
        train_text = temp.name
        _convert_to_bchar(args.transcript, train_text)

        spm.SentencePieceTrainer.train(
            input=train_text,
            vocab_size=vocab_size,
            model_type=model_type,
            model_prefix=model_prefix,
            input_sentence_size=input_sentence_size,
            character_coverage=character_coverage,
            user_defined_symbols=user_defined_symbols,
            unk_id=unk_id,
            bos_id=-1,
            eos_id=-1,
        )

    shutil.copyfile(model_file, f"{lang_dir}/bbpe.model")
 # Script entry point: train the model only when executed directly.
 if __name__ == "__main__":
    main()
--- a/egs/multi_ja_en/ASR/local/utils/asr_datamodule.py
+++ b/egs/multi_ja_en/ASR/local/utils/asr_datamodule.py
@ -0,0 +1,355 @@
 # Copyright      2021  Piotr Żelasko
 # Copyright      2022  Xiaomi Corporation     (Author: Mingshuang Luo)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import argparse
 import inspect
 import logging
 from functools import lru_cache
 from pathlib import Path
 from typing import Any, Dict, List, Optional
 from lhotse import CutSet, Fbank, FbankConfig, load_manifest, load_manifest_lazy
 from lhotse.dataset import (
    CutConcatenate,
    CutMix,
    DynamicBucketingSampler,
    K2SpeechRecognitionDataset,
    PrecomputedFeatures,
    SimpleCutSampler,
    SpecAugment,
 )
 from lhotse.dataset.input_strategies import OnTheFlyFeatures
 from torch.utils.data import DataLoader
 from icefall.utils import str2bool
 class ReazonSpeechAsrDataModule:
    """
    DataModule for k2 ASR experiments on ReazonSpeech.

    It builds the PyTorch DataLoaders used for training, validation and
    testing from Lhotse CutSets, and loads the ReazonSpeech train/dev/test
    cut manifests found under ``--manifest-dir``.

    The data pipeline covers:
    - dynamic batch size,
    - bucketing samplers,
    - optional SpecAugment on the training features,
    - optional cut concatenation for the validation set,
    - precomputed or on-the-fly fbank features.
    """

    def __init__(self, args: argparse.Namespace):
        """Store the parsed command-line options (see ``add_arguments``)."""
        self.args = args

    @classmethod
    def add_arguments(cls, parser: argparse.ArgumentParser):
        """Register the data-related command-line options on ``parser``."""
        group = parser.add_argument_group(
            title="ASR data related options",
            description="These options are used for the preparation of "
            "PyTorch DataLoaders from Lhotse CutSet's -- they control the "
            "effective batch sizes, sampling strategies, applied data "
            "augmentations, etc.",
        )
        group.add_argument(
            "--manifest-dir",
            type=Path,
            default=Path("data/fbank"),
            help="Path to directory with train/dev/test cuts.",
        )
        # The value is a duration in seconds and the default is a float, so
        # the option is parsed as a float; integer values are still accepted.
        group.add_argument(
            "--max-duration",
            type=float,
            default=200.0,
            help="Maximum pooled recordings duration (seconds) in a "
            "single batch. You can reduce it if it causes CUDA OOM.",
        )
        group.add_argument(
            "--bucketing-sampler",
            type=str2bool,
            default=True,
            help="When enabled, the batches will come from buckets of "
            "similar duration (saves padding frames).",
        )
        group.add_argument(
            "--num-buckets",
            type=int,
            default=30,
            help="The number of buckets for the DynamicBucketingSampler "
            "(you might want to increase it for larger datasets).",
        )
        group.add_argument(
            "--concatenate-cuts",
            type=str2bool,
            default=False,
            help="When enabled, utterances (cuts) will be concatenated "
            "to minimize the amount of padding.",
        )
        group.add_argument(
            "--duration-factor",
            type=float,
            default=1.0,
            help="Determines the maximum duration of a concatenated cut "
            "relative to the duration of the longest cut in a batch.",
        )
        group.add_argument(
            "--gap",
            type=float,
            default=1.0,
            help="The amount of padding (in seconds) inserted between "
            "concatenated cuts. This padding is filled with noise when "
            "noise augmentation is used.",
        )
        group.add_argument(
            "--on-the-fly-feats",
            type=str2bool,
            default=False,
            help="When enabled, use on-the-fly cut mixing and feature "
            "extraction. Will drop existing precomputed feature manifests "
            "if available.",
        )
        group.add_argument(
            "--shuffle",
            type=str2bool,
            default=True,
            help="When enabled (=default), the examples will be "
            "shuffled for each epoch.",
        )
        group.add_argument(
            "--drop-last",
            type=str2bool,
            default=True,
            help="Whether to drop last batch. Used by sampler.",
        )
        group.add_argument(
            "--return-cuts",
            type=str2bool,
            default=False,
            help="When enabled, each batch will have the "
            "field: batch['supervisions']['cut'] with the cuts that "
            "were used to construct it.",
        )
        group.add_argument(
            "--num-workers",
            type=int,
            default=2,
            help="The number of training dataloader workers that "
            "collect the batches.",
        )
        group.add_argument(
            "--enable-spec-aug",
            type=str2bool,
            default=True,
            help="When enabled, use SpecAugment for training dataset.",
        )
        group.add_argument(
            "--spec-aug-time-warp-factor",
            type=int,
            default=80,
            help="Used only when --enable-spec-aug is True. "
            "It specifies the factor for time warping in SpecAugment. "
            "Larger values mean more warping. "
            "A value less than 1 means to disable time warp.",
        )
        group.add_argument(
            "--enable-musan",
            type=str2bool,
            default=False,
            help="When enabled, select noise from MUSAN and mix it "
            "with training dataset. ",
        )

    def train_dataloaders(
        self, cuts_train: CutSet, sampler_state_dict: Optional[Dict[str, Any]] = None
    ) -> DataLoader:
        """
        Build the training DataLoader.

        Args:
          cuts_train:
            CutSet for training.
          sampler_state_dict:
            The state dict for the training sampler.
        Returns:
          A DataLoader whose batching is delegated to the Lhotse sampler.
        """
        # No cut-level transforms are configured here: --enable-musan,
        # --concatenate-cuts, --duration-factor and --gap are parsed but not
        # consulted by this method.
        transforms = []
        input_transforms = []

        if self.args.enable_spec_aug:
            logging.info("Enable SpecAugment")
            logging.info(f"Time warp factor: {self.args.spec_aug_time_warp_factor}")
            # Set the value of num_frame_masks according to Lhotse's version.
            # In different Lhotse's versions, the default of num_frame_masks is
            # different.
            num_frame_masks = 10
            num_frame_masks_parameter = inspect.signature(
                SpecAugment.__init__
            ).parameters["num_frame_masks"]
            if num_frame_masks_parameter.default == 1:
                num_frame_masks = 2
            logging.info(f"Num frame mask: {num_frame_masks}")
            input_transforms.append(
                SpecAugment(
                    time_warp_factor=self.args.spec_aug_time_warp_factor,
                    num_frame_masks=num_frame_masks,
                    features_mask_size=27,
                    num_feature_masks=2,
                    frames_mask_size=100,
                )
            )
        else:
            logging.info("Disable SpecAugment")

        logging.info("About to create train dataset")
        if self.args.on_the_fly_feats:
            # NOTE: on-the-fly speed perturbation (e.g. a PerturbSpeed cut
            # transform with p=2/3 and 3x more epochs) should be added here
            # only if it is removed from the data preparation stage.
            # Features are computed from audio, ignoring precomputed ones.
            train = K2SpeechRecognitionDataset(
                cut_transforms=transforms,
                input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80))),
                input_transforms=input_transforms,
                return_cuts=self.args.return_cuts,
            )
        else:
            train = K2SpeechRecognitionDataset(
                cut_transforms=transforms,
                input_transforms=input_transforms,
                return_cuts=self.args.return_cuts,
            )

        if self.args.bucketing_sampler:
            logging.info("Using DynamicBucketingSampler.")
            train_sampler = DynamicBucketingSampler(
                cuts_train,
                max_duration=self.args.max_duration,
                shuffle=self.args.shuffle,
                num_buckets=self.args.num_buckets,
                drop_last=self.args.drop_last,
            )
        else:
            logging.info("Using SimpleCutSampler.")
            train_sampler = SimpleCutSampler(
                cuts_train,
                max_duration=self.args.max_duration,
                shuffle=self.args.shuffle,
            )
        logging.info("About to create train dataloader")

        if sampler_state_dict is not None:
            logging.info("Loading sampler state dict")
            train_sampler.load_state_dict(sampler_state_dict)

        # batch_size=None: the sampler already yields complete batches.
        train_dl = DataLoader(
            train,
            sampler=train_sampler,
            batch_size=None,
            num_workers=self.args.num_workers,
            persistent_workers=False,
        )
        return train_dl

    def valid_dataloaders(self, cuts_valid: CutSet) -> DataLoader:
        """
        Build the validation DataLoader.

        Args:
          cuts_valid:
            CutSet for validation.
        Returns:
          A DataLoader over unshuffled, duration-bucketed batches; it always
          uses two workers.
        """
        transforms = []
        if self.args.concatenate_cuts:
            transforms = [
                CutConcatenate(
                    duration_factor=self.args.duration_factor, gap=self.args.gap
                )
            ] + transforms

        logging.info("About to create dev dataset")
        if self.args.on_the_fly_feats:
            validate = K2SpeechRecognitionDataset(
                cut_transforms=transforms,
                input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80))),
                return_cuts=self.args.return_cuts,
            )
        else:
            validate = K2SpeechRecognitionDataset(
                cut_transforms=transforms,
                return_cuts=self.args.return_cuts,
            )
        valid_sampler = DynamicBucketingSampler(
            cuts_valid,
            max_duration=self.args.max_duration,
            shuffle=False,
        )
        logging.info("About to create dev dataloader")
        valid_dl = DataLoader(
            validate,
            sampler=valid_sampler,
            batch_size=None,
            num_workers=2,
            persistent_workers=False,
        )
        return valid_dl

    def test_dataloaders(self, cuts: CutSet) -> DataLoader:
        """
        Build a test DataLoader.

        Args:
          cuts:
            CutSet to decode.
        Returns:
          A DataLoader over unshuffled, duration-bucketed batches.
        """
        logging.info("About to create test dataset")
        test = K2SpeechRecognitionDataset(
            input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80)))
            if self.args.on_the_fly_feats
            else PrecomputedFeatures(),
            return_cuts=self.args.return_cuts,
        )
        sampler = DynamicBucketingSampler(
            cuts,
            max_duration=self.args.max_duration,
            shuffle=False,
        )
        test_dl = DataLoader(
            test,
            batch_size=None,
            sampler=sampler,
            num_workers=self.args.num_workers,
        )
        return test_dl

    # The lru_cache decorators below key on `self`, so each manifest is opened
    # once per data-module instance.

    @lru_cache()
    def train_cuts(self) -> CutSet:
        """Lazily load the ReazonSpeech training cuts."""
        logging.info("About to get train cuts")
        return load_manifest_lazy(
            self.args.manifest_dir / "reazonspeech_cuts_train.jsonl.gz"
        )

    @lru_cache()
    def valid_cuts(self) -> CutSet:
        """Lazily load the ReazonSpeech dev cuts."""
        logging.info("About to get dev cuts")
        return load_manifest_lazy(
            self.args.manifest_dir / "reazonspeech_cuts_dev.jsonl.gz"
        )

    @lru_cache()
    def test_cuts(self) -> CutSet:
        """Lazily load the ReazonSpeech test cuts (a single CutSet)."""
        logging.info("About to get test cuts")
        return load_manifest_lazy(
            self.args.manifest_dir / "reazonspeech_cuts_test.jsonl.gz"
        )
--- a/egs/multi_ja_en/ASR/local/utils/tokenizer.py
+++ b/egs/multi_ja_en/ASR/local/utils/tokenizer.py
@ -0,0 +1,252 @@
 import argparse
 from pathlib import Path
 from typing import Callable, List, Union
 import sentencepiece as spm
 from k2 import SymbolTable
 class Tokenizer:
    """Common interface for the tokenizers used by this recipe.

    Method names mirror ``sentencepiece.SentencePieceProcessor``: most methods
    exist in CamelCase with a snake_case alias.  Subclasses provide the batch
    primitives (``EncodeAsIdsBatch``, ``DecodeIdsBatch``, ...) and the
    single-item helpers defined here are built on top of them.
    """

    # Splits a text into the units that are mapped to tokens.
    text2word: Callable[[str], List[str]]

    @staticmethod
    def add_arguments(parser: argparse.ArgumentParser):
        """Register ``--lang`` and ``--lang-type`` on ``parser``."""
        group = parser.add_argument_group(title="Lang related options")
        group.add_argument("--lang", type=Path, help="Path to lang directory.")
        group.add_argument(
            "--lang-type",
            type=str,
            default=None,
            help=(
                "Either 'bpe' or 'char'. If not provided, it expects lang_dir/lang_type to exists. "
                "Note: 'bpe' directly loads sentencepiece.SentencePieceProcessor"
            ),
        )

    @staticmethod
    def Load(lang_dir: Path, lang_type="", oov="<unk>"):
        """Create the tokenizer stored in ``lang_dir``.

        Args:
          lang_dir:
            Directory holding ``bpe.model`` (for "bpe") or ``tokens.txt``
            (for "char").
          lang_type:
            "bpe" or "char".  When empty, it is read from the file
            ``lang_dir/lang_type``.
          oov:
            Out-of-vocabulary symbol; only used by the "char" tokenizer.
        Returns:
          A ``sentencepiece.SentencePieceProcessor`` for "bpe", or a
          ``CharTokenizer`` for "char".
        Raises:
          NotImplementedError: if ``lang_type`` is neither "bpe" nor "char".
        """
        if not lang_type:
            assert (lang_dir / "lang_type").exists(), "lang_type not specified."
            lang_type = (lang_dir / "lang_type").read_text().strip()

        tokenizer = None
        if lang_type == "bpe":
            assert (
                lang_dir / "bpe.model"
            ).exists(), f"No BPE .model could be found in {lang_dir}."
            tokenizer = spm.SentencePieceProcessor()
            tokenizer.Load(str(lang_dir / "bpe.model"))
        elif lang_type == "char":
            tokenizer = CharTokenizer(lang_dir, oov=oov)
        else:
            raise NotImplementedError(f"{lang_type} not supported at the moment.")
        return tokenizer

    load = Load

    def PieceToId(self, piece: str) -> int:
        """Map a token to its id.  Must be provided by the subclass."""
        raise NotImplementedError(
            "You need to implement this function in the child class."
        )

    piece_to_id = PieceToId

    def IdToPiece(self, id: int) -> str:
        """Map an id to its token.  Must be provided by the subclass."""
        raise NotImplementedError(
            "You need to implement this function in the child class."
        )

    id_to_piece = IdToPiece

    def GetPieceSize(self) -> int:
        """Return the vocabulary size.  Must be provided by the subclass."""
        raise NotImplementedError(
            "You need to implement this function in the child class."
        )

    get_piece_size = GetPieceSize

    def __len__(self) -> int:
        return self.get_piece_size()

    def EncodeAsIdsBatch(self, input: List[str]) -> List[List[int]]:
        """Encode each text as a list of ids.  Must be provided by the subclass."""
        raise NotImplementedError(
            "You need to implement this function in the child class."
        )

    def EncodeAsPiecesBatch(self, input: List[str]) -> List[List[str]]:
        """Encode each text as a list of tokens.  Must be provided by the subclass."""
        raise NotImplementedError(
            "You need to implement this function in the child class."
        )

    def EncodeAsIds(self, input: str) -> List[int]:
        """Encode a single text as ids."""
        return self.EncodeAsIdsBatch([input])[0]

    def EncodeAsPieces(self, input: str) -> List[str]:
        """Encode a single text as tokens."""
        return self.EncodeAsPiecesBatch([input])[0]

    def Encode(
        self, input: Union[str, List[str]], out_type=int
    ) -> Union[List, List[List]]:
        """Encode one text or a list of texts.

        Args:
          input:
            A text, or a list of texts (encoded as a batch).
          out_type:
            ``int`` to get ids, ``str`` to get tokens.
        Returns:
          ``[]`` for empty input; otherwise a list (single text) or a list of
          lists (batch).  Any other ``out_type`` yields ``None``.
        """
        if not input:
            return []

        if isinstance(input, list):
            if out_type is int:
                return self.EncodeAsIdsBatch(input)
            if out_type is str:
                return self.EncodeAsPiecesBatch(input)

        if out_type is int:
            return self.EncodeAsIds(input)
        if out_type is str:
            return self.EncodeAsPieces(input)

    encode = Encode

    def DecodeIdsBatch(self, input: List[List[int]]) -> List[str]:
        """Decode each id sequence to text.  Must be provided by the subclass."""
        raise NotImplementedError(
            "You need to implement this function in the child class."
        )

    def DecodePiecesBatch(self, input: List[List[str]]) -> List[str]:
        """Decode each token sequence to text.  Must be provided by the subclass."""
        raise NotImplementedError(
            "You need to implement this function in the child class."
        )

    def DecodeIds(self, input: List[int]) -> str:
        """Decode a single id sequence."""
        return self.DecodeIdsBatch([input])[0]

    def DecodePieces(self, input: List[str]) -> str:
        """Decode a single token sequence."""
        return self.DecodePiecesBatch([input])[0]

    def Decode(
        self,
        input: Union[int, List[int], List[str], List[List[int]], List[List[str]]],
    ) -> Union[List[str], str]:
        """Decode an id, a sequence, or a batch of sequences.

        Args:
          input:
            A single id, a list of ids or tokens, or a list of such lists.
        Returns:
          The token for a single id, ``""`` for empty input, a string for one
          sequence, or a list of strings for a batch.
        Raises:
          TypeError: if ``input`` is a non-empty ``str``.
          RuntimeError: if the element type is not recognised.
        """
        # A single id is handled before the emptiness check: id 0 is falsy
        # and would otherwise be decoded as the empty string.
        if isinstance(input, int):
            return self.id_to_piece(input)

        if not input:
            return ""

        if isinstance(input, str):
            raise TypeError(
                "Unlike spm.SentencePieceProcessor, cannot decode from type str."
            )

        if isinstance(input[0], list):
            # An empty first sequence is treated as a batch of id sequences.
            if not input[0] or isinstance(input[0][0], int):
                return self.DecodeIdsBatch(input)

            if isinstance(input[0][0], str):
                return self.DecodePiecesBatch(input)

        if isinstance(input[0], int):
            return self.DecodeIds(input)
        if isinstance(input[0], str):
            return self.DecodePieces(input)

        raise RuntimeError("Unknown input type")

    decode = Decode

    def SplitBatch(self, input: List[str]) -> List[List[str]]:
        """Split each text into units.  Must be provided by the subclass."""
        raise NotImplementedError(
            "You need to implement this function in the child class."
        )

    def Split(self, input: Union[List[str], str]) -> Union[List[List[str]], List[str]]:
        """Split one text or a list of texts into units.

        Raises:
          RuntimeError: if ``input`` is neither a ``str`` nor a ``list``.
        """
        if isinstance(input, list):
            return self.SplitBatch(input)
        elif isinstance(input, str):
            return self.SplitBatch([input])[0]
        raise RuntimeError("Unknown input type")

    split = Split
 class CharTokenizer(Tokenizer):
    """Tokenizer that maps units of text to ids using ``tokens.txt``.

    With an empty ``sep`` every character is a unit (spaces are removed
    first); otherwise the text is split on ``sep``.  Units missing from the
    table are mapped to the ``oov`` symbol.
    """

    def __init__(self, lang_dir: Path, oov="<unk>", sep=""):
        """
        Args:
          lang_dir:
            Directory containing ``tokens.txt``.
          oov:
            Symbol used for units that are not in the table; it must itself
            be present in ``tokens.txt``.
          sep:
            Separator between units.  Empty means character-level splitting.
        """
        assert (
            lang_dir / "tokens.txt"
        ).exists(), f"tokens.txt could not be found in {lang_dir}."
        token_table = SymbolTable.from_file(lang_dir / "tokens.txt")
        assert (
            "#0" not in token_table
        ), "This tokenizer does not support disambig symbols."
        self._id2sym = token_table._id2sym
        self._sym2id = token_table._sym2id
        self.oov = oov
        self.oov_id = self._sym2id[oov]
        self.sep = sep
        if self.sep:
            self.text2word = lambda x: x.split(self.sep)
        else:
            self.text2word = lambda x: list(x.replace(" ", ""))

    def piece_to_id(self, piece: str) -> int:
        """Return the id of ``piece``, or the OOV id if it is unknown."""
        try:
            return self._sym2id[piece]
        except KeyError:
            return self.oov_id

    def id_to_piece(self, id: int) -> str:
        """Return the token stored for ``id``."""
        return self._id2sym[id]

    def get_piece_size(self) -> int:
        """Return the number of tokens in the table."""
        return len(self._sym2id)

    # Rebind the CamelCase names as well; otherwise they would still resolve
    # to the base-class placeholders that raise NotImplementedError.
    PieceToId = piece_to_id
    IdToPiece = id_to_piece
    GetPieceSize = get_piece_size

    def EncodeAsIdsBatch(self, input: List[str]) -> List[List[int]]:
        """Encode each text as the ids of its units."""
        return [[self.piece_to_id(i) for i in self.text2word(text)] for text in input]

    def EncodeAsPiecesBatch(self, input: List[str]) -> List[List[str]]:
        """Encode each text as its units, replacing unknown ones with OOV."""
        return [
            [i if i in self._sym2id else self.oov for i in self.text2word(text)]
            for text in input
        ]

    def DecodeIdsBatch(self, input: List[List[int]]) -> List[str]:
        """Join the tokens of each id sequence with ``sep``."""
        return [self.sep.join(self.id_to_piece(i) for i in text) for text in input]

    def DecodePiecesBatch(self, input: List[List[str]]) -> List[str]:
        """Join each token sequence with ``sep``."""
        return [self.sep.join(text) for text in input]

    def SplitBatch(self, input: List[str]) -> List[List[str]]:
        """Split each text into units without mapping them to the table."""
        return [self.text2word(text) for text in input]
 def test_CharTokenizer():
    """Print encode, decode and split results for a few sample inputs.

    Loads the "char" tokenizer from the relative directory ``lang_char`` and
    runs it on a single string, a list of strings and the empty string.
    Nothing is asserted; the output is meant for visual inspection.
    """
    single = "こんにちは"
    batch = [
        "今日はいい天気ですよね",
        "諏訪湖は綺麗でしょう",
        "这在词表外",
        "分かち 書き に し た 文章 です",
        "",
    ]
    empty = ""
    samples = (single, batch, empty)
    out_types = (str, int)

    sp = Tokenizer.load(Path("lang_char"), "char", oov="<unk>")
    splitter = sp.split

    # Encoding, as tokens and then as ids, for every sample.
    for sample in samples:
        for out_type in out_types:
            print(sp.encode(sample, out_type=out_type))

    # Round trip: decode what was just encoded.
    for sample in samples:
        for out_type in out_types:
            print(sp.decode(sp.encode(sample, out_type=out_type)))

    # Plain splitting without vocabulary lookup.
    for sample in samples:
        print(splitter(sample))
 # Running this file directly executes the CharTokenizer smoke test.
 if __name__ == "__main__":
    test_CharTokenizer()
--- a/egs/multi_ja_en/ASR/local/validate_bpe_lexicon.py
+++ b/egs/multi_ja_en/ASR/local/validate_bpe_lexicon.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/local/validate_bpe_lexicon.py
--- a/egs/multi_ja_en/ASR/local/validate_manifest.py
+++ b/egs/multi_ja_en/ASR/local/validate_manifest.py
@ -0,0 +1,96 @@
 #!/usr/bin/env python3
 # Copyright    2022  Xiaomi Corp.        (authors: Fangjun Kuang)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
 This script checks the following assumptions of the generated manifest:
 - Single supervision per cut
 - Supervision time bounds are within cut time bounds
 We will add more checks later if needed.
 Usage example:
    python3 ./local/validate_manifest.py \
            --manifest ./data/fbank/librispeech_cuts_train-clean-100.jsonl.gz
 """
 import argparse
 import logging
 from pathlib import Path
 from lhotse import CutSet, load_manifest
 from lhotse.cut import Cut
 def get_args():
    """Parse the command-line options of the manifest validation script.

    ``--manifest`` is mandatory, so a missing path is reported by argparse
    rather than failing later on a ``None`` value.

    Returns:
      An ``argparse.Namespace`` whose ``manifest`` attribute is a ``Path``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--manifest",
        type=Path,
        required=True,
        help="Path to the manifest file",
    )
    return parser.parse_args()
 def validate_one_supervision_per_cut(c: Cut):
    """Check that the cut carries exactly one supervision.

    Raises:
      ValueError: if ``c.supervisions`` does not have exactly one element.
    """
    num_supervisions = len(c.supervisions)
    if num_supervisions == 1:
        return
    raise ValueError(f"{c.id} has {num_supervisions} supervisions")
 def validate_supervision_and_cut_time_bounds(c: Cut):
    """Check that the first supervision does not end after the cut ends.

    Only the end time is checked.  The start time is deliberately not
    compared: when cuts are trimmed from supervisions, the supervision start
    can be smaller than the cut start, see
    https://github.com/lhotse-speech/lhotse/issues/813

    Raises:
      ValueError: if the supervision's ``end`` is greater than ``c.end``.
    """
    supervision = c.supervisions[0]
    if supervision.end > c.end:
        message = (
            f"{c.id}: Supervision end time {supervision.end} is larger "
            f"than cut end time {c.end}"
        )
        raise ValueError(message)
 def main():
    """Load the manifest given on the command line and validate every cut.

    Each cut is passed to the single-supervision check and then to the
    time-bounds check; the first failing check raises ``ValueError``.
    """
    args = get_args()

    manifest = Path(args.manifest)
    logging.info(f"Validating {manifest}")
    assert manifest.is_file(), f"{manifest} does not exist"

    cut_set = load_manifest(manifest)
    assert isinstance(cut_set, CutSet)

    # Checks are applied to each cut in this order.
    checks = (
        validate_one_supervision_per_cut,
        validate_supervision_and_cut_time_bounds,
    )
    for c in cut_set:
        for check in checks:
            check(c)
 # Script entry point: configure INFO-level logging with timestamp and
 # file:line prefixes, then run the validation.
 if __name__ == "__main__":
    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    logging.basicConfig(format=formatter, level=logging.INFO)
    main()
--- a/egs/multi_ja_en/ASR/prepare.sh
+++ b/egs/multi_ja_en/ASR/prepare.sh
@ -0,0 +1,185 @@
 #!/usr/bin/env bash
 # fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
 export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
 # Exit on the first failing command (-e), on use of an unset variable (-u)
 # and when any command of a pipeline fails (-o pipefail).
 set -eou pipefail
 # Defaults: run every stage from the first to the last one.
 stage=-1
 stop_stage=100
 dl_dir=$PWD/download
 # Sourced helper; presumably it lets options such as --stage and
 # --stop-stage override the variables above -- see shared/parse_options.sh.
 . shared/parse_options.sh || exit 1
 # Vocabulary sizes for which a byte-level BPE lang directory is built.
 vocab_sizes=(
  2000
 )
 # All files generated by this script are saved in "data".
 # You can safely remove "data" and rerun this script to regenerate it.
 mkdir -p data
 # Print a message prefixed with the current date and time and with the
 # caller's location in the form (file:line:function).
 # Arguments: the words of the message; they are joined into one line.
 log() {
  # This function is from espnet
  # Base name of the file the caller lives in (directory part stripped).
  local fname=${BASH_SOURCE[1]##*/}
  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
 }
 log "dl_dir: $dl_dir"
 log "Dataset: musan"
 # Stage 1: reuse the MUSAN features of the LibriSpeech recipe by linking
 # them into data/fbank instead of computing them again.
 if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
  log "Stage 1: Soft link fbank of musan"
  mkdir -p data/fbank
  # The marker file is checked to decide whether the LibriSpeech recipe has
  # already produced the MUSAN features.
  if [ -e ../../librispeech/ASR/data/fbank/.musan.done ]; then
    # The relative paths below are resolved from inside data/fbank.
    cd data/fbank
    ln -svf $(realpath ../../../../librispeech/ASR/data/fbank/musan_feats) .
    ln -svf $(realpath ../../../../librispeech/ASR/data/fbank/musan_cuts.jsonl.gz) .
    cd ../..
  else
    log "Abort! Please run ../../librispeech/ASR/prepare.sh --stage 4 --stop-stage 4"
    exit 1
  fi
 fi
 log "Dataset: LibriSpeech"
 # Stage 2: reuse the LibriSpeech cuts and features of the LibriSpeech recipe
 # by linking them into data/fbank.
 if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
  # The label matches the stage number tested by the guard above, so the log
  # tells the user which --stage value reruns this step.
  log "Stage 2: Soft link fbank of LibriSpeech"
  mkdir -p data/fbank
  # The marker file is checked to decide whether the LibriSpeech recipe has
  # already produced its features.
  if [ -e ../../librispeech/ASR/data/fbank/.librispeech.done ]; then
    # The relative paths below are resolved from inside data/fbank.
    cd data/fbank
    ln -svf $(realpath ../../../../librispeech/ASR/data/fbank/librispeech_cuts*) .
    ln -svf $(realpath ../../../../librispeech/ASR/data/fbank/librispeech_feats*) .
    cd ../..
  else
    log "Abort! Please run ../../librispeech/ASR/prepare.sh --stage 1 --stop-stage 1 and ../../librispeech/ASR/prepare.sh --stage 3 --stop-stage 3"
    exit 1
  fi
 fi
 log "Dataset: ReazonSpeech"
 # Stage 3 (first half): reuse the ReazonSpeech cuts and features of the
 # ReazonSpeech recipe.  Cuts are linked into data/fbank, feature directories
 # into data/manifests.
 if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
  # The label matches the stage number tested by the guard above, so the log
  # tells the user which --stage value reruns this step.
  log "Stage 3: Soft link fbank of ReazonSpeech"
  mkdir -p data/fbank
  # The marker file is checked to decide whether the ReazonSpeech recipe has
  # already produced its manifests.
  if [ -e ../../reazonspeech/ASR/data/manifests/.reazonspeech.done ]; then
    # The relative paths below are resolved from inside data/fbank and then
    # from inside data/manifests, both two levels below the recipe root.
    cd data/fbank
    ln -svf $(realpath ../../../../reazonspeech/ASR/data/manifests/reazonspeech_cuts*) .
    cd ..
    mkdir -p manifests
    cd manifests
    ln -svf $(realpath ../../../../reazonspeech/ASR/data/manifests/feats_*) .
    cd ../..
  else
    log "Abort! Please run ../../reazonspeech/ASR/prepare.sh --stage 0 --stop-stage 2"
    exit 1
  fi
 fi
 # New Stage 3: Prepare char based lang for ReazonSpeech
 # Every output below is written to data/lang_char.  Steps guarded by a file
 # test are skipped when their output already exists; words_no_ids.txt is
 # regenerated on every run.
 if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
  lang_char_dir=data/lang_char
  log "Stage 3: Prepare char based lang for ReazonSpeech"
  mkdir -p $lang_char_dir
  # Prepare text
  # Extract the "text" field of every training supervision, strip all double
  # quotes and pass the result through text2token.py in "char" mode.
  if [ ! -f $lang_char_dir/text ]; then
    gunzip -c ../../reazonspeech/ASR/data/manifests/reazonspeech_supervisions_train.jsonl.gz \
      | jq '.text' | sed 's/"//g' \
      | ./local/text2token.py -t "char" > $lang_char_dir/text
  fi
  # jp word segmentation for text
  if [ ! -f $lang_char_dir/text_words_segmentation ]; then
    python3 ./local/text2segments.py \
      --input-file $lang_char_dir/text \
      --output-file $lang_char_dir/text_words_segmentation
  fi
  # One word per line, sorted, de-duplicated, empty lines removed.
  cat $lang_char_dir/text_words_segmentation | sed 's/ /\n/g' \
    | sort -u | sed '/^$/d' | uniq > $lang_char_dir/words_no_ids.txt
  if [ ! -f $lang_char_dir/words.txt ]; then
    python3 ./local/prepare_words.py \
      --input-file $lang_char_dir/words_no_ids.txt \
      --output-file $lang_char_dir/words.txt
  fi
  # L_disambig.pt is used as the marker that prepare_char.py has already run.
  if [ ! -f $lang_char_dir/L_disambig.pt ]; then
    python3 ./local/prepare_char.py --lang-dir data/lang_char
  fi
 fi
 # Stage 4: build one byte-level BPE lang directory per entry of vocab_sizes
 # from the Japanese char-level text and the LibriSpeech word transcript.
 if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
  log "Stage 4: Prepare Byte BPE based lang"
  mkdir -p data/fbank
  # Both source lang directories must exist, either in their own recipe or
  # already under ./data.
  if [ ! -d ../../reazonspeech/ASR/data/lang_char ] && [ ! -d ./data/lang_char ]; then
    log "Abort! Please run ../../reazonspeech/ASR/prepare.sh --stage 3 --stop-stage 3"
    exit 1
  fi
  if [ ! -d ../../librispeech/ASR/data/lang_bpe_500 ] && [ ! -d ./data/lang_bpe_500 ]; then
    log "Abort! Please run ../../librispeech/ASR/prepare.sh --stage 5 --stop-stage 5"
    exit 1
  fi
  cd data/
  # if [ ! -d ./lang_char ]; then
  #   ln -svf $(realpath ../../../reazonspeech/ASR/data/lang_char) .
  # fi
  if [ ! -d ./lang_bpe_500 ]; then
    ln -svf $(realpath ../../../librispeech/ASR/data/lang_bpe_500) .
  fi
  cd ../
  # Quoted so that every array element stays a single word.
  for vocab_size in "${vocab_sizes[@]}"; do
    lang_dir=data/lang_bbpe_${vocab_size}
    mkdir -p $lang_dir
    # Training text: Japanese char-level text followed by the English words.
    cat data/lang_char/text data/lang_bpe_500/transcript_words.txt \
      > $lang_dir/text
    if [ ! -f $lang_dir/transcript_chars.txt ]; then
      ./local/prepare_for_bpe_model.py \
        --lang-dir ./$lang_dir \
        --text $lang_dir/text
    fi
    if [ ! -f $lang_dir/text_words_segmentation ]; then
      python3 ./local/text2segments.py \
        --input-file ./data/lang_char/text \
        --output-file $lang_dir/text_words_segmentation
      cat ./data/lang_bpe_500/transcript_words.txt \
        >> $lang_dir/text_words_segmentation
    fi
    # One word per line, sorted, de-duplicated, empty lines removed.
    cat $lang_dir/text_words_segmentation | sed 's/ /\n/g' \
      | sort -u | sed '/^$/d' | uniq > $lang_dir/words_no_ids.txt
    if [ ! -f $lang_dir/words.txt ]; then
      python3 ./local/prepare_words.py \
        --input-file $lang_dir/words_no_ids.txt \
        --output-file $lang_dir/words.txt
    fi
    if [ ! -f $lang_dir/bbpe.model ]; then
      ./local/train_bbpe_model.py \
        --lang-dir $lang_dir \
        --vocab-size $vocab_size \
        --transcript $lang_dir/text
    fi
    if [ ! -f $lang_dir/L_disambig.pt ]; then
      ./local/prepare_lang_bbpe.py --lang-dir $lang_dir
      log "Validating $lang_dir/lexicon.txt"
      # Create the link only when the script is missing, so that an existing
      # local/validate_bpe_lexicon.py is not replaced on every run.
      if [ ! -e local/validate_bpe_lexicon.py ]; then
        ln -svf $(realpath ../../multi_zh_en/ASR/local/validate_bpe_lexicon.py) local/
      fi
      ./local/validate_bpe_lexicon.py \
        --lexicon $lang_dir/lexicon.txt \
        --bpe-model $lang_dir/bbpe.model
    fi
  done
 fi
 log "prepare.sh: PREPARATION DONE"
--- a/egs/multi_ja_en/ASR/shared
+++ b/egs/multi_ja_en/ASR/shared
@ -0,0 +1 @@
 ../../../icefall/shared/
--- a/egs/multi_ja_en/ASR/zipformer/asr_datamodule.py
+++ b/egs/multi_ja_en/ASR/zipformer/asr_datamodule.py
@ -0,0 +1 @@
 ../local/utils/asr_datamodule.py
--- a/egs/multi_ja_en/ASR/zipformer/beam_search.py
+++ b/egs/multi_ja_en/ASR/zipformer/beam_search.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/zipformer/beam_search.py
--- a/egs/multi_ja_en/ASR/zipformer/ctc_decode.py
+++ b/egs/multi_ja_en/ASR/zipformer/ctc_decode.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/zipformer/ctc_decode.py
--- a/egs/multi_ja_en/ASR/zipformer/decode.py
+++ b/egs/multi_ja_en/ASR/zipformer/decode.py
@ -0,0 +1,792 @@
 #!/usr/bin/env python3
 #
 # Copyright 2021-2023 Xiaomi Corporation (Author: Fangjun Kuang,
 #                                                 Zengwei Yao)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
 Usage:
 (1) greedy search
 ./zipformer/decode.py \
    --epoch 28 \
    --avg 15 \
    --exp-dir ./zipformer/exp \
    --max-duration 600 \
    --decoding-method greedy_search
 (2) beam search (not recommended)
 ./zipformer/decode.py \
    --epoch 28 \
    --avg 15 \
    --exp-dir ./zipformer/exp \
    --max-duration 600 \
    --decoding-method beam_search \
    --beam-size 4
 (3) modified beam search
 ./zipformer/decode.py \
    --epoch 28 \
    --avg 15 \
    --exp-dir ./zipformer/exp \
    --max-duration 600 \
    --decoding-method modified_beam_search \
    --beam-size 4
 (4) fast beam search (one best)
 ./zipformer/decode.py \
    --epoch 28 \
    --avg 15 \
    --exp-dir ./zipformer/exp \
    --max-duration 600 \
    --decoding-method fast_beam_search \
    --beam 20.0 \
    --max-contexts 8 \
    --max-states 64
 """
 import argparse
 import logging
 import math
 import re
 from collections import defaultdict
 from pathlib import Path
 from typing import Dict, List, Optional, Tuple
 import k2
 import sentencepiece as spm
 import torch
 import torch.nn as nn
 from asr_datamodule import ReazonSpeechAsrDataModule
 from beam_search import (
    beam_search,
    fast_beam_search_nbest,
    fast_beam_search_nbest_LG,
    fast_beam_search_nbest_oracle,
    fast_beam_search_one_best,
    greedy_search,
    greedy_search_batch,
    modified_beam_search,
 )
 from lhotse.cut import Cut
 from multi_dataset import MultiDataset
 from train import add_model_arguments, get_model, get_params
 from icefall import byte_encode, smart_byte_decode
 from icefall.checkpoint import (
    average_checkpoints,
    average_checkpoints_with_averaged_model,
    find_checkpoints,
    load_checkpoint,
 )
 from icefall.lexicon import Lexicon
 from icefall.utils import (
    AttributeDict,
    setup_logger,
    store_transcripts,
    str2bool,
    tokenize_by_ja_char,
    write_error_stats,
 )
 LOG_EPS = math.log(1e-10)
 def get_parser():
    """Create the argument parser holding the decoding options of this script.

    The options cover checkpoint selection/averaging, the byte-level BPE
    model and lang dir, the decoding method and the search hyper-parameters.
    Model architecture options are appended by `add_model_arguments`.

    Returns:
      The configured `argparse.ArgumentParser`.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    add = parser.add_argument

    # Which checkpoint(s) to load and how to average them.
    add(
        "--epoch",
        type=int,
        default=30,
        help="""It specifies the checkpoint to use for decoding.
        Note: Epoch counts from 1.
        You can specify --avg to use more checkpoints for model averaging.""",
    )
    add(
        "--iter",
        type=int,
        default=0,
        help="""If positive, --epoch is ignored and it
        will use the checkpoint exp_dir/checkpoint-iter.pt.
        You can specify --avg to use more checkpoints for model averaging.
        """,
    )
    add(
        "--avg",
        type=int,
        default=15,
        help="Number of checkpoints to average. Automatically select "
        "consecutive checkpoints before the checkpoint specified by "
        "'--epoch' and '--iter'",
    )
    add(
        "--use-averaged-model",
        type=str2bool,
        default=True,
        help="Whether to load averaged model. Currently it only supports "
        "using --epoch. If True, it would decode with the averaged model "
        "over the epoch range from `epoch-avg` (excluded) to `epoch`."
        "Actually only the models with epoch number of `epoch-avg` and "
        "`epoch` are loaded for averaging. ",
    )

    # Where the experiment, the BPE model and the lang dir live.
    add(
        "--exp-dir",
        type=str,
        default="zipformer/exp",
        help="The experiment dir",
    )
    add(
        "--bpe-model",
        type=str,
        default="data/lang_bbpe_2000/bbpe.model",
        help="Path to the BPE model",
    )
    add(
        "--lang-dir",
        type=Path,
        default="data/lang_bbpe_2000",
        help="The lang dir containing word table and LG graph",
    )

    # Decoding method and its search hyper-parameters.
    add(
        "--decoding-method",
        type=str,
        default="greedy_search",
        help="""Possible values are:
          - greedy_search
          - beam_search
          - modified_beam_search
          - fast_beam_search
          - fast_beam_search_nbest
          - fast_beam_search_nbest_oracle
          - fast_beam_search_nbest_LG
        If you use fast_beam_search_nbest_LG, you have to specify
        `--lang-dir`, which should contain `LG.pt`.
        """,
    )
    add(
        "--beam-size",
        type=int,
        default=4,
        help="""An integer indicating how many candidates we will keep for each
        frame. Used only when --decoding-method is beam_search or
        modified_beam_search.""",
    )
    add(
        "--beam",
        type=float,
        default=20.0,
        help="""A floating point value to calculate the cutoff score during beam
        search (i.e., `cutoff = max-score - beam`), which is the same as the
        `beam` in Kaldi.
        Used only when --decoding-method is fast_beam_search,
        fast_beam_search_nbest, fast_beam_search_nbest_LG,
        and fast_beam_search_nbest_oracle
        """,
    )
    add(
        "--ngram-lm-scale",
        type=float,
        default=0.01,
        help="""
        Used only when --decoding_method is fast_beam_search_nbest_LG.
        It specifies the scale for n-gram LM scores.
        """,
    )
    add(
        "--max-contexts",
        type=int,
        default=8,
        help="""Used only when --decoding-method is
        fast_beam_search, fast_beam_search_nbest, fast_beam_search_nbest_LG,
        and fast_beam_search_nbest_oracle""",
    )
    add(
        "--max-states",
        type=int,
        default=64,
        help="""Used only when --decoding-method is
        fast_beam_search, fast_beam_search_nbest, fast_beam_search_nbest_LG,
        and fast_beam_search_nbest_oracle""",
    )
    add(
        "--context-size",
        type=int,
        default=2,
        help="The context size in the decoder. 1 means bigram; 2 means tri-gram",
    )
    add(
        "--max-sym-per-frame",
        type=int,
        default=1,
        help="""Maximum number of symbols per frame.
        Used only when --decoding_method is greedy_search""",
    )
    add(
        "--num-paths",
        type=int,
        default=200,
        help="""Number of paths for nbest decoding.
        Used only when the decoding method is fast_beam_search_nbest,
        fast_beam_search_nbest_LG, and fast_beam_search_nbest_oracle""",
    )
    add(
        "--nbest-scale",
        type=float,
        default=0.5,
        help="""Scale applied to lattice scores when computing nbest paths.
        Used only when the decoding method is fast_beam_search_nbest,
        fast_beam_search_nbest_LG, and fast_beam_search_nbest_oracle""",
    )

    # Model architecture options are shared with the training script.
    add_model_arguments(parser)
    return parser
 def decode_one_batch(
    params: AttributeDict,
    model: nn.Module,
    sp: spm.SentencePieceProcessor,
    batch: dict,
    word_table: Optional[k2.SymbolTable] = None,
    decoding_graph: Optional[k2.Fsa] = None,
 ) -> Dict[str, List[List[str]]]:
    """Decode one batch and return the result in a dict. The dict has the
    following format:
        - key: It indicates the setting used for decoding.
               - "greedy_search" when greedy search is used;
               - for the fast_beam_search* methods, a string built from
                 `beam`, `max_contexts` and `max_states` (plus `num_paths`,
                 `nbest_scale` and, for LG, `ngram_lm_scale`);
               - otherwise "beam_size_<N>", e.g. "beam_size_4".
        - value: It contains the decoding result. `len(value)` equals to
                 batch size. `value[i]` is the decoding result (a list of
                 whitespace-separated tokens) for the i-th utterance in the
                 given batch.
    Args:
      params:
        It's the return value of :func:`get_params`.
      model:
        The neural model.
      sp:
        The BPE model.
      batch:
        It is the return value from iterating
        `lhotse.dataset.K2SpeechRecognitionDataset`. See its documentation
        for the format of the `batch`. This function reads `batch["inputs"]`
        and `batch["supervisions"]` (`num_frames`, and `text` for the oracle
        method); the batch is not modified.
      word_table:
        The word symbol table. Used only by fast_beam_search_nbest_LG.
      decoding_graph:
        The decoding graph. Can be either a `k2.trivial_graph` or HLG, Used
        only when --decoding_method is fast_beam_search, fast_beam_search_nbest,
        fast_beam_search_nbest_oracle, and fast_beam_search_nbest_LG.
    Returns:
      Return the decoding result. See above description for the format of
      the returned dict.
    Raises:
      ValueError:
        If `params.decoding_method` is not one of the supported methods.
    """
    device = next(model.parameters()).device
    feature = batch["inputs"]
    assert feature.ndim == 3
    feature = feature.to(device)
    # at entry, feature is (N, T, C)
    supervisions = batch["supervisions"]
    feature_lens = supervisions["num_frames"].to(device)
    if params.causal:
        # this seems to cause insertions at the end of the utterance if used with zipformer.
        pad_len = 30
        # Out-of-place add: `.to(device)` may return the very same tensor
        # (e.g. when decoding on CPU), and an in-place `+=` would then
        # silently change `batch["supervisions"]["num_frames"]`.
        feature_lens = feature_lens + pad_len
        feature = torch.nn.functional.pad(
            feature,
            pad=(0, 0, 0, pad_len),
            value=LOG_EPS,
        )
    encoder_out, encoder_out_lens = model.forward_encoder(feature, feature_lens)

    def bbpe_ids_to_words(token_ids) -> List[List[str]]:
        # The BPE model was trained on byte-encoded text, so the decoded
        # pieces must be mapped back with smart_byte_decode before splitting.
        return [smart_byte_decode(text).split() for text in sp.decode(token_ids)]

    hyps = []
    if params.decoding_method == "fast_beam_search":
        hyp_tokens = fast_beam_search_one_best(
            model=model,
            decoding_graph=decoding_graph,
            encoder_out=encoder_out,
            encoder_out_lens=encoder_out_lens,
            beam=params.beam,
            max_contexts=params.max_contexts,
            max_states=params.max_states,
        )
        hyps.extend(bbpe_ids_to_words(hyp_tokens))
    elif params.decoding_method == "fast_beam_search_nbest_LG":
        hyp_tokens = fast_beam_search_nbest_LG(
            model=model,
            decoding_graph=decoding_graph,
            encoder_out=encoder_out,
            encoder_out_lens=encoder_out_lens,
            beam=params.beam,
            max_contexts=params.max_contexts,
            max_states=params.max_states,
            num_paths=params.num_paths,
            nbest_scale=params.nbest_scale,
        )
        # With an LG graph the search returns word ids, not BPE ids.
        for hyp in hyp_tokens:
            hyps.append([word_table[i] for i in hyp])
    elif params.decoding_method == "fast_beam_search_nbest":
        hyp_tokens = fast_beam_search_nbest(
            model=model,
            decoding_graph=decoding_graph,
            encoder_out=encoder_out,
            encoder_out_lens=encoder_out_lens,
            beam=params.beam,
            max_contexts=params.max_contexts,
            max_states=params.max_states,
            num_paths=params.num_paths,
            nbest_scale=params.nbest_scale,
        )
        hyps.extend(bbpe_ids_to_words(hyp_tokens))
    elif params.decoding_method == "fast_beam_search_nbest_oracle":
        # `supervisions["text"]` holds one transcript per utterance, while
        # tokenize_by_ja_char / byte_encode work on a single string (as in
        # decode_dataset), so the references are prepared one by one.
        ref_texts = [
            byte_encode(tokenize_by_ja_char(str(text)))
            for text in supervisions["text"]
        ]
        hyp_tokens = fast_beam_search_nbest_oracle(
            model=model,
            decoding_graph=decoding_graph,
            encoder_out=encoder_out,
            encoder_out_lens=encoder_out_lens,
            beam=params.beam,
            max_contexts=params.max_contexts,
            max_states=params.max_states,
            num_paths=params.num_paths,
            ref_texts=sp.encode(ref_texts),
            nbest_scale=params.nbest_scale,
        )
        hyps.extend(bbpe_ids_to_words(hyp_tokens))
    elif params.decoding_method == "greedy_search" and params.max_sym_per_frame == 1:
        hyp_tokens = greedy_search_batch(
            model=model,
            encoder_out=encoder_out,
            encoder_out_lens=encoder_out_lens,
        )
        hyps.extend(bbpe_ids_to_words(hyp_tokens))
    elif params.decoding_method == "modified_beam_search":
        hyp_tokens = modified_beam_search(
            model=model,
            encoder_out=encoder_out,
            encoder_out_lens=encoder_out_lens,
            beam=params.beam_size,
        )
        hyps.extend(bbpe_ids_to_words(hyp_tokens))
    else:
        # Remaining methods are decoded one utterance at a time.
        batch_size = encoder_out.size(0)
        for i in range(batch_size):
            # fmt: off
            encoder_out_i = encoder_out[i:i+1, :encoder_out_lens[i]]
            # fmt: on
            if params.decoding_method == "greedy_search":
                hyp = greedy_search(
                    model=model,
                    encoder_out=encoder_out_i,
                    max_sym_per_frame=params.max_sym_per_frame,
                )
            elif params.decoding_method == "beam_search":
                hyp = beam_search(
                    model=model,
                    encoder_out=encoder_out_i,
                    beam=params.beam_size,
                )
            else:
                raise ValueError(
                    f"Unsupported decoding method: {params.decoding_method}"
                )
            hyps.append(smart_byte_decode(sp.decode(hyp)).split())

    # Build the key that names this decoding configuration.
    if params.decoding_method == "greedy_search":
        return {"greedy_search": hyps}
    elif "fast_beam_search" in params.decoding_method:
        key = f"beam_{params.beam}_"
        key += f"max_contexts_{params.max_contexts}_"
        key += f"max_states_{params.max_states}"
        if "nbest" in params.decoding_method:
            key += f"_num_paths_{params.num_paths}_"
            key += f"nbest_scale_{params.nbest_scale}"
            if "LG" in params.decoding_method:
                key += f"_ngram_lm_scale_{params.ngram_lm_scale}"
        return {key: hyps}
    else:
        return {f"beam_size_{params.beam_size}": hyps}
 def decode_dataset(
    dl: torch.utils.data.DataLoader,
    params: AttributeDict,
    model: nn.Module,
    sp: spm.SentencePieceProcessor,
    word_table: Optional[k2.SymbolTable] = None,
    decoding_graph: Optional[k2.Fsa] = None,
 ) -> Dict[str, List[Tuple[str, List[str], List[str]]]]:
    """Decode every batch of a dataloader and collect the results.

    Args:
      dl:
        PyTorch's dataloader containing the dataset to decode.
      params:
        It is returned by :func:`get_params`.
      model:
        The neural model.
      sp:
        The BPE model.
      word_table:
        The word symbol table.
      decoding_graph:
        The decoding graph. Can be either a `k2.trivial_graph` or HLG, Used
        only when --decoding_method is fast_beam_search, fast_beam_search_nbest,
        fast_beam_search_nbest_oracle, and fast_beam_search_nbest_LG.
    Returns:
      A dict keyed by the decoding-setting name produced by
      :func:`decode_one_batch` (e.g. "greedy_search"). Each value is a list
      of 3-tuples `(cut_id, reference_tokens, hypothesis_tokens)`, one per
      decoded cut, where the reference has been split with
      `tokenize_by_ja_char`.
    """
    # Some dataloaders have no length; fall back to a placeholder for logs.
    try:
        total_batches = len(dl)
    except TypeError:
        total_batches = "?"

    report_every = 50 if params.decoding_method == "greedy_search" else 20

    collected = defaultdict(list)
    cuts_seen = 0
    for batch_idx, batch in enumerate(dl):
        refs = [
            tokenize_by_ja_char(str(text)).split()
            for text in batch["supervisions"]["text"]
        ]
        cut_ids = [cut.id for cut in batch["supervisions"]["cut"]]

        hyps_dict = decode_one_batch(
            params=params,
            model=model,
            sp=sp,
            decoding_graph=decoding_graph,
            word_table=word_table,
            batch=batch,
        )

        for name, hyps in hyps_dict.items():
            assert len(hyps) == len(refs)
            collected[name].extend(
                (cut_id, ref, hyp) for cut_id, hyp, ref in zip(cut_ids, hyps, refs)
            )

        cuts_seen += len(refs)
        if batch_idx % report_every == 0:
            logging.info(
                f"batch {batch_idx}/{total_batches}, "
                f"cuts processed until now is {cuts_seen}"
            )
    return collected
 def save_results(
    params: AttributeDict,
    test_set_name: str,
    results_dict: Dict[str, List[Tuple[str, List[str], List[str]]]],
 ):
    """Write transcripts, error statistics and a WER summary for one test set.

    For every decoding setting in `results_dict` this writes
    `recogs-*.txt` (sorted transcripts) and `errs-*.txt` (detailed error
    statistics) into `params.res_dir`, then writes one `wer-summary-*.txt`
    listing all settings sorted by error rate and logs the same summary.

    Args:
      params:
        Decoding parameters; `res_dir` and `suffix` are read.
      test_set_name:
        Name of the test set, used in file names and log messages.
      results_dict:
        Maps a decoding-setting name to its list of
        `(cut_id, reference, hypothesis)` tuples.
    """
    wer_by_setting = {}
    for key, results in results_dict.items():
        tag = f"{test_set_name}-{key}"

        recog_path = params.res_dir / f"recogs-{tag}-{params.suffix}.txt"
        ordered = sorted(results)
        store_transcripts(filename=recog_path, texts=ordered)
        logging.info(f"The transcripts are stored in {recog_path}")

        # The following prints out WERs, per-word error statistics and aligned
        # ref/hyp pairs.
        errs_filename = params.res_dir / f"errs-{tag}-{params.suffix}.txt"
        with open(errs_filename, "w") as f:
            wer_by_setting[key] = write_error_stats(
                f, tag, ordered, enable_log=True
            )
        logging.info(f"Wrote detailed error stats to {errs_filename}")

    ranked = sorted(wer_by_setting.items(), key=lambda item: item[1])

    # NOTE: `key` is whatever setting the loop above visited last, so the
    # summary file name carries that setting's name.
    errs_info = (
        params.res_dir / f"wer-summary-{test_set_name}-{key}-{params.suffix}.txt"
    )
    with open(errs_info, "w") as f:
        print("settings\tWER", file=f)
        for setting, wer in ranked:
            print(f"{setting}\t{wer}", file=f)

    # Only the first (lowest-WER) line is tagged as the best one.
    summary = f"\nFor {test_set_name}, WER of different settings are:\n"
    for rank, (setting, wer) in enumerate(ranked):
        note = f"\tbest for {test_set_name}" if rank == 0 else ""
        summary += f"{setting}\t{wer}{note}\n"
    logging.info(summary)
@torch.no_grad()
 def main():
    """Entry point: load the model and decode all test sets.

    Steps, in order:
      1. Parse command-line options and merge them into `get_params()`.
      2. Build `params.suffix`, a string describing the decoding
         configuration that is used in log and result file names.
      3. Load the BPE model and create the network.
      4. Load one checkpoint or an average of several, depending on
         `--iter`, `--epoch`, `--avg` and `--use-averaged-model`.
      5. For fast_beam_search* methods, prepare the decoding graph.
      6. Decode every test set returned by `MultiDataset.test_cuts()` and
         save the results under `<exp-dir>/<decoding-method>`.
    """
    parser = get_parser()
    ReazonSpeechAsrDataModule.add_arguments(parser)
    args = parser.parse_args()
    args.exp_dir = Path(args.exp_dir)
    params = get_params()
    # Command-line values override the defaults from get_params().
    params.update(vars(args))
    assert params.decoding_method in (
        "greedy_search",
        "beam_search",
        "fast_beam_search",
        "fast_beam_search_nbest",
        "fast_beam_search_nbest_LG",
        "fast_beam_search_nbest_oracle",
        "modified_beam_search",
    )
    params.res_dir = params.exp_dir / params.decoding_method
    # Build the suffix piece by piece; the order of the pieces determines
    # the resulting file names.
    if params.iter > 0:
        params.suffix = f"iter-{params.iter}-avg-{params.avg}"
    else:
        params.suffix = f"epoch-{params.epoch}-avg-{params.avg}"
    if params.causal:
        # chunk_size / left_context_frames are checked as strings here: a
        # comma would mean several values were given.
        assert (
            "," not in params.chunk_size
        ), "chunk_size should be one value in decoding."
        assert (
            "," not in params.left_context_frames
        ), "left_context_frames should be one value in decoding."
        params.suffix += f"-chunk-{params.chunk_size}"
        params.suffix += f"-left-context-{params.left_context_frames}"
    if "fast_beam_search" in params.decoding_method:
        params.suffix += f"-beam-{params.beam}"
        params.suffix += f"-max-contexts-{params.max_contexts}"
        params.suffix += f"-max-states-{params.max_states}"
        if "nbest" in params.decoding_method:
            params.suffix += f"-nbest-scale-{params.nbest_scale}"
            params.suffix += f"-num-paths-{params.num_paths}"
            if "LG" in params.decoding_method:
                params.suffix += f"-ngram-lm-scale-{params.ngram_lm_scale}"
    elif "beam_search" in params.decoding_method:
        # beam_search and modified_beam_search
        params.suffix += f"-{params.decoding_method}-beam-size-{params.beam_size}"
    else:
        # greedy_search
        params.suffix += f"-context-{params.context_size}"
        params.suffix += f"-max-sym-per-frame-{params.max_sym_per_frame}"
    if params.use_averaged_model:
        params.suffix += "-use-averaged-model"
    setup_logger(f"{params.res_dir}/log-decode-{params.suffix}")
    logging.info("Decoding started")
    # Use the first CUDA device when available, otherwise the CPU.
    device = torch.device("cpu")
    if torch.cuda.is_available():
        device = torch.device("cuda", 0)
    logging.info(f"Device: {device}")
    sp = spm.SentencePieceProcessor()
    sp.load(params.bpe_model)
    # <blk> and <unk> are defined in local/train_bpe_model.py
    params.blank_id = sp.piece_to_id("<blk>")
    params.unk_id = sp.piece_to_id("<unk>")
    params.vocab_size = sp.get_piece_size()
    logging.info(params)
    logging.info("About to create model")
    model = get_model(params)
    if not params.use_averaged_model:
        # Plain averaging of the parameters stored in several checkpoints.
        if params.iter > 0:
            # NOTE(review): presumably a negative `iteration` makes
            # find_checkpoints return checkpoints up to that iteration,
            # newest first -- confirm against icefall.checkpoint.
            filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
                : params.avg
            ]
            if len(filenames) == 0:
                raise ValueError(
                    f"No checkpoints found for"
                    f" --iter {params.iter}, --avg {params.avg}"
                )
            elif len(filenames) < params.avg:
                raise ValueError(
                    f"Not enough checkpoints ({len(filenames)}) found for"
                    f" --iter {params.iter}, --avg {params.avg}"
                )
            logging.info(f"averaging {filenames}")
            model.to(device)
            model.load_state_dict(average_checkpoints(filenames, device=device))
        elif params.avg == 1:
            load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
        else:
            # Average epochs (epoch - avg + 1) .. epoch, skipping epochs < 1.
            start = params.epoch - params.avg + 1
            filenames = []
            for i in range(start, params.epoch + 1):
                if i >= 1:
                    filenames.append(f"{params.exp_dir}/epoch-{i}.pt")
            logging.info(f"averaging {filenames}")
            model.to(device)
            model.load_state_dict(average_checkpoints(filenames, device=device))
    else:
        # Averaging based on the start and end checkpoints only; see the
        # help text of --use-averaged-model.
        if params.iter > 0:
            # One extra checkpoint is needed because the start one is excluded.
            filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
                : params.avg + 1
            ]
            if len(filenames) == 0:
                raise ValueError(
                    f"No checkpoints found for"
                    f" --iter {params.iter}, --avg {params.avg}"
                )
            elif len(filenames) < params.avg + 1:
                raise ValueError(
                    f"Not enough checkpoints ({len(filenames)}) found for"
                    f" --iter {params.iter}, --avg {params.avg}"
                )
            filename_start = filenames[-1]
            filename_end = filenames[0]
            logging.info(
                "Calculating the averaged model over iteration checkpoints"
                f" from {filename_start} (excluded) to {filename_end}"
            )
            model.to(device)
            model.load_state_dict(
                average_checkpoints_with_averaged_model(
                    filename_start=filename_start,
                    filename_end=filename_end,
                    device=device,
                )
            )
        else:
            assert params.avg > 0, params.avg
            start = params.epoch - params.avg
            # epoch-{start}.pt must exist, hence --epoch must exceed --avg.
            assert start >= 1, start
            filename_start = f"{params.exp_dir}/epoch-{start}.pt"
            filename_end = f"{params.exp_dir}/epoch-{params.epoch}.pt"
            logging.info(
                f"Calculating the averaged model over epoch range from "
                f"{start} (excluded) to {params.epoch}"
            )
            model.to(device)
            model.load_state_dict(
                average_checkpoints_with_averaged_model(
                    filename_start=filename_start,
                    filename_end=filename_end,
                    device=device,
                )
            )
    model.to(device)
    model.eval()
    # Only the fast_beam_search* methods need a decoding graph; only the LG
    # variant needs a word table.
    if "fast_beam_search" in params.decoding_method:
        if params.decoding_method == "fast_beam_search_nbest_LG":
            lexicon = Lexicon(params.lang_dir)
            word_table = lexicon.word_table
            lg_filename = params.lang_dir / "LG.pt"
            logging.info(f"Loading {lg_filename}")
            decoding_graph = k2.Fsa.from_dict(
                torch.load(lg_filename, map_location=device)
            )
            decoding_graph.scores *= params.ngram_lm_scale
        else:
            word_table = None
            decoding_graph = k2.trivial_graph(params.vocab_size - 1, device=device)
    else:
        decoding_graph = None
        word_table = None
    num_param = sum([p.numel() for p in model.parameters()])
    logging.info(f"Number of model parameters: {num_param}")
    # we need cut ids to display recognition results.
    args.return_cuts = True
    data_module = ReazonSpeechAsrDataModule(args)
    multi_dataset = MultiDataset(args)
    def remove_short_utt(c: Cut):
        # Keep only cuts for which T is positive.
        # NOTE(review): the formula presumably mirrors the frame reduction of
        # the model's front-end subsampling -- confirm against the encoder.
        T = ((c.num_frames - 7) // 2 + 1) // 2
        if T <= 0:
            logging.warning(
                f"Excluding cut with ID: {c.id} from decoding, num_frames: {c.num_frames}"
            )
        return T > 0
    test_sets_cuts = multi_dataset.test_cuts()
    test_sets = test_sets_cuts.keys()
    # One dataloader per test set, in the same order as `test_sets`.
    test_dl = [
        data_module.test_dataloaders(test_sets_cuts[cuts_name].filter(remove_short_utt))
        for cuts_name in test_sets
    ]
    # NOTE: the loop variable `test_dl` reuses the name of the list being
    # iterated; zip() has already received the list, so iteration is
    # unaffected, but the list is no longer reachable by name afterwards.
    for test_set, test_dl in zip(test_sets, test_dl):
        logging.info(f"Start decoding test set: {test_set}")
        results_dict = decode_dataset(
            dl=test_dl,
            params=params,
            model=model,
            sp=sp,
            word_table=word_table,
            decoding_graph=decoding_graph,
        )
        save_results(
            params=params,
            test_set_name=test_set,
            results_dict=results_dict,
        )
    logging.info("Done!")
 # Run the decoding only when executed as a script.
 if __name__ == "__main__":
    main()
--- a/egs/multi_ja_en/ASR/zipformer/decode_stream.py
+++ b/egs/multi_ja_en/ASR/zipformer/decode_stream.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/zipformer/decode_stream.py
--- a/egs/multi_ja_en/ASR/zipformer/decoder.py
+++ b/egs/multi_ja_en/ASR/zipformer/decoder.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/zipformer/decoder.py
--- a/egs/multi_ja_en/ASR/zipformer/do_not_use_it_directly.py
+++ b/egs/multi_ja_en/ASR/zipformer/do_not_use_it_directly.py
--- a/egs/multi_ja_en/ASR/zipformer/encoder_interface.py
+++ b/egs/multi_ja_en/ASR/zipformer/encoder_interface.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/zipformer/encoder_interface.py
--- a/egs/multi_ja_en/ASR/zipformer/export-onnx.py
+++ b/egs/multi_ja_en/ASR/zipformer/export-onnx.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/zipformer/export-onnx.py
--- a/egs/multi_ja_en/ASR/zipformer/export.py
+++ b/egs/multi_ja_en/ASR/zipformer/export.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/zipformer/export.py
--- a/egs/multi_ja_en/ASR/zipformer/generate_averaged_model.py
+++ b/egs/multi_ja_en/ASR/zipformer/generate_averaged_model.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/zipformer/generate_averaged_model.py
--- a/egs/multi_ja_en/ASR/zipformer/joiner.py
+++ b/egs/multi_ja_en/ASR/zipformer/joiner.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/zipformer/joiner.py
--- a/egs/multi_ja_en/ASR/zipformer/model.py
+++ b/egs/multi_ja_en/ASR/zipformer/model.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/zipformer/model.py
--- a/egs/multi_ja_en/ASR/zipformer/multi_dataset.py
+++ b/egs/multi_ja_en/ASR/zipformer/multi_dataset.py
@ -0,0 +1,143 @@
 import argparse
 import logging
 from functools import lru_cache
 from pathlib import Path
 from typing import Dict
 from lhotse import CutSet, load_manifest_lazy
 class MultiDataset:
    """Lazy access to the ReazonSpeech and LibriSpeech cut manifests.

    All manifests are read with `load_manifest_lazy` from a single
    directory. The LibriSpeech accessors are wrapped in `lru_cache`.
    """

    def __init__(self, args: argparse.Namespace):
        """
        Args:
          args:
            Parsed command-line options. Only `args.manifest_dir` is read.
            That directory is expected to contain the following files:
            - reazonspeech_cuts_train.jsonl.gz
            - reazonspeech_cuts_dev.jsonl.gz
            - reazonspeech_cuts_test.jsonl.gz
            - librispeech_cuts_train-clean-100.jsonl.gz
            - librispeech_cuts_train-clean-360.jsonl.gz
            - librispeech_cuts_train-other-500.jsonl.gz
            - librispeech_cuts_dev-clean.jsonl.gz
            - librispeech_cuts_dev-other.jsonl.gz
            - librispeech_cuts_test-clean.jsonl.gz
            - librispeech_cuts_test-other.jsonl.gz
        """
        self.fbank_dir = Path(args.manifest_dir)

    def _load(self, filename: str) -> CutSet:
        """Lazily open the manifest `filename` inside the manifest dir."""
        return load_manifest_lazy(self.fbank_dir / filename)

    def train_cuts(self) -> CutSet:
        """Return the training cuts of both corpora, multiplexed with
        weights equal to the length of each constituent set."""
        logging.info("About to get multidataset train cuts")

        logging.info("Loading Reazonspeech in lazy mode")
        reazonspeech = self._load("reazonspeech_cuts_train.jsonl.gz")

        logging.info("Loading LibriSpeech in lazy mode")
        parts = [
            reazonspeech,
            self.train_clean_100_cuts(),
            self.train_clean_360_cuts(),
            self.train_other_500_cuts(),
        ]
        return CutSet.mux(*parts, weights=[len(part) for part in parts])

    def dev_cuts(self) -> CutSet:
        """Return the dev cuts of both corpora, multiplexed with weights
        equal to the length of each constituent set."""
        logging.info("About to get multidataset dev cuts")

        logging.info("Loading Reazonspeech DEV set in lazy mode")
        reazonspeech_dev = self._load("reazonspeech_cuts_dev.jsonl.gz")

        logging.info("Loading LibriSpeech DEV set in lazy mode")
        parts = [
            reazonspeech_dev,
            self.dev_clean_cuts(),
            self.dev_other_cuts(),
        ]
        return CutSet.mux(*parts, weights=[len(part) for part in parts])

    def test_cuts(self) -> Dict[str, CutSet]:
        """Return the evaluation sets keyed by name (the ReazonSpeech dev
        set is included alongside the test sets)."""
        logging.info("About to get multidataset test cuts")

        logging.info("Loading Reazonspeech set in lazy mode")
        reazonspeech_test = self._load("reazonspeech_cuts_test.jsonl.gz")
        reazonspeech_dev = self._load("reazonspeech_cuts_dev.jsonl.gz")

        logging.info("Loading LibriSpeech set in lazy mode")
        librispeech_clean = self.test_clean_cuts()
        librispeech_other = self.test_other_cuts()

        return {
            "reazonspeech_test": reazonspeech_test,
            "reazonspeech_dev": reazonspeech_dev,
            "librispeech_test_clean": librispeech_clean,
            "librispeech_test_other": librispeech_other,
        }

    @lru_cache()
    def train_clean_100_cuts(self) -> CutSet:
        """LibriSpeech train-clean-100 cuts."""
        logging.info("About to get train-clean-100 cuts")
        return self._load("librispeech_cuts_train-clean-100.jsonl.gz")

    @lru_cache()
    def train_clean_360_cuts(self) -> CutSet:
        """LibriSpeech train-clean-360 cuts."""
        logging.info("About to get train-clean-360 cuts")
        return self._load("librispeech_cuts_train-clean-360.jsonl.gz")

    @lru_cache()
    def train_other_500_cuts(self) -> CutSet:
        """LibriSpeech train-other-500 cuts."""
        logging.info("About to get train-other-500 cuts")
        return self._load("librispeech_cuts_train-other-500.jsonl.gz")

    @lru_cache()
    def dev_clean_cuts(self) -> CutSet:
        """LibriSpeech dev-clean cuts."""
        logging.info("About to get dev-clean cuts")
        return self._load("librispeech_cuts_dev-clean.jsonl.gz")

    @lru_cache()
    def dev_other_cuts(self) -> CutSet:
        """LibriSpeech dev-other cuts."""
        logging.info("About to get dev-other cuts")
        return self._load("librispeech_cuts_dev-other.jsonl.gz")

    @lru_cache()
    def test_clean_cuts(self) -> CutSet:
        """LibriSpeech test-clean cuts."""
        logging.info("About to get test-clean cuts")
        return self._load("librispeech_cuts_test-clean.jsonl.gz")

    @lru_cache()
    def test_other_cuts(self) -> CutSet:
        """LibriSpeech test-other cuts."""
        logging.info("About to get test-other cuts")
        return self._load("librispeech_cuts_test-other.jsonl.gz")
--- a/egs/multi_ja_en/ASR/zipformer/my_profile.py
+++ b/egs/multi_ja_en/ASR/zipformer/my_profile.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/zipformer/my_profile.py
--- a/egs/multi_ja_en/ASR/zipformer/onnx_decode.py
+++ b/egs/multi_ja_en/ASR/zipformer/onnx_decode.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/zipformer/onnx_decode.py
--- a/egs/multi_ja_en/ASR/zipformer/onnx_pretrained.py
+++ b/egs/multi_ja_en/ASR/zipformer/onnx_pretrained.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/zipformer/onnx_pretrained.py
--- a/egs/multi_ja_en/ASR/zipformer/optim.py
+++ b/egs/multi_ja_en/ASR/zipformer/optim.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/zipformer/optim.py
--- a/egs/multi_ja_en/ASR/zipformer/pretrained.py
+++ b/egs/multi_ja_en/ASR/zipformer/pretrained.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/zipformer/pretrained.py
--- a/egs/multi_ja_en/ASR/zipformer/scaling.py
+++ b/egs/multi_ja_en/ASR/zipformer/scaling.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/zipformer/scaling.py
--- a/egs/multi_ja_en/ASR/zipformer/scaling_converter.py
+++ b/egs/multi_ja_en/ASR/zipformer/scaling_converter.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/zipformer/scaling_converter.py
--- a/egs/multi_ja_en/ASR/zipformer/streaming_beam_search.py
+++ b/egs/multi_ja_en/ASR/zipformer/streaming_beam_search.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/zipformer/streaming_beam_search.py
--- a/egs/multi_ja_en/ASR/zipformer/streaming_decode.py
+++ b/egs/multi_ja_en/ASR/zipformer/streaming_decode.py
@ -0,0 +1,935 @@
 #!/usr/bin/env python3
 # Copyright 2022-2023 Xiaomi Corporation (Authors: Wei Kang,
 #                                                  Fangjun Kuang,
 #                                                  Zengwei Yao)
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
 Usage:
 Monolingual:
 ./zipformer/streaming_decode.py \
    --epoch 28 \
     --avg 15 \
    --causal 1 \
    --chunk-size 32 \
    --left-context-frames 256 \
    --exp-dir ./zipformer/exp-large \
    --lang data/lang_char \
    --num-encoder-layers 2,2,4,5,4,2 \
    --feedforward-dim 512,768,1536,2048,1536,768 \
    --encoder-dim 192,256,512,768,512,256 \
    --encoder-unmasked-dim 192,192,256,320,256,192
 Bilingual:
 ./zipformer/streaming_decode.py \
    --bilingual 1 \
    --epoch 28 \
     --avg 15 \
    --causal 1 \
    --chunk-size 32 \
    --left-context-frames 256 \
    --exp-dir ./zipformer/exp-large \
    --lang data/lang_char \
    --num-encoder-layers 2,2,4,5,4,2 \
    --feedforward-dim 512,768,1536,2048,1536,768 \
    --encoder-dim 192,256,512,768,512,256 \
    --encoder-unmasked-dim 192,192,256,320,256,192
 """
 import argparse
 import logging
 import math
 import os
 import pdb
 import subprocess as sp
 from pathlib import Path
 from typing import Dict, List, Optional, Tuple
 import k2
 import numpy as np
 import sentencepiece as spm
 import torch
 from asr_datamodule import ReazonSpeechAsrDataModule
 from decode_stream import DecodeStream
 from kaldifeat import Fbank, FbankOptions
 from lhotse import CutSet
 from lhotse.cut import Cut
 from multi_dataset import MultiDataset
 from streaming_beam_search import (
    fast_beam_search_one_best,
    greedy_search,
    modified_beam_search,
 )
 from tokenizer import Tokenizer
 from torch import Tensor, nn
 from torch.nn.utils.rnn import pad_sequence
 from train import add_model_arguments, get_model, get_params
 from icefall.checkpoint import (
    average_checkpoints,
    average_checkpoints_with_averaged_model,
    find_checkpoints,
    load_checkpoint,
 )
 from icefall.utils import (
    AttributeDict,
    make_pad_mask,
    setup_logger,
    store_transcripts,
    str2bool,
    write_error_stats,
 )
 # Natural log of 1e-10. decode_one_chunk uses it as the fill value when
 # feature frames are padded to a common or minimum length.
 LOG_EPS = math.log(1e-10)
 def get_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for streaming decoding.

    Registers the checkpoint-selection, tokenizer/lang and decoding-method
    options defined here, then appends the model options through
    ``add_model_arguments``.

    Returns:
      The configured ``argparse.ArgumentParser``. Defaults are shown in
      ``--help`` because ``ArgumentDefaultsHelpFormatter`` is used.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        "--bilingual",
        type=str2bool,
        default=False,
        help="Whether the model is bilingual or not. 1 = bilingual.",
    )
    parser.add_argument(
        "--epoch",
        type=int,
        default=28,
        help="""It specifies the checkpoint to use for decoding.
        Note: Epoch counts from 1.
        You can specify --avg to use more checkpoints for model averaging.""",
    )
    parser.add_argument(
        "--iter",
        type=int,
        default=0,
        help="""If positive, --epoch is ignored and it
        will use the checkpoint exp_dir/checkpoint-iter.pt.
        You can specify --avg to use more checkpoints for model averaging.
        """,
    )
    parser.add_argument(
        "--avg",
        type=int,
        default=15,
        help="Number of checkpoints to average. Automatically select "
        "consecutive checkpoints before the checkpoint specified by "
        "'--epoch' and '--iter'",
    )
    parser.add_argument(
        "--use-averaged-model",
        type=str2bool,
        default=True,
        help="Whether to load averaged model. Currently it only supports "
        "using --epoch. If True, it would decode with the averaged model "
        "over the epoch range from `epoch-avg` (excluded) to `epoch`. "
        "Actually only the models with epoch number of `epoch-avg` and "
        "`epoch` are loaded for averaging. ",
    )
    parser.add_argument(
        "--exp-dir",
        type=str,
        default="zipformer/exp",
        help="The experiment dir",
    )
    parser.add_argument(
        "--bpe-model",
        type=str,
        default="data/lang_bpe_500/bpe.model",
        help="Path to the BPE model",
    )
    parser.add_argument(
        "--lang-dir",
        type=Path,
        default="data/lang_char",
        help="The lang dir containing word table and LG graph",
    )
    parser.add_argument(
        "--decoding-method",
        type=str,
        default="greedy_search",
        help="""Supported decoding methods are:
        greedy_search
        modified_beam_search
        fast_beam_search
        """,
    )
    # Every other flag is spelled with hyphens. Accept the hyphenated form
    # too while keeping the original underscore spelling working; ``dest`` is
    # pinned so ``params.num_active_paths`` is unchanged.
    parser.add_argument(
        "--num-active-paths",
        "--num_active_paths",
        dest="num_active_paths",
        type=int,
        default=4,
        help="""An integer indicating how many candidates we will keep for each
        frame. Used only when --decoding-method is modified_beam_search.""",
    )
    parser.add_argument(
        "--beam",
        type=float,
        default=4,
        help="""A floating point value to calculate the cutoff score during beam
        search (i.e., `cutoff = max-score - beam`), which is the same as the
        `beam` in Kaldi.
        Used only when --decoding-method is fast_beam_search""",
    )
    parser.add_argument(
        "--max-contexts",
        type=int,
        default=4,
        help="""Used only when --decoding-method is
        fast_beam_search""",
    )
    parser.add_argument(
        "--max-states",
        type=int,
        default=32,
        help="""Used only when --decoding-method is
        fast_beam_search""",
    )
    parser.add_argument(
        "--context-size",
        type=int,
        default=2,
        help="The context size in the decoder. 1 means bigram; 2 means tri-gram",
    )
    parser.add_argument(
        "--num-decode-streams",
        type=int,
        default=2000,
        help="The number of streams that can be decoded parallel.",
    )
    add_model_arguments(parser)
    return parser
 def get_init_states(
    model: nn.Module,
    batch_size: int = 1,
    device: torch.device = torch.device("cpu"),
 ) -> List[torch.Tensor]:
    """Build the initial streaming state list for ``model``.

    The list starts with whatever ``model.encoder.get_init_states`` returns
    (per the layout used throughout this file: six cached tensors per encoder
    layer), followed by two trailing entries:
      - [-2]: the initial state from ``model.encoder_embed.get_init_states``
        (cached left padding of the embedding module);
      - [-1]: ``processed_lens``, an int32 zero tensor of shape (batch_size,)
        counting frames already consumed per sample.
    """
    init_states = model.encoder.get_init_states(batch_size, device)
    embed_cache = model.encoder_embed.get_init_states(batch_size, device)
    frames_seen = torch.zeros(batch_size, dtype=torch.int32, device=device)
    # Extend the encoder's own list in place, exactly as two appends would.
    init_states.extend([embed_cache, frames_seen])
    return init_states
 def stack_states(state_list: List[List[torch.Tensor]]) -> List[torch.Tensor]:
    """Merge per-utterance zipformer states into one batched state list.

    Args:
      state_list:
        One entry per utterance. Each entry is a flat list holding, for every
        encoder layer, six tensors in the order (cached_key,
        cached_nonlin_attn, cached_val1, cached_val2, cached_conv1,
        cached_conv2), followed by the cached left padding of the embedding
        module ([-2]) and ``processed_lens`` ([-1]).
    Returns:
      A flat list with the same layout, where matching tensors of all
      utterances are concatenated along their batch axis.
    Note:
      It is the inverse of :func:`unstack_states`.
    """
    reference = state_list[0]
    assert (len(reference) - 2) % 6 == 0, len(reference)
    num_layers = (len(reference) - 2) // 6
    # Batch axis of the six per-layer caches, in storage order:
    #   cached_key         (left_context_len, batch_size, key_dim)          -> 1
    #   cached_nonlin_attn (num_heads, batch_size, left_context_len, dim)   -> 1
    #   cached_val1        (left_context_len, batch_size, value_dim)        -> 1
    #   cached_val2        (left_context_len, batch_size, value_dim)        -> 1
    #   cached_conv1       (#batch, channels, left_pad)                     -> 0
    #   cached_conv2       (#batch, channels, left_pad)                     -> 0
    batch_axes = (1, 1, 1, 1, 0, 0)
    stacked = []
    for layer in range(num_layers):
        base = 6 * layer
        for slot, axis in enumerate(batch_axes):
            stacked.append(
                torch.cat([utt[base + slot] for utt in state_list], dim=axis)
            )
    # Trailing entries: embedding left-pad cache, then processed_lens.
    stacked.append(torch.cat([utt[-2] for utt in state_list], dim=0))
    stacked.append(torch.cat([utt[-1] for utt in state_list], dim=0))
    return stacked
 def unstack_states(batch_states: List[Tensor]) -> List[List[Tensor]]:
    """Split a batched zipformer state list into one state list per utterance.

    Note:
      It is the inverse of :func:`stack_states`.
    Args:
      batch_states:
        Flat list holding, for every encoder layer, six tensors in the order
        (cached_key, cached_nonlin_attn, cached_val1, cached_val2,
        cached_conv1, cached_conv2), followed by the cached left padding of
        the embedding module ([-2]) and ``processed_lens`` ([-1]). The batch
        size is read from ``processed_lens.shape[0]``.
    Returns:
      A list with one entry per utterance; each entry has the same layout as
      ``batch_states`` with a batch dimension of size 1.
    """
    assert (len(batch_states) - 2) % 6 == 0, len(batch_states)
    num_layers = (len(batch_states) - 2) // 6
    num_utts = batch_states[-1].shape[0]
    # Batch axis of the six per-layer caches, in storage order: the key,
    # nonlin_attn and two value caches keep the batch on axis 1, the two
    # convolution caches keep it on axis 0.
    batch_axes = (1, 1, 1, 1, 0, 0)
    per_utt = [[] for _ in range(num_utts)]
    for layer in range(num_layers):
        base = 6 * layer
        for slot, axis in enumerate(batch_axes):
            pieces = batch_states[base + slot].chunk(chunks=num_utts, dim=axis)
            for utt in range(num_utts):
                per_utt[utt].append(pieces[utt])
    # Trailing entries (embedding left-pad cache, processed_lens) are both
    # batched on axis 0.
    for tail in (batch_states[-2], batch_states[-1]):
        pieces = tail.chunk(chunks=num_utts, dim=0)
        for utt in range(num_utts):
            per_utt[utt].append(pieces[utt])
    return per_utt
 def streaming_forward(
    features: Tensor,
    feature_lens: Tensor,
    model: nn.Module,
    states: List[Tensor],
    chunk_size: int,
    left_context_len: int,
 ) -> Tuple[Tensor, Tensor, List[Tensor]]:
    """Run one streaming chunk through the embedding module and the encoder.

    Args:
      features:
        Input feature frames for the chunk (batched).
      feature_lens:
        Number of valid frames per sample in ``features``.
      model:
        Model providing ``encoder_embed.streaming_forward`` and
        ``encoder.streaming_forward``.
      states:
        Batched state list; ``states[:-2]`` are the encoder caches,
        ``states[-2]`` is the embedding left-pad cache and ``states[-1]`` is
        ``processed_lens`` of shape (batch,).
      chunk_size:
        Expected number of frames after ``encoder_embed``; asserted.
      left_context_len:
        Number of left-context frames covered by the padding mask.
    Returns:
      ``(encoder_out, encoder_out_lens, new_states)`` where ``encoder_out`` is
      (N, T, C) and ``new_states`` has the same layout as ``states``.
    """
    embed_out, embed_lens, next_embed_cache = model.encoder_embed.streaming_forward(
        x=features,
        x_lens=feature_lens,
        cached_left_pad=states[-2],
    )
    assert embed_out.size(1) == chunk_size, (embed_out.size(1), chunk_size)
    chunk_pad_mask = make_pad_mask(embed_lens)
    frames_seen = states[-1]  # (batch,)
    # Mask for the cached left context: a position is masked while fewer
    # frames than its (flipped) index have been processed, which hides the
    # still-initial part of the cache. Shape: (batch, left_context_len).
    context_positions = torch.arange(
        left_context_len, device=embed_out.device
    ).expand(embed_out.size(0), left_context_len)
    context_pad_mask = (frames_seen.unsqueeze(1) <= context_positions).flip(1)
    next_frames_seen = frames_seen + embed_lens
    # (batch, left_context_len + chunk_size)
    full_pad_mask = torch.cat([context_pad_mask, chunk_pad_mask], dim=1)
    # (N, T, C) -> (T, N, C), the layout the encoder is fed here.
    encoder_in = embed_out.permute(1, 0, 2)
    encoder_out, encoder_out_lens, next_encoder_states = model.encoder.streaming_forward(
        x=encoder_in,
        x_lens=embed_lens,
        states=states[:-2],
        src_key_padding_mask=full_pad_mask,
    )
    # (T, N, C) -> (N, T, C)
    encoder_out = encoder_out.permute(1, 0, 2)
    new_states = next_encoder_states + [next_embed_cache, next_frames_seen]
    return encoder_out, encoder_out_lens, new_states
 def decode_one_chunk(
    params: AttributeDict,
    model: nn.Module,
    decode_streams: List[DecodeStream],
 ) -> List[int]:
    """Advance every stream in ``decode_streams`` by one chunk of features.

    Args:
      params:
        It's the return value of :func:`get_params`; ``chunk_size``,
        ``left_context_frames``, ``decoding_method`` and the search options
        are read from it.
      model:
        The neural model. ``model.device`` must be set.
      decode_streams:
        A List of DecodeStream, each belonging to a utterance. Their
        ``states`` and ``done_frames`` are updated in place.
    Returns:
      The indexes (into ``decode_streams``) of the streams that report
      ``done`` after this chunk.
    Raises:
      ValueError: if ``params.decoding_method`` is not supported.
    """
    chunk_size = int(params.chunk_size)
    left_context_len = int(params.left_context_frames)
    chunk_feats = []
    chunk_feat_lens = []
    stream_states = []
    frames_done = []  # Used in fast-beam-search
    for stream in decode_streams:
        feat, feat_len = stream.get_feature_frames(chunk_size * 2)
        chunk_feats.append(feat)
        chunk_feat_lens.append(feat_len)
        stream_states.append(stream.states)
        frames_done.append(stream.done_frames)
    feature_lens = torch.tensor(chunk_feat_lens, device=model.device)
    features = pad_sequence(chunk_feats, batch_first=True, padding_value=LOG_EPS)
    # Make sure the length after encoder_embed is at least 1.
    # The encoder_embed subsample features (T - 7) // 2
    # The ConvNeXt module needs (7 - 1) // 2 = 3 frames of right padding after subsampling
    min_frames = chunk_size * 2 + 7 + 2 * 3
    shortfall = min_frames - features.size(1)
    if shortfall > 0:
        feature_lens += shortfall
        features = torch.nn.functional.pad(
            features,
            (0, 0, 0, shortfall),
            mode="constant",
            value=LOG_EPS,
        )
    encoder_out, encoder_out_lens, new_states = streaming_forward(
        features=features,
        feature_lens=feature_lens,
        model=model,
        states=stack_states(stream_states),
        chunk_size=chunk_size,
        left_context_len=left_context_len,
    )
    encoder_out = model.joiner.encoder_proj(encoder_out)
    method = params.decoding_method
    if method == "greedy_search":
        greedy_search(model=model, encoder_out=encoder_out, streams=decode_streams)
    elif method == "fast_beam_search":
        processed_lens = torch.tensor(frames_done, device=model.device)
        processed_lens = processed_lens + encoder_out_lens
        fast_beam_search_one_best(
            model=model,
            encoder_out=encoder_out,
            processed_lens=processed_lens,
            streams=decode_streams,
            beam=params.beam,
            max_states=params.max_states,
            max_contexts=params.max_contexts,
        )
    elif method == "modified_beam_search":
        modified_beam_search(
            model=model,
            streams=decode_streams,
            encoder_out=encoder_out,
            num_active_paths=params.num_active_paths,
        )
    else:
        raise ValueError(f"Unsupported decoding method: {params.decoding_method}")
    # Hand each stream its slice of the updated batched state.
    updated_states = unstack_states(new_states)
    finished_streams = []
    for idx, stream in enumerate(decode_streams):
        stream.states = updated_states[idx]
        stream.done_frames += encoder_out_lens[idx]
        if stream.done:
            finished_streams.append(idx)
    return finished_streams
 def decode_dataset(
    cuts: CutSet,
    params: AttributeDict,
    model: nn.Module,
    sp: Tokenizer,
    decoding_graph: Optional[k2.Fsa] = None,
 ) -> Dict[str, List[Tuple[str, List[str], List[str]]]]:
    """Decode dataset.
    Args:
      cuts:
        Lhotse Cutset containing the dataset to decode.
      params:
        It is returned by :func:`get_params`.
      model:
        The neural model. ``model.device`` must be set.
      sp:
        The tokenizer / BPE model; only ``sp.decode`` is used here.
      decoding_graph:
        The decoding graph. Can be either a `k2.trivial_graph` or HLG, Used
        only when --decoding_method is fast_beam_search.
    Returns:
      Return a dict with a single key describing the decoding setting:
      "greedy_search", "beam_<beam>_max_contexts_<n>_max_states_<n>" for
      fast_beam_search, or "num_active_paths_<n>" for modified_beam_search.
      Its value is a list of tuples ``(cut_id, ref_words, hyp_words)``.
    Raises:
      ValueError: if ``params.decoding_method`` is not supported.
    """
    device = model.device
    opts = FbankOptions()
    opts.device = device
    opts.frame_opts.dither = 0
    opts.frame_opts.snip_edges = False
    opts.frame_opts.samp_freq = 16000
    opts.mel_opts.num_bins = 80
    # The feature extractor depends only on ``opts``, so build it once
    # instead of once per cut.
    fbank = Fbank(opts)
    log_interval = 100
    decode_results = []
    # Contain decode streams currently running.
    decode_streams = []

    def collect_finished(finished_streams: List[int]) -> None:
        """Record results of finished streams and drop them from the pool."""
        # Delete from the highest index down so the remaining indexes in
        # ``finished_streams`` stay valid.
        for i in sorted(finished_streams, reverse=True):
            stream = decode_streams[i]
            decode_results.append(
                (
                    stream.id,
                    stream.ground_truth.split(),
                    sp.decode(stream.decoding_result()).split(),
                )
            )
            del decode_streams[i]

    for num, cut in enumerate(cuts):
        # each utterance has a DecodeStream.
        initial_states = get_init_states(model=model, batch_size=1, device=device)
        decode_stream = DecodeStream(
            params=params,
            cut_id=cut.id,
            initial_states=initial_states,
            decoding_graph=decoding_graph,
            device=device,
        )
        audio: np.ndarray = cut.load_audio()
        # audio.shape: (1, num_samples)
        assert len(audio.shape) == 2
        assert audio.shape[0] == 1, "Should be single channel"
        assert audio.dtype == np.float32, audio.dtype
        # The trained model is using normalized samples
        # - this is to avoid sending [-32k,+32k] signal in...
        # - some lhotse AudioTransform classes can make the signal
        #   be out of range [-1, 1], hence the tolerance 10
        assert (
            np.abs(audio).max() <= 10
        ), "Should be normalized to [-1, 1], 10 for tolerance..."
        samples = torch.from_numpy(audio).squeeze(0)
        feature = fbank(samples.to(device))
        decode_stream.set_features(feature, tail_pad_len=30)
        decode_stream.ground_truth = cut.supervisions[0].text
        decode_streams.append(decode_stream)
        # Once the pool is full, keep decoding chunks until at least one
        # stream finishes and frees a slot.
        while len(decode_streams) >= params.num_decode_streams:
            finished_streams = decode_one_chunk(
                params=params, model=model, decode_streams=decode_streams
            )
            collect_finished(finished_streams)
        if num % log_interval == 0:
            logging.info(f"Cuts processed until now is {num}.")
    # decode final chunks of last sequences
    # A chunk in which no stream finishes is the normal case for utterances
    # longer than one chunk, so keep going until the pool is empty; stopping
    # at the first such chunk would silently discard every pending stream.
    while decode_streams:
        finished_streams = decode_one_chunk(
            params=params, model=model, decode_streams=decode_streams
        )
        collect_finished(finished_streams)
    if params.decoding_method == "greedy_search":
        key = "greedy_search"
    elif params.decoding_method == "fast_beam_search":
        key = (
            f"beam_{params.beam}_"
            f"max_contexts_{params.max_contexts}_"
            f"max_states_{params.max_states}"
        )
    elif params.decoding_method == "modified_beam_search":
        key = f"num_active_paths_{params.num_active_paths}"
    else:
        raise ValueError(f"Unsupported decoding method: {params.decoding_method}")
    # main() falls back to the CPU when CUDA is unavailable; synchronizing
    # would fail there.
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    return {key: decode_results}
 def save_results(
    params: AttributeDict,
    test_set_name: str,
    results_dict: Dict[str, List[Tuple[List[str], List[str]]]],
 ):
    """Write transcripts, error statistics and a WER summary for one test set.

    Args:
      params:
        Provides ``res_dir`` (output directory) and ``suffix`` (appended to
        every file name).
      test_set_name:
        Name of the test set; used in file names and log messages.
      results_dict:
        Maps a decoding-setting key to its list of results, as returned by
        :func:`decode_dataset`.
    For every key a ``recogs-*`` and an ``errs-*`` file are written; a single
    ``wer-summary-*`` file lists all settings sorted by WER.
    """
    wer_by_setting = dict()
    for key, results in results_dict.items():
        recog_path = (
            params.res_dir / f"recogs-{test_set_name}-{key}-{params.suffix}.txt"
        )
        ordered_results = sorted(results)
        store_transcripts(filename=recog_path, texts=ordered_results)
        logging.info(f"The transcripts are stored in {recog_path}")
        # The following prints out WERs, per-word error statistics and aligned
        # ref/hyp pairs.
        errs_filename = (
            params.res_dir / f"errs-{test_set_name}-{key}-{params.suffix}.txt"
        )
        with open(errs_filename, "w") as f:
            wer_by_setting[key] = write_error_stats(
                f, f"{test_set_name}-{key}", ordered_results, enable_log=True
            )
        logging.info(f"Wrote detailed error stats to {errs_filename}")
    ranked = sorted(wer_by_setting.items(), key=lambda item: item[1])
    # NOTE: ``key`` below is whatever the loop above bound last, so the
    # summary file is named after the last setting in ``results_dict``.
    summary_path = (
        params.res_dir / f"wer-summary-{test_set_name}-{key}-{params.suffix}.txt"
    )
    with open(summary_path, "w") as f:
        print("settings\tWER", file=f)
        for setting, wer in ranked:
            print(f"{setting}\t{wer}", file=f)
    # Only the first (lowest-WER) line carries the "best for" marker.
    report = [f"\nFor {test_set_name}, WER of different settings are:\n"]
    for rank, (setting, wer) in enumerate(ranked):
        marker = f"\tbest for {test_set_name}" if rank == 0 else ""
        report.append(f"{setting}\t{wer}{marker}\n")
    logging.info("".join(report))
@torch.no_grad()
 def main():
    parser = get_parser()
    ReazonSpeechAsrDataModule.add_arguments(parser)
    Tokenizer.add_arguments(parser)
    args = parser.parse_args()
    args.exp_dir = Path(args.exp_dir)
    params = get_params()
    params.update(vars(args))
    params.res_dir = params.exp_dir / "streaming" / params.decoding_method
    if params.iter > 0:
        params.suffix = f"iter-{params.iter}-avg-{params.avg}"
    else:
        params.suffix = f"epoch-{params.epoch}-avg-{params.avg}"
    assert params.causal, params.causal
    assert "," not in params.chunk_size, "chunk_size should be one value in decoding."
    assert (
        "," not in params.left_context_frames
    ), "left_context_frames should be one value in decoding."
    params.suffix += f"-chunk-{params.chunk_size}"
    params.suffix += f"-left-context-{params.left_context_frames}"
    # for fast_beam_search
    if params.decoding_method == "fast_beam_search":
        params.suffix += f"-beam-{params.beam}"
        params.suffix += f"-max-contexts-{params.max_contexts}"
        params.suffix += f"-max-states-{params.max_states}"
    if params.use_averaged_model:
        params.suffix += "-use-averaged-model"
    setup_logger(f"{params.res_dir}/log-decode-{params.suffix}")
    logging.info("Decoding started")
    device = torch.device("cpu")
    if torch.cuda.is_available():
        device = torch.device("cuda", 0)
    logging.info(f"Device: {device}")
    if not params.bilingual:
        sp = Tokenizer.load(params.lang, params.lang_type)
    else:
        sp = spm.SentencePieceProcessor()
        sp.load(params.bpe_model)
    # <blk> and <unk> is defined in local/train_bpe_model.py
    params.blank_id = sp.piece_to_id("<blk>")
    params.unk_id = sp.piece_to_id("<unk>")
    params.vocab_size = sp.get_piece_size()
    logging.info(params)
    logging.info("About to create model")
    model = get_model(params)
    if not params.use_averaged_model:
        if params.iter > 0:
            filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
                : params.avg
            ]
            if len(filenames) == 0:
                raise ValueError(
                    f"No checkpoints found for"
                    f" --iter {params.iter}, --avg {params.avg}"
                )
            elif len(filenames) < params.avg:
                raise ValueError(
                    f"Not enough checkpoints ({len(filenames)}) found for"
                    f" --iter {params.iter}, --avg {params.avg}"
                )
            logging.info(f"averaging {filenames}")
            model.to(device)
            model.load_state_dict(average_checkpoints(filenames, device=device))
        elif params.avg == 1:
            load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
        else:
            start = params.epoch - params.avg + 1
            filenames = []
            for i in range(start, params.epoch + 1):
                if start >= 0:
                    filenames.append(f"{params.exp_dir}/epoch-{i}.pt")
            logging.info(f"averaging {filenames}")
            model.to(device)
            model.load_state_dict(average_checkpoints(filenames, device=device))
    else:
        if params.iter > 0:
            filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
                : params.avg + 1
            ]
            if len(filenames) == 0:
                raise ValueError(
                    f"No checkpoints found for"
                    f" --iter {params.iter}, --avg {params.avg}"
                )
            elif len(filenames) < params.avg + 1:
                raise ValueError(
                    f"Not enough checkpoints ({len(filenames)}) found for"
                    f" --iter {params.iter}, --avg {params.avg}"
                )
            filename_start = filenames[-1]
            filename_end = filenames[0]
            logging.info(
                "Calculating the averaged model over iteration checkpoints"
                f" from {filename_start} (excluded) to {filename_end}"
            )
            model.to(device)
            model.load_state_dict(
                average_checkpoints_with_averaged_model(
                    filename_start=filename_start,
                    filename_end=filename_end,
                    device=device,
                )
            )
        else:
            assert params.avg > 0, params.avg
            start = params.epoch - params.avg
            assert start >= 1, start
            filename_start = f"{params.exp_dir}/epoch-{start}.pt"
            filename_end = f"{params.exp_dir}/epoch-{params.epoch}.pt"
            logging.info(
                f"Calculating the averaged model over epoch range from "
                f"{start} (excluded) to {params.epoch}"
            )
            model.to(device)
            model.load_state_dict(
                average_checkpoints_with_averaged_model(
                    filename_start=filename_start,
                    filename_end=filename_end,
                    device=device,
                )
            )
    model.to(device)
    model.eval()
    model.device = device
    decoding_graph = None
    if params.decoding_method == "fast_beam_search":
        decoding_graph = k2.trivial_graph(params.vocab_size - 1, device=device)
    num_param = sum([p.numel() for p in model.parameters()])
    logging.info(f"Number of model parameters: {num_param}")
    # we need cut ids to display recognition results.
    args.return_cuts = True
    reazonspeech_corpus = ReazonSpeechAsrDataModule(args)
    if params.bilingual:
        multi_dataset = MultiDataset(args)
        def remove_short_utt(c: Cut):
            T = ((c.num_frames - 7) // 2 + 1) // 2
            if T <= 0:
                logging.warning(
                    f"Excluding cut with ID: {c.id} from decoding, num_frames: {c.num_frames}"
                )
            return T > 0
        test_sets_cuts = multi_dataset.test_cuts()
        test_sets = test_sets_cuts.keys()
        test_cuts = [test_sets_cuts[k] for k in test_sets]
    valid_cuts = reazonspeech_corpus.valid_cuts()
    test_cuts = reazonspeech_corpus.test_cuts()
    test_sets = ["valid", "test"]
    test_cuts = [valid_cuts, test_cuts]
    for test_set, test_cut in zip(test_sets, test_cuts):
        logging.info(f"Decoding {test_set}")
        if params.bilingual:
            test_cut = test_cut.filter(remove_short_utt)
        results_dict = decode_dataset(
            cuts=test_cut,
            params=params,
            model=model,
            sp=sp,
            decoding_graph=decoding_graph,
        )
        save_results(
            params=params,
            test_set_name=test_set,
            results_dict=results_dict,
        )
    logging.info("Done!")
 # Run the streaming decoder only when this file is executed as a script.
 if __name__ == "__main__":
    main()
--- a/egs/multi_ja_en/ASR/zipformer/subsampling.py
+++ b/egs/multi_ja_en/ASR/zipformer/subsampling.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/zipformer/subsampling.py
--- a/egs/multi_ja_en/ASR/zipformer/test_scaling.py
+++ b/egs/multi_ja_en/ASR/zipformer/test_scaling.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/zipformer/test_scaling.py
--- a/egs/multi_ja_en/ASR/zipformer/test_subsampling.py
+++ b/egs/multi_ja_en/ASR/zipformer/test_subsampling.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/zipformer/test_subsampling.py
--- a/egs/multi_ja_en/ASR/zipformer/tokenizer.py
+++ b/egs/multi_ja_en/ASR/zipformer/tokenizer.py
@ -0,0 +1 @@
 ../local/utils/tokenizer.py
--- a/egs/multi_ja_en/ASR/zipformer/train.py
+++ b/egs/multi_ja_en/ASR/zipformer/train.py
--- a/egs/multi_ja_en/ASR/zipformer/zipformer.py
+++ b/egs/multi_ja_en/ASR/zipformer/zipformer.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/zipformer/zipformer.py
--- a/egs/reazonspeech/ASR/RESULTS.md
+++ b/egs/reazonspeech/ASR/RESULTS.md
@ -47,3 +47,41 @@ The decoding command is:
    --blank-penalty 0
 ```
 #### Streaming
 We have not completed evaluation of our models yet and will add evaluation results here once it's completed.
 The training command is:
 ```shell
 ./zipformer/train.py \
  --world-size 8 \
  --num-epochs 40 \
  --start-epoch 1 \
  --use-fp16 1 \
  --exp-dir zipformer/exp-large \
  --causal 1 \
  --num-encoder-layers 2,2,4,5,4,2 \
  --feedforward-dim 512,768,1536,2048,1536,768 \
  --encoder-dim 192,256,512,768,512,256 \
  --encoder-unmasked-dim 192,192,256,320,256,192 \
  --lang data/lang_char \
  --max-duration 1600
 ```
 The decoding command is:
 ```shell
 ./zipformer/streaming_decode.py \
  --epoch 28 \
  --avg 15 \
  --causal 1 \
  --chunk-size 32 \
  --left-context-frames 256 \
  --exp-dir ./zipformer/exp-large \
  --lang data/lang_char \
  --num-encoder-layers 2,2,4,5,4,2 \
  --feedforward-dim 512,768,1536,2048,1536,768 \
  --encoder-dim 192,256,512,768,512,256 \
  --encoder-unmasked-dim 192,192,256,320,256,192
 ```
--- a/egs/reazonspeech/ASR/local/utils/tokenizer.py
+++ b/egs/reazonspeech/ASR/local/utils/tokenizer.py
@ -12,7 +12,6 @@ class Tokenizer:
    @staticmethod
    def add_arguments(parser: argparse.ArgumentParser):
        group = parser.add_argument_group(title="Lang related options")
        group.add_argument("--lang", type=Path, help="Path to lang directory.")
        group.add_argument(
--- a/egs/reazonspeech/ASR/zipformer/streaming_decode.py
+++ b/egs/reazonspeech/ASR/zipformer/streaming_decode.py
@ -1,6 +1,7 @@
 #!/usr/bin/env python3
-# Copyright 2022 Xiaomi Corporation (Authors: Wei Kang, Fangjun Kuang)
+# Copyright 2022-2023 Xiaomi Corporation (Authors: Wei Kang,
-#
+#                                                  Fangjun Kuang,
 #                                                  Zengwei Yao)
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@ -17,28 +18,23 @@
 """
 Usage:
-./pruned_transducer_stateless7_streaming/streaming_decode.py \
+./zipformer/streaming_decode.py --epoch 28   --avg 15   --causal 1   --chunk-size 32   --left-context-frames 256   --exp-dir ./zipformer/exp-large --lang data/lang_char --num-encoder-layers 2,2,4,5,4,2 --feedforward-dim 512,768,1536,2048,1536,768 --encoder-dim 192,256,512,768,512,256 --encoder-unmasked-dim 192,192,256,320,256,192
-  --epoch 28 \
+
  --avg 15 \
  --decode-chunk-len 32 \
  --exp-dir ./pruned_transducer_stateless7_streaming/exp \
  --decoding_method greedy_search \
  --lang data/lang_char \
  --num-decode-streams 2000
 """
 import argparse
 import logging
 import math
 import os
 import pdb
 import subprocess as sp
 from pathlib import Path
 from typing import Dict, List, Optional, Tuple
 import k2
 import numpy as np
 import torch
 import torch.nn as nn
 from asr_datamodule import ReazonSpeechAsrDataModule
 from decode import save_results
 from decode_stream import DecodeStream
 from kaldifeat import Fbank, FbankOptions
 from lhotse import CutSet
@ -48,9 +44,9 @@ from streaming_beam_search import (
    modified_beam_search,
 )
 from tokenizer import Tokenizer
 from torch import Tensor, nn
 from torch.nn.utils.rnn import pad_sequence
-from train import add_model_arguments, get_params, get_transducer_model
+from train import add_model_arguments, get_model, get_params
 from zipformer import stack_states, unstack_states
 from icefall.checkpoint import (
    average_checkpoints,
@ -58,7 +54,14 @@ from icefall.checkpoint import (
    find_checkpoints,
    load_checkpoint,
 )
-from icefall.utils import AttributeDict, setup_logger, str2bool
+from icefall.utils import (
    AttributeDict,
    make_pad_mask,
    setup_logger,
    store_transcripts,
    str2bool,
    write_error_stats,
 )
 LOG_EPS = math.log(1e-10)
@ -73,7 +76,7 @@ def get_parser():
        type=int,
        default=28,
        help="""It specifies the checkpoint to use for decoding.
-        Note: Epoch counts from 0.
+        Note: Epoch counts from 1.
        You can specify --avg to use more checkpoints for model averaging.""",
    )
@ -87,12 +90,6 @@ def get_parser():
        """,
    )
    parser.add_argument(
        "--gpu",
        type=int,
        default=0,
    )
    parser.add_argument(
        "--avg",
        type=int,
@ -116,7 +113,7 @@ def get_parser():
    parser.add_argument(
        "--exp-dir",
        type=str,
-        default="pruned_transducer_stateless2/exp",
+        default="zipformer/exp",
        help="The experiment dir",
    )
@ -127,6 +124,13 @@ def get_parser():
        help="Path to the BPE model",
    )
    parser.add_argument(
        "--lang-dir",
        type=Path,
        default="data/lang_char",
        help="The lang dir containing word table and LG graph",
    )
    parser.add_argument(
        "--decoding-method",
        type=str,
@ -138,14 +142,6 @@ def get_parser():
        """,
    )
    parser.add_argument(
        "--decoding-graph",
        type=str,
        default="",
        help="""Used only when --decoding-method is
        fast_beam_search""",
    )
    parser.add_argument(
        "--num_active_paths",
        type=int,
@ -157,7 +153,7 @@ def get_parser():
    parser.add_argument(
        "--beam",
        type=float,
-        default=4.0,
+        default=4,
        help="""A floating point value to calculate the cutoff score during beam
        search (i.e., `cutoff = max-score - beam`), which is the same as the
        `beam` in Kaldi.
@ -194,18 +190,235 @@ def get_parser():
        help="The number of streams that can be decoded parallel.",
    )
    parser.add_argument(
        "--res-dir",
        type=Path,
        default=None,
        help="The path to save results.",
    )
    add_model_arguments(parser)
    return parser
 def get_init_states(
    model: nn.Module,
    batch_size: int = 1,
    device: torch.device = torch.device("cpu"),
 ) -> List[torch.Tensor]:
    """Build the initial streaming state for a batch of fresh utterances.

    The result is the list returned by ``model.encoder.get_init_states`` with
    two extra entries appended (that list is extended in place):

      - ``states[-2]``: the initial state returned by
        ``model.encoder_embed.get_init_states`` (the cached left padding for
        the ConvNeXt module, of shape
        (batch_size, num_channels, left_pad, num_freqs));
      - ``states[-1]``: ``processed_lens``, an int32 tensor of shape (batch,)
        filled with zeros, which records the number of frames processed so
        far (at 50hz frame rate, after encoder_embed) for each sample.

    For layer-i, ``states[i*6:(i+1)*6]`` is (cached_key, cached_nonlin_attn,
    cached_val1, cached_val2, cached_conv1, cached_conv2).

    Args:
      model:
        The model; its ``encoder`` and ``encoder_embed`` sub-modules are used.
      batch_size:
        Number of utterances the state is created for.
      device:
        Device on which the state tensors are created.
    Returns:
      The list of cached tensors described above.
    """
    all_states = model.encoder.get_init_states(batch_size, device)
    embed_left_pad = model.encoder_embed.get_init_states(batch_size, device)
    # Nothing has been processed yet, hence all-zero counters.
    num_processed = torch.zeros(batch_size, dtype=torch.int32, device=device)
    all_states.extend((embed_left_pad, num_processed))
    return all_states
 def stack_states(state_list: List[List[torch.Tensor]]) -> List[torch.Tensor]:
    """Merge per-utterance zipformer states into one batched state.

    The batched state can be fed to the zipformer when the corresponding
    utterances are decoded together as a batch.

    Args:
      state_list:
        One entry per utterance. ``state_list[n]`` is the list of cached
        tensors of all encoder layers for utterance n. For layer-i,
        ``state_list[n][i*6:(i+1)*6]`` is (cached_key, cached_nonlin_attn,
        cached_val1, cached_val2, cached_conv1, cached_conv2).
        ``state_list[n][-2]`` is the cached left padding for the ConvNeXt
        module, of shape (batch_size, num_channels, left_pad, num_freqs).
        ``state_list[n][-1]`` is processed_lens of shape (batch,), which
        records the number of processed frames (at 50hz frame rate, after
        encoder_embed) for each sample in batch.
    Returns:
      A single list with the same layout, where every tensor is the
      concatenation of the per-utterance tensors along its batch axis.

    Note:
      It is the inverse of :func:`unstack_states`.
    """
    reference = state_list[0]
    assert (len(reference) - 2) % 6 == 0, len(reference)
    num_layers = (len(reference) - 2) // 6

    # Batch axis of the six per-layer caches, in storage order:
    #   cached_key:         (left_context_len, batch_size, key_dim)
    #   cached_nonlin_attn: (num_heads, batch_size, left_context_len, head_dim)
    #   cached_val1:        (left_context_len, batch_size, value_dim)
    #   cached_val2:        (left_context_len, batch_size, value_dim)
    #   cached_conv1:       (#batch, channels, left_pad)
    #   cached_conv2:       (#batch, channels, left_pad)
    per_layer_batch_dims = (1, 1, 1, 1, 0, 0)

    stacked = []
    for layer_idx in range(num_layers):
        base = layer_idx * 6
        for slot, batch_dim in enumerate(per_layer_batch_dims):
            stacked.append(
                torch.cat([utt[base + slot] for utt in state_list], dim=batch_dim)
            )

    # The two trailing entries (embed left padding, processed_lens) are
    # both batch-first.
    stacked.append(torch.cat([utt[-2] for utt in state_list], dim=0))
    stacked.append(torch.cat([utt[-1] for utt in state_list], dim=0))
    return stacked
 def unstack_states(batch_states: List[Tensor]) -> List[List[Tensor]]:
    """Split a batched zipformer state into one state per utterance.

    The i-th entry of the result is the state of the i-th utterance in the
    batch.

    Note:
      It is the inverse of :func:`stack_states`.

    Args:
      batch_states:
        A list of cached tensors of all encoder layers. For layer-i,
        ``batch_states[i*6:(i+1)*6]`` is (cached_key, cached_nonlin_attn,
        cached_val1, cached_val2, cached_conv1, cached_conv2).
        ``batch_states[-2]`` is the cached left padding for the ConvNeXt
        module, of shape (batch_size, num_channels, left_pad, num_freqs).
        ``batch_states[-1]`` is processed_lens of shape (batch,), which
        records the number of processed frames (at 50hz frame rate, after
        encoder_embed) for each sample in batch.
    Returns:
      A list of lists; each inner list has the same layout as
      ``batch_states`` but holds the slices belonging to one utterance.
    """
    assert (len(batch_states) - 2) % 6 == 0, len(batch_states)
    num_layers = (len(batch_states) - 2) // 6

    # The batch size is read from processed_lens, whose only axis is batch.
    num_utts = batch_states[-1].shape[0]
    per_utt = [[] for _ in range(num_utts)]

    # Batch axis of the six per-layer caches, in storage order:
    #   cached_key:         (left_context_len, batch_size, key_dim)
    #   cached_nonlin_attn: (num_heads, batch_size, left_context_len, head_dim)
    #   cached_val1:        (left_context_len, batch_size, value_dim)
    #   cached_val2:        (left_context_len, batch_size, value_dim)
    #   cached_conv1:       (#batch, channels, left_pad)
    #   cached_conv2:       (#batch, channels, left_pad)
    per_layer_batch_dims = (1, 1, 1, 1, 0, 0)

    for layer_idx in range(num_layers):
        base = layer_idx * 6
        for slot, batch_dim in enumerate(per_layer_batch_dims):
            pieces = batch_states[base + slot].chunk(chunks=num_utts, dim=batch_dim)
            for utt_idx, utt_states in enumerate(per_utt):
                utt_states.append(pieces[utt_idx])

    # Trailing entries: embed left padding, then processed_lens; both are
    # split along axis 0.
    for tail_idx in (-2, -1):
        pieces = batch_states[tail_idx].chunk(chunks=num_utts, dim=0)
        for utt_idx, utt_states in enumerate(per_utt):
            utt_states.append(pieces[utt_idx])

    return per_utt
 def streaming_forward(
    features: Tensor,
    feature_lens: Tensor,
    model: nn.Module,
    states: List[Tensor],
    chunk_size: int,
    left_context_len: int,
 ) -> Tuple[Tensor, Tensor, List[Tensor]]:
    """Run one chunk through ``encoder_embed`` and the streaming encoder.

    Args:
      features:
        Input features of the current chunk, passed as ``x`` to
        ``model.encoder_embed.streaming_forward``.
      feature_lens:
        Per-sample lengths of ``features``.
      model:
        The model; its ``encoder_embed`` and ``encoder`` are used.
      states:
        Batched states. ``states[:-2]`` are the encoder caches,
        ``states[-2]`` is the cached left padding of ``encoder_embed`` and
        ``states[-1]`` is processed_lens of shape (batch,).
      chunk_size:
        Expected number of frames produced by ``encoder_embed`` for this
        chunk; checked with an assertion.
      left_context_len:
        Number of left-context positions covered by the padding mask.
    Returns:
      Encoder outputs in (N, T, C) layout, output lengths, and the updated
      states (same layout as ``states``).
    """
    embed_out, embed_out_lens, next_embed_left_pad = model.encoder_embed.streaming_forward(
        x=features,
        x_lens=feature_lens,
        cached_left_pad=states[-2],
    )
    assert embed_out.size(1) == chunk_size, (embed_out.size(1), chunk_size)

    num_processed = states[-1]  # (batch,)

    # Padding mask for the frames of the current chunk.
    chunk_pad_mask = make_pad_mask(embed_out_lens)

    # Padding mask for the left context, used to mask out initial states:
    # an entry is True where processed_lens <= position index, and the
    # result is flipped along the context axis.
    # (batch, left_context_size)
    context_positions = torch.arange(
        left_context_len, device=embed_out.device
    ).expand(embed_out.size(0), left_context_len)
    context_pad_mask = (num_processed.unsqueeze(1) <= context_positions).flip(1)

    # Update processed lengths
    next_num_processed = num_processed + embed_out_lens

    # (batch, left_context_size + chunk_size)
    full_pad_mask = torch.cat([context_pad_mask, chunk_pad_mask], dim=1)

    encoder_out, encoder_out_lens, next_encoder_states = model.encoder.streaming_forward(
        x=embed_out.permute(1, 0, 2),  # (N, T, C) -> (T, N, C)
        x_lens=embed_out_lens,
        states=states[:-2],
        src_key_padding_mask=full_pad_mask,
    )

    updated_states = next_encoder_states + [
        next_embed_left_pad,
        next_num_processed,
    ]
    # (T, N, C) -> (N, T, C)
    return encoder_out.permute(1, 0, 2), encoder_out_lens, updated_states
 def decode_one_chunk(
    params: AttributeDict,
    model: nn.Module,
@ -224,27 +437,32 @@ def decode_one_chunk(
    Returns:
      Return a List containing which DecodeStreams are finished.
    """
-    device = model.device
+    # pdb.set_trace()
    # print(model)
    # print(model.device)
    # device = model.device
    chunk_size = int(params.chunk_size)
    left_context_len = int(params.left_context_frames)
    features = []
    feature_lens = []
    states = []
-    processed_lens = []
+    processed_lens = []  # Used in fast-beam-search
    for stream in decode_streams:
-        feat, feat_len = stream.get_feature_frames(params.decode_chunk_len)
+        feat, feat_len = stream.get_feature_frames(chunk_size * 2)
        features.append(feat)
        feature_lens.append(feat_len)
        states.append(stream.states)
        processed_lens.append(stream.done_frames)
-    feature_lens = torch.tensor(feature_lens, device=device)
+    feature_lens = torch.tensor(feature_lens, device=model.device)
    features = pad_sequence(features, batch_first=True, padding_value=LOG_EPS)
-    # We subsample features with ((x_len - 7) // 2 + 1) // 2 and the max downsampling
+    # Make sure the length after encoder_embed is at least 1.
-    # factor in encoders is 8.
+    # The encoder_embed subsample features (T - 7) // 2
-    # After feature embedding (x_len - 7) // 2, we have (23 - 7) // 2 = 8.
+    # The ConvNeXt module needs (7 - 1) // 2 = 3 frames of right padding after subsampling
-    tail_length = 23
+    tail_length = chunk_size * 2 + 7 + 2 * 3
    if features.size(1) < tail_length:
        pad_length = tail_length - features.size(1)
        feature_lens += pad_length
@ -256,12 +474,14 @@ def decode_one_chunk(
        )
    states = stack_states(states)
    processed_lens = torch.tensor(processed_lens, device=device)
-    encoder_out, encoder_out_lens, new_states = model.encoder.streaming_forward(
+    encoder_out, encoder_out_lens, new_states = streaming_forward(
-        x=features,
+        features=features,
-        x_lens=feature_lens,
+        feature_lens=feature_lens,
        model=model,
        states=states,
        chunk_size=chunk_size,
        left_context_len=left_context_len,
    )
    encoder_out = model.joiner.encoder_proj(encoder_out)
@ -269,6 +489,7 @@ def decode_one_chunk(
    if params.decoding_method == "greedy_search":
        greedy_search(model=model, encoder_out=encoder_out, streams=decode_streams)
    elif params.decoding_method == "fast_beam_search":
        processed_lens = torch.tensor(processed_lens, device=model.device)
        processed_lens = processed_lens + encoder_out_lens
        fast_beam_search_one_best(
            model=model,
@ -295,8 +516,9 @@ def decode_one_chunk(
    for i in range(len(decode_streams)):
        decode_streams[i].states = states[i]
        decode_streams[i].done_frames += encoder_out_lens[i]
-        if decode_streams[i].done:
+        # if decode_streams[i].done:
-            finished_streams.append(i)
+        # finished_streams.append(i)
        finished_streams.append(i)
    return finished_streams
@ -305,7 +527,7 @@ def decode_dataset(
    cuts: CutSet,
    params: AttributeDict,
    model: nn.Module,
-    sp: Tokenizer,
+    tokenizer: Tokenizer,
    decoding_graph: Optional[k2.Fsa] = None,
 ) -> Dict[str, List[Tuple[List[str], List[str]]]]:
    """Decode dataset.
@ -317,7 +539,7 @@ def decode_dataset(
        It is returned by :func:`get_params`.
      model:
        The neural model.
-      sp:
+      tokenizer:
        The BPE model.
      decoding_graph:
        The decoding graph. Can be either a `k2.trivial_graph` or HLG, Used
@ -338,14 +560,14 @@ def decode_dataset(
    opts.frame_opts.samp_freq = 16000
    opts.mel_opts.num_bins = 80
-    log_interval = 50
+    log_interval = 100
    decode_results = []
    # Contain decode streams currently running.
    decode_streams = []
    for num, cut in enumerate(cuts):
        # each utterance has a DecodeStream.
-        initial_states = model.encoder.get_init_state(device=device)
+        initial_states = get_init_states(model=model, batch_size=1, device=device)
        decode_stream = DecodeStream(
            params=params,
            cut_id=cut.id,
@ -361,15 +583,19 @@ def decode_dataset(
        assert audio.dtype == np.float32, audio.dtype
        # The trained model is using normalized samples
-        assert audio.max() <= 1, "Should be normalized to [-1, 1])"
+        # - this is to avoid sending [-32k,+32k] signal in...
        # - some lhotse AudioTransform classes can make the signal
        #   be out of range [-1, 1], hence the tolerance 10
        assert (
            np.abs(audio).max() <= 10
        ), "Should be normalized to [-1, 1], 10 for tolerance..."
        samples = torch.from_numpy(audio).squeeze(0)
        fbank = Fbank(opts)
        feature = fbank(samples.to(device))
-        decode_stream.set_features(feature, tail_pad_len=params.decode_chunk_len)
+        decode_stream.set_features(feature, tail_pad_len=30)
-        decode_stream.ground_truth = cut.supervisions[0].custom[params.transcript_mode]
+        decode_stream.ground_truth = cut.supervisions[0].text
        decode_streams.append(decode_stream)
        while len(decode_streams) >= params.num_decode_streams:
@ -380,8 +606,8 @@ def decode_dataset(
                decode_results.append(
                    (
                        decode_streams[i].id,
-                        sp.text2word(decode_streams[i].ground_truth),
+                        decode_streams[i].ground_truth.split(),
-                        sp.text2word(sp.decode(decode_streams[i].decoding_result())),
+                        tokenizer.decode(decode_streams[i].decoding_result()).split(),
                    )
                )
                del decode_streams[i]
@ -391,18 +617,37 @@ def decode_dataset(
    # decode final chunks of last sequences
    while len(decode_streams):
        # print("INSIDE LEN DECODE STREAMS")
        # pdb.set_trace()
        # print(model.device)
        # test_device = model.device
        # print("done")
        finished_streams = decode_one_chunk(
            params=params, model=model, decode_streams=decode_streams
        )
        # print('INSIDE FOR LOOP ')
        # print(finished_streams)
        if not finished_streams:
            print("No finished streams, breaking the loop")
            break
        for i in sorted(finished_streams, reverse=True):
-            decode_results.append(
+            try:
-                (
+                decode_results.append(
-                    decode_streams[i].id,
+                    (
-                    sp.text2word(decode_streams[i].ground_truth),
+                        decode_streams[i].id,
-                    sp.text2word(sp.decode(decode_streams[i].decoding_result())),
+                        decode_streams[i].ground_truth.split(),
                        tokenizer.decode(decode_streams[i].decoding_result()).split(),
                    )
                )
-            )
+                del decode_streams[i]
-            del decode_streams[i]
+            except IndexError as e:
                print(f"IndexError: {e}")
                print(f"decode_streams length: {len(decode_streams)}")
                print(f"finished_streams: {finished_streams}")
                print(f"i: {i}")
                continue
    if params.decoding_method == "greedy_search":
        key = "greedy_search"
@ -416,9 +661,54 @@ def decode_dataset(
        key = f"num_active_paths_{params.num_active_paths}"
    else:
        raise ValueError(f"Unsupported decoding method: {params.decoding_method}")
    torch.cuda.synchronize()
    return {key: decode_results}
 def save_results(
    params: AttributeDict,
    test_set_name: str,
    results_dict: Dict[str, List[Tuple[List[str], List[str]]]],
 ):
    """Write transcripts, error statistics and a WER summary for one test set.

    For every setting in ``results_dict`` two files are written under
    ``params.res_dir``: ``recogs-<test_set>-<key>-<suffix>.txt`` (via
    ``store_transcripts``) and ``errs-<test_set>-<key>-<suffix>.txt`` (via
    ``write_error_stats``). Afterwards one ``wer-summary-...`` file listing
    all settings sorted by the value returned from ``write_error_stats`` is
    written, and the same table is logged.

    Args:
      params:
        Must provide ``res_dir`` (a path object supporting ``/``) and
        ``suffix``.
      test_set_name:
        Name of the test set; used in file names and log messages.
      results_dict:
        Maps a decoding-setting name to its list of results. The results
        are sorted before being written.
    """
    if not results_dict:
        # Without any setting there is nothing to write, and the summary
        # file name below cannot be built (it embeds a setting key).
        logging.warning(f"No decoding results to save for {test_set_name}")
        return

    test_set_wers = dict()
    for key, results in results_dict.items():
        recog_path = (
            params.res_dir / f"recogs-{test_set_name}-{key}-{params.suffix}.txt"
        )
        results = sorted(results)
        store_transcripts(filename=recog_path, texts=results)
        logging.info(f"The transcripts are stored in {recog_path}")

        # The following prints out WERs, per-word error statistics and aligned
        # ref/hyp pairs.
        errs_filename = (
            params.res_dir / f"errs-{test_set_name}-{key}-{params.suffix}.txt"
        )
        # The transcripts may contain non-ASCII (e.g. Japanese) text, so do
        # not rely on the locale's default encoding.
        with open(errs_filename, "w", encoding="utf-8") as f:
            wer = write_error_stats(
                f, f"{test_set_name}-{key}", results, enable_log=True
            )
            test_set_wers[key] = wer

        logging.info("Wrote detailed error stats to {}".format(errs_filename))

    # The summary file name has always carried the key of the last setting
    # processed above; keep that so existing paths stay the same.
    last_key = key

    test_set_wers = sorted(test_set_wers.items(), key=lambda x: x[1])
    errs_info = (
        params.res_dir / f"wer-summary-{test_set_name}-{last_key}-{params.suffix}.txt"
    )
    with open(errs_info, "w", encoding="utf-8") as f:
        print("settings\tWER", file=f)
        for setting, val in test_set_wers:
            print("{}\t{}".format(setting, val), file=f)

    s = "\nFor {}, WER of different settings are:\n".format(test_set_name)
    # Only the first (lowest-valued) row is tagged as the best one.
    note = "\tbest for {}".format(test_set_name)
    for setting, val in test_set_wers:
        s += "{}\t{}{}\n".format(setting, val, note)
        note = ""
    logging.info(s)
@torch.no_grad()
 def main():
    parser = get_parser()
@ -430,16 +720,20 @@ def main():
    params = get_params()
    params.update(vars(args))
-    if not params.res_dir:
+    params.res_dir = params.exp_dir / "streaming" / params.decoding_method
        params.res_dir = params.exp_dir / "streaming" / params.decoding_method
    if params.iter > 0:
        params.suffix = f"iter-{params.iter}-avg-{params.avg}"
    else:
        params.suffix = f"epoch-{params.epoch}-avg-{params.avg}"
-    # for streaming
+    assert params.causal, params.causal
-    params.suffix += f"-streaming-chunk-size-{params.decode_chunk_len}"
+    assert "," not in params.chunk_size, "chunk_size should be one value in decoding."
    assert (
        "," not in params.left_context_frames
    ), "left_context_frames should be one value in decoding."
    params.suffix += f"-chunk-{params.chunk_size}"
    params.suffix += f"-left-context-{params.left_context_frames}"
    # for fast_beam_search
    if params.decoding_method == "fast_beam_search":
@ -455,21 +749,21 @@ def main():
    device = torch.device("cpu")
    if torch.cuda.is_available():
-        device = torch.device("cuda", params.gpu)
+        device = torch.device("cuda", 0)
    logging.info(f"Device: {device}")
-    sp = Tokenizer.load(params.lang, params.lang_type)
+    sp_token = Tokenizer.load(params.lang, params.lang_type)
-    # <blk> and <unk> is defined in local/prepare_lang_char.py
+    # <blk> and <unk> is defined in local/train_bpe_model.py
-    params.blank_id = sp.piece_to_id("<blk>")
+    params.blank_id = sp_token.piece_to_id("<blk>")
-    params.unk_id = sp.piece_to_id("<unk>")
+    params.unk_id = sp_token.piece_to_id("<unk>")
-    params.vocab_size = sp.get_piece_size()
+    params.vocab_size = sp_token.get_piece_size()
    logging.info(params)
    logging.info("About to create model")
-    model = get_transducer_model(params)
+    model = get_model(params)
    if not params.use_averaged_model:
        if params.iter > 0:
@ -553,42 +847,51 @@ def main():
    model.device = device
    decoding_graph = None
-    if params.decoding_graph:
+    if params.decoding_method == "fast_beam_search":
        decoding_graph = k2.Fsa.from_dict(
            torch.load(params.decoding_graph, map_location=device)
        )
    elif params.decoding_method == "fast_beam_search":
        decoding_graph = k2.trivial_graph(params.vocab_size - 1, device=device)
    num_param = sum([p.numel() for p in model.parameters()])
    logging.info(f"Number of model parameters: {num_param}")
    # we need cut ids to display recognition results.
    args.return_cuts = True
    reazonspeech_corpus = ReazonSpeechAsrDataModule(args)
-    for subdir in ["valid"]:
+    valid_cuts = reazonspeech_corpus.valid_cuts()
    test_cuts = reazonspeech_corpus.test_cuts()
    test_sets = ["valid", "test"]
    test_cuts = [valid_cuts, test_cuts]
    for test_set, test_cut in zip(test_sets, test_cuts):
        results_dict = decode_dataset(
-            cuts=getattr(reazonspeech_corpus, f"{subdir}_cuts")(),
+            cuts=test_cut,
            params=params,
            model=model,
-            sp=sp,
+            tokenizer=sp_token,
            decoding_graph=decoding_graph,
        )
-        tot_err = save_results(
+        save_results(
-            params=params, test_set_name=subdir, results_dict=results_dict
+            params=params,
            test_set_name=test_set,
            results_dict=results_dict,
        )
-        with (
+    # valid_cuts = reazonspeech_corpus.valid_cuts()
-            params.res_dir
+
-            / (
+    # for valid_cut in valid_cuts:
-                f"{subdir}-{params.decode_chunk_len}"
+    #     results_dict = decode_dataset(
-                f"_{params.avg}_{params.epoch}.cer"
+    #         cuts=valid_cut,
-            )
+    #         params=params,
-        ).open("w") as fout:
+    #         model=model,
-            if len(tot_err) == 1:
+    #         sp=sp,
-                fout.write(f"{tot_err[0][1]}")
+    #         decoding_graph=decoding_graph,
-            else:
+    #     )
-                fout.write("\n".join(f"{k}\t{v}") for k, v in tot_err)
+    #     save_results(
    #         params=params,
    #         test_set_name="valid",
    #         results_dict=results_dict,
    #     )
    logging.info("Done!")
--- a/icefall/init.py
+++ b/icefall/init.py
@ -68,6 +68,7 @@ from .utils import (
    str2bool,
    subsequent_chunk_mask,
    tokenize_by_CJK_char,
    tokenize_by_ja_char,
    write_error_stats,
 )
--- a/icefall/utils.py
+++ b/icefall/utils.py
@ -1758,6 +1758,30 @@ def tokenize_by_CJK_char(line: str) -> str:
    return " ".join([w.strip() for w in chars if w.strip()])
 def tokenize_by_ja_char(line: str) -> str:
    """
    Put a space around every Japanese character in a line of text.

    A character counts as Japanese when it lies in one of the ranges
    U+3040-U+309F, U+30A0-U+30FF or U+4E00-U+9FFF. Each such character
    becomes its own token. Every run of other text between them is
    stripped of surrounding whitespace and converted to upper case as a
    whole (whitespace inside the run is left as is).

    Example:
      input = "こんにちは世界は hello world の日本語"
      output = "こ ん に ち は 世 界 は HELLO WORLD の 日 本 語"

    Args:
      line:
        The input text.

    Return:
      A new string with the tokens joined by single spaces.
    """
    ja_char = re.compile(r"([\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF])")

    tokens = []
    # Splitting on a capturing group keeps the delimiters (the Japanese
    # characters) in the result, interleaved with the text between them.
    for piece in ja_char.split(line.strip()):
        trimmed = piece.strip()
        if not trimmed:
            continue
        if ja_char.match(piece):
            tokens.append(piece)
        else:
            tokens.append(trimmed.upper())
    return " ".join(tokens)
 def display_and_save_batch(
    batch: dict,
    params: AttributeDict,
		`@ -0,0 +1 @@`
							`../../../librispeech/ASR/local/prepare_lang.py`
		`@ -0,0 +1 @@`
							`../../../aishell2/ASR/local/prepare_words.py`
		`@ -0,0 +1 @@`
							`../../../librispeech/ASR/local/validate_bpe_lexicon.py`
		`@ -0,0 +1 @@`
							`../../../librispeech/ASR/zipformer/beam_search.py`
		`@ -0,0 +1 @@`
							`../../../librispeech/ASR/zipformer/ctc_decode.py`
		`@ -0,0 +1 @@`
							`../../../librispeech/ASR/zipformer/decode_stream.py`
		`@ -0,0 +1 @@`
							`../../../librispeech/ASR/zipformer/encoder_interface.py`
		`@ -0,0 +1 @@`
							`../../../librispeech/ASR/zipformer/export-onnx.py`
		`@ -0,0 +1 @@`
							`../../../librispeech/ASR/zipformer/generate_averaged_model.py`
		`@ -0,0 +1 @@`
							`../../../librispeech/ASR/zipformer/joiner.py`