customize tranning script for rs

2025-12-11 06:55:27 +00:00 · 2024-01-14 07:45:33 -05:00 · 2024-01-14 07:45:33 -05:00 · 5e9a171b20
commit 5e9a171b20
parent 8eae6ec7d1
19 changed files with 4905 additions and 0 deletions
--- a/egs/reazonspeech/ASR/zipformer/asr_datamodule.py
+++ b/egs/reazonspeech/ASR/zipformer/asr_datamodule.py
@ -0,0 +1 @@
 ../local/utils/asr_datamodule.py
--- a/egs/reazonspeech/ASR/zipformer/beam_search.py
+++ b/egs/reazonspeech/ASR/zipformer/beam_search.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/zipformer/beam_search.py
--- a/egs/reazonspeech/ASR/zipformer/decode.py
+++ b/egs/reazonspeech/ASR/zipformer/decode.py
--- a/egs/reazonspeech/ASR/zipformer/decode_stream.py
+++ b/egs/reazonspeech/ASR/zipformer/decode_stream.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/zipformer/decode_stream.py
--- a/egs/reazonspeech/ASR/zipformer/decoder.py
+++ b/egs/reazonspeech/ASR/zipformer/decoder.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/zipformer/decoder.py
--- a/egs/reazonspeech/ASR/zipformer/do_not_use_it_directly.py
+++ b/egs/reazonspeech/ASR/zipformer/do_not_use_it_directly.py
--- a/egs/reazonspeech/ASR/zipformer/encoder_interface.py
+++ b/egs/reazonspeech/ASR/zipformer/encoder_interface.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/zipformer/encoder_interface.py
--- a/egs/reazonspeech/ASR/zipformer/export.py
+++ b/egs/reazonspeech/ASR/zipformer/export.py
@ -0,0 +1,250 @@
 #!/usr/bin/env python3
 #
 # Copyright 2021 Xiaomi Corporation (Author: Fangjun Kuang)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # This script converts several saved checkpoints
 # to a single one using model averaging.
 import argparse
 import logging
 from pathlib import Path
 import torch
 from scaling_converter import convert_scaled_to_non_scaled
 from tokenizer import Tokenizer
 from train import add_model_arguments, get_params, get_transducer_model
 from icefall.checkpoint import (
    average_checkpoints,
    average_checkpoints_with_averaged_model,
    find_checkpoints,
    load_checkpoint,
 )
 from icefall.utils import str2bool
 def get_parser():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        "--epoch",
        type=int,
        default=30,
        help="""It specifies the checkpoint to use for decoding.
        Note: Epoch counts from 1.
        You can specify --avg to use more checkpoints for model averaging.""",
    )
    parser.add_argument(
        "--iter",
        type=int,
        default=0,
        help="""If positive, --epoch is ignored and it
        will use the checkpoint exp_dir/checkpoint-iter.pt.
        You can specify --avg to use more checkpoints for model averaging.
        """,
    )
    parser.add_argument(
        "--avg",
        type=int,
        default=9,
        help="Number of checkpoints to average. Automatically select "
        "consecutive checkpoints before the checkpoint specified by "
        "'--epoch' and '--iter'",
    )
    parser.add_argument(
        "--use-averaged-model",
        type=str2bool,
        default=True,
        help="Whether to load averaged model. Currently it only supports "
        "using --epoch. If True, it would decode with the averaged model "
        "over the epoch range from `epoch-avg` (excluded) to `epoch`."
        "Actually only the models with epoch number of `epoch-avg` and "
        "`epoch` are loaded for averaging. ",
    )
    parser.add_argument(
        "--exp-dir",
        type=str,
        default="pruned_transducer_stateless7_streaming/exp",
        help="""It specifies the directory where all training related
        files, e.g., checkpoints, log, etc, are saved
        """,
    )
    parser.add_argument(
        "--jit",
        type=str2bool,
        default=False,
        help="""True to save a model after applying torch.jit.script.
        It will generate a file named cpu_jit.pt
        Check ./jit_pretrained.py for how to use it.
        """,
    )
    parser.add_argument(
        "--context-size",
        type=int,
        default=2,
        help="The context size in the decoder. 1 means bigram; 2 means tri-gram",
    )
    add_model_arguments(parser)
    return parser
@torch.no_grad()
 def main():
    parser = get_parser()
    Tokenizer.add_arguments(parser)
    args = parser.parse_args()
    args.exp_dir = Path(args.exp_dir)
    params = get_params()
    params.update(vars(args))
    device = torch.device("cpu")
    if torch.cuda.is_available():
        device = torch.device("cuda", 0)
    logging.info(f"device: {device}")
    sp = Tokenizer.load(params.lang, params.lang_type)
    # <blk> is defined in local/prepare_lang_char.py
    params.blank_id = sp.piece_to_id("<blk>")
    params.vocab_size = sp.get_piece_size()
    logging.info(params)
    logging.info("About to create model")
    model = get_transducer_model(params)
    model.to(device)
    if not params.use_averaged_model:
        if params.iter > 0:
            filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
                : params.avg
            ]
            if len(filenames) == 0:
                raise ValueError(
                    f"No checkpoints found for"
                    f" --iter {params.iter}, --avg {params.avg}"
                )
            elif len(filenames) < params.avg:
                raise ValueError(
                    f"Not enough checkpoints ({len(filenames)}) found for"
                    f" --iter {params.iter}, --avg {params.avg}"
                )
            logging.info(f"averaging {filenames}")
            model.to(device)
            model.load_state_dict(average_checkpoints(filenames, device=device))
        elif params.avg == 1:
            load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
        else:
            start = params.epoch - params.avg + 1
            filenames = []
            for i in range(start, params.epoch + 1):
                if i >= 1:
                    filenames.append(f"{params.exp_dir}/epoch-{i}.pt")
            logging.info(f"averaging {filenames}")
            model.to(device)
            model.load_state_dict(average_checkpoints(filenames, device=device))
    else:
        if params.iter > 0:
            filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
                : params.avg + 1
            ]
            if len(filenames) == 0:
                raise ValueError(
                    f"No checkpoints found for"
                    f" --iter {params.iter}, --avg {params.avg}"
                )
            elif len(filenames) < params.avg + 1:
                raise ValueError(
                    f"Not enough checkpoints ({len(filenames)}) found for"
                    f" --iter {params.iter}, --avg {params.avg}"
                )
            filename_start = filenames[-1]
            filename_end = filenames[0]
            logging.info(
                "Calculating the averaged model over iteration checkpoints"
                f" from {filename_start} (excluded) to {filename_end}"
            )
            model.to(device)
            model.load_state_dict(
                average_checkpoints_with_averaged_model(
                    filename_start=filename_start,
                    filename_end=filename_end,
                    device=device,
                )
            )
        else:
            assert params.avg > 0, params.avg
            start = params.epoch - params.avg
            assert start >= 1, start
            filename_start = f"{params.exp_dir}/epoch-{start}.pt"
            filename_end = f"{params.exp_dir}/epoch-{params.epoch}.pt"
            logging.info(
                f"Calculating the averaged model over epoch range from "
                f"{start} (excluded) to {params.epoch}"
            )
            model.to(device)
            model.load_state_dict(
                average_checkpoints_with_averaged_model(
                    filename_start=filename_start,
                    filename_end=filename_end,
                    device=device,
                )
            )
    model.to("cpu")
    model.eval()
    if params.jit is True:
        convert_scaled_to_non_scaled(model, inplace=True)
        # We won't use the forward() method of the model in C++, so just ignore
        # it here.
        # Otherwise, one of its arguments is a ragged tensor and is not
        # torch scriptabe.
        model.__class__.forward = torch.jit.ignore(model.__class__.forward)
        logging.info("Using torch.jit.script")
        model = torch.jit.script(model)
        filename = params.exp_dir / "cpu_jit.pt"
        model.save(str(filename))
        logging.info(f"Saved to {filename}")
    else:
        logging.info("Not using torchscript. Export model.state_dict()")
        # Save it using a format so that it can be loaded
        # by :func:`load_checkpoint`
        filename = params.exp_dir / "pretrained.pt"
        torch.save({"model": model.state_dict()}, str(filename))
        logging.info(f"Saved to {filename}")
 if __name__ == "__main__":
    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    logging.basicConfig(format=formatter, level=logging.INFO)
    main()
--- a/egs/reazonspeech/ASR/zipformer/joiner.py
+++ b/egs/reazonspeech/ASR/zipformer/joiner.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/zipformer/joiner.py
--- a/egs/reazonspeech/ASR/zipformer/model.py
+++ b/egs/reazonspeech/ASR/zipformer/model.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/zipformer/model.py
--- a/egs/reazonspeech/ASR/zipformer/optim.py
+++ b/egs/reazonspeech/ASR/zipformer/optim.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/zipformer/optim.py
--- a/egs/reazonspeech/ASR/zipformer/pretrained.py
+++ b/egs/reazonspeech/ASR/zipformer/pretrained.py
@ -0,0 +1,347 @@
 #!/usr/bin/env python3
 # Copyright      2021  Xiaomi Corp.        (authors: Fangjun Kuang)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
 This script loads a checkpoint and uses it to decode waves.
 You can generate the checkpoint with the following command:
 ./pruned_transducer_stateless7_streaming/export.py \
  --exp-dir ./pruned_transducer_stateless7_streaming/exp \
  --lang data/lang_char \
  --epoch 20 \
  --avg 10
 Usage of this script:
 (1) greedy search
 ./pruned_transducer_stateless7_streaming/pretrained.py \
    --checkpoint ./pruned_transducer_stateless7_streaming/exp/pretrained.pt \
    --lang data/lang_char \
    --method greedy_search \
    /path/to/foo.wav \
    /path/to/bar.wav
 (2) beam search
 ./pruned_transducer_stateless7_streaming/pretrained.py \
    --checkpoint ./pruned_transducer_stateless7_streaming/exp/pretrained.pt \
    --lang data/lang_char \
    --method beam_search \
    --beam-size 4 \
    /path/to/foo.wav \
    /path/to/bar.wav
 (3) modified beam search
 ./pruned_transducer_stateless7_streaming/pretrained.py \
    --checkpoint ./pruned_transducer_stateless7_streaming/exp/pretrained.pt \
    --lang data/lang_char \
    --method modified_beam_search \
    --beam-size 4 \
    /path/to/foo.wav \
    /path/to/bar.wav
 (4) fast beam search
 ./pruned_transducer_stateless7_streaming/pretrained.py \
    --checkpoint ./pruned_transducer_stateless7_streaming/exp/pretrained.pt \
    --lang data/lang_char \
    --method fast_beam_search \
    --beam-size 4 \
    /path/to/foo.wav \
    /path/to/bar.wav
 You can also use `./pruned_transducer_stateless7_streaming/exp/epoch-xx.pt`.
 Note: ./pruned_transducer_stateless7_streaming/exp/pretrained.pt is generated by
 ./pruned_transducer_stateless7_streaming/export.py
 """
 import argparse
 import logging
 import math
 from typing import List
 import k2
 import kaldifeat
 import torch
 import torchaudio
 from beam_search import (
    beam_search,
    fast_beam_search_one_best,
    greedy_search,
    greedy_search_batch,
    modified_beam_search,
 )
 from tokenizer import Tokenizer
 from torch.nn.utils.rnn import pad_sequence
 from train import add_model_arguments, get_params, get_transducer_model
 def get_parser():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        "--checkpoint",
        type=str,
        required=True,
        help="Path to the checkpoint. "
        "The checkpoint is assumed to be saved by "
        "icefall.checkpoint.save_checkpoint().",
    )
    parser.add_argument(
        "--method",
        type=str,
        default="greedy_search",
        help="""Possible values are:
          - greedy_search
          - beam_search
          - modified_beam_search
          - fast_beam_search
        """,
    )
    parser.add_argument(
        "sound_files",
        type=str,
        nargs="+",
        help="The input sound file(s) to transcribe. "
        "Supported formats are those supported by torchaudio.load(). "
        "For example, wav and flac are supported. "
        "The sample rate has to be 16kHz.",
    )
    parser.add_argument(
        "--sample-rate",
        type=int,
        default=16000,
        help="The sample rate of the input sound file",
    )
    parser.add_argument(
        "--beam-size",
        type=int,
        default=4,
        help="""An integer indicating how many candidates we will keep for each
        frame. Used only when --method is beam_search or
        modified_beam_search.""",
    )
    parser.add_argument(
        "--beam",
        type=float,
        default=4,
        help="""A floating point value to calculate the cutoff score during beam
        search (i.e., `cutoff = max-score - beam`), which is the same as the
        `beam` in Kaldi.
        Used only when --method is fast_beam_search""",
    )
    parser.add_argument(
        "--max-contexts",
        type=int,
        default=4,
        help="""Used only when --method is fast_beam_search""",
    )
    parser.add_argument(
        "--max-states",
        type=int,
        default=8,
        help="""Used only when --method is fast_beam_search""",
    )
    parser.add_argument(
        "--context-size",
        type=int,
        default=2,
        help="The context size in the decoder. 1 means bigram; 2 means tri-gram",
    )
    parser.add_argument(
        "--max-sym-per-frame",
        type=int,
        default=1,
        help="""Maximum number of symbols per frame. Used only when
        --method is greedy_search.
        """,
    )
    add_model_arguments(parser)
    return parser
 def read_sound_files(
    filenames: List[str], expected_sample_rate: float
 ) -> List[torch.Tensor]:
    """Read a list of sound files into a list 1-D float32 torch tensors.
    Args:
      filenames:
        A list of sound filenames.
      expected_sample_rate:
        The expected sample rate of the sound files.
    Returns:
      Return a list of 1-D float32 torch tensors.
    """
    ans = []
    for f in filenames:
        wave, sample_rate = torchaudio.load(f)
        assert (
            sample_rate == expected_sample_rate
        ), f"expected sample rate: {expected_sample_rate}. Given: {sample_rate}"
        # We use only the first channel
        ans.append(wave[0])
    return ans
@torch.no_grad()
 def main():
    parser = get_parser()
    Tokenizer.add_arguments(parser)
    args = parser.parse_args()
    params = get_params()
    params.update(vars(args))
    sp = Tokenizer.load(params.lang, params.lang_type)
    # <blk> is defined in local/prepare_lang_char.py
    params.blank_id = sp.piece_to_id("<blk>")
    params.unk_id = sp.piece_to_id("<unk>")
    params.vocab_size = sp.get_piece_size()
    logging.info(f"{params}")
    device = torch.device("cpu")
    if torch.cuda.is_available():
        device = torch.device("cuda", 0)
    logging.info(f"device: {device}")
    logging.info("Creating model")
    model = get_transducer_model(params)
    num_param = sum([p.numel() for p in model.parameters()])
    logging.info(f"Number of model parameters: {num_param}")
    checkpoint = torch.load(args.checkpoint, map_location="cpu")
    model.load_state_dict(checkpoint["model"], strict=False)
    model.to(device)
    model.eval()
    model.device = device
    logging.info("Constructing Fbank computer")
    opts = kaldifeat.FbankOptions()
    opts.device = device
    opts.frame_opts.dither = 0
    opts.frame_opts.snip_edges = False
    opts.frame_opts.samp_freq = params.sample_rate
    opts.mel_opts.num_bins = params.feature_dim
    fbank = kaldifeat.Fbank(opts)
    logging.info(f"Reading sound files: {params.sound_files}")
    waves = read_sound_files(
        filenames=params.sound_files, expected_sample_rate=params.sample_rate
    )
    waves = [w.to(device) for w in waves]
    logging.info("Decoding started")
    features = fbank(waves)
    feature_lengths = [f.size(0) for f in features]
    features = pad_sequence(features, batch_first=True, padding_value=math.log(1e-10))
    feature_lengths = torch.tensor(feature_lengths, device=device)
    encoder_out, encoder_out_lens = model.encoder(x=features, x_lens=feature_lengths)
    num_waves = encoder_out.size(0)
    hyps = []
    msg = f"Using {params.method}"
    if params.method == "beam_search":
        msg += f" with beam size {params.beam_size}"
    logging.info(msg)
    if params.method == "fast_beam_search":
        decoding_graph = k2.trivial_graph(params.vocab_size - 1, device=device)
        hyp_tokens = fast_beam_search_one_best(
            model=model,
            decoding_graph=decoding_graph,
            encoder_out=encoder_out,
            encoder_out_lens=encoder_out_lens,
            beam=params.beam,
            max_contexts=params.max_contexts,
            max_states=params.max_states,
        )
        for hyp in sp.decode(hyp_tokens):
            hyps.append(hyp.split())
    elif params.method == "modified_beam_search":
        hyp_tokens = modified_beam_search(
            model=model,
            encoder_out=encoder_out,
            encoder_out_lens=encoder_out_lens,
            beam=params.beam_size,
        )
        for hyp in sp.decode(hyp_tokens):
            hyps.append(hyp.split())
    elif params.method == "greedy_search" and params.max_sym_per_frame == 1:
        hyp_tokens = greedy_search_batch(
            model=model,
            encoder_out=encoder_out,
            encoder_out_lens=encoder_out_lens,
        )
        for hyp in sp.decode(hyp_tokens):
            hyps.append(hyp.split())
    else:
        for i in range(num_waves):
            # fmt: off
            encoder_out_i = encoder_out[i:i+1, :encoder_out_lens[i]]
            # fmt: on
            if params.method == "greedy_search":
                hyp = greedy_search(
                    model=model,
                    encoder_out=encoder_out_i,
                    max_sym_per_frame=params.max_sym_per_frame,
                )
            elif params.method == "beam_search":
                hyp = beam_search(
                    model=model,
                    encoder_out=encoder_out_i,
                    beam=params.beam_size,
                )
            else:
                raise ValueError(f"Unsupported method: {params.method}")
            hyps.append(sp.decode(hyp).split())
    s = "\n"
    for filename, hyp in zip(params.sound_files, hyps):
        words = " ".join(hyp)
        s += f"{filename}:\n{words}\n\n"
    logging.info(s)
    logging.info("Decoding Done")
 if __name__ == "__main__":
    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    logging.basicConfig(format=formatter, level=logging.INFO)
    main()
--- a/egs/reazonspeech/ASR/zipformer/scaling.py
+++ b/egs/reazonspeech/ASR/zipformer/scaling.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/zipformer/scaling.py
--- a/egs/reazonspeech/ASR/zipformer/scaling_converter.py
+++ b/egs/reazonspeech/ASR/zipformer/scaling_converter.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/zipformer/scaling_converter.py
--- a/egs/reazonspeech/ASR/zipformer/streaming_beam_search.py
+++ b/egs/reazonspeech/ASR/zipformer/streaming_beam_search.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/zipformer/streaming_beam_search.py
--- a/egs/reazonspeech/ASR/zipformer/streaming_decode.py
+++ b/egs/reazonspeech/ASR/zipformer/streaming_decode.py
@ -0,0 +1,597 @@
 #!/usr/bin/env python3
 # Copyright 2022 Xiaomi Corporation (Authors: Wei Kang, Fangjun Kuang)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
 Usage:
 ./pruned_transducer_stateless7_streaming/streaming_decode.py \
  --epoch 28 \
  --avg 15 \
  --decode-chunk-len 32 \
  --exp-dir ./pruned_transducer_stateless7_streaming/exp \
  --decoding_method greedy_search \
  --lang data/lang_char \
  --num-decode-streams 2000
 """
 import argparse
 import logging
 import math
 from pathlib import Path
 from typing import Dict, List, Optional, Tuple
 import k2
 import numpy as np
 import torch
 import torch.nn as nn
 from asr_datamodule import ReazonSpeechAsrDataModule
 from decode import save_results
 from decode_stream import DecodeStream
 from kaldifeat import Fbank, FbankOptions
 from lhotse import CutSet
 from streaming_beam_search import (
    fast_beam_search_one_best,
    greedy_search,
    modified_beam_search,
 )
 from tokenizer import Tokenizer
 from torch.nn.utils.rnn import pad_sequence
 from train import add_model_arguments, get_params, get_transducer_model
 from zipformer import stack_states, unstack_states
 from icefall.checkpoint import (
    average_checkpoints,
    average_checkpoints_with_averaged_model,
    find_checkpoints,
    load_checkpoint,
 )
 from icefall.utils import AttributeDict, setup_logger, str2bool
 LOG_EPS = math.log(1e-10)
 def get_parser():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        "--epoch",
        type=int,
        default=28,
        help="""It specifies the checkpoint to use for decoding.
        Note: Epoch counts from 0.
        You can specify --avg to use more checkpoints for model averaging.""",
    )
    parser.add_argument(
        "--iter",
        type=int,
        default=0,
        help="""If positive, --epoch is ignored and it
        will use the checkpoint exp_dir/checkpoint-iter.pt.
        You can specify --avg to use more checkpoints for model averaging.
        """,
    )
    parser.add_argument(
        "--gpu",
        type=int,
        default=0,
    )
    parser.add_argument(
        "--avg",
        type=int,
        default=15,
        help="Number of checkpoints to average. Automatically select "
        "consecutive checkpoints before the checkpoint specified by "
        "'--epoch' and '--iter'",
    )
    parser.add_argument(
        "--use-averaged-model",
        type=str2bool,
        default=True,
        help="Whether to load averaged model. Currently it only supports "
        "using --epoch. If True, it would decode with the averaged model "
        "over the epoch range from `epoch-avg` (excluded) to `epoch`."
        "Actually only the models with epoch number of `epoch-avg` and "
        "`epoch` are loaded for averaging. ",
    )
    parser.add_argument(
        "--exp-dir",
        type=str,
        default="pruned_transducer_stateless2/exp",
        help="The experiment dir",
    )
    parser.add_argument(
        "--bpe-model",
        type=str,
        default="data/lang_bpe_500/bpe.model",
        help="Path to the BPE model",
    )
    parser.add_argument(
        "--decoding-method",
        type=str,
        default="greedy_search",
        help="""Supported decoding methods are:
        greedy_search
        modified_beam_search
        fast_beam_search
        """,
    )
    parser.add_argument(
        "--decoding-graph",
        type=str,
        default="",
        help="""Used only when --decoding-method is
        fast_beam_search""",
    )
    parser.add_argument(
        "--num_active_paths",
        type=int,
        default=4,
        help="""An interger indicating how many candidates we will keep for each
        frame. Used only when --decoding-method is modified_beam_search.""",
    )
    parser.add_argument(
        "--beam",
        type=float,
        default=4.0,
        help="""A floating point value to calculate the cutoff score during beam
        search (i.e., `cutoff = max-score - beam`), which is the same as the
        `beam` in Kaldi.
        Used only when --decoding-method is fast_beam_search""",
    )
    parser.add_argument(
        "--max-contexts",
        type=int,
        default=4,
        help="""Used only when --decoding-method is
        fast_beam_search""",
    )
    parser.add_argument(
        "--max-states",
        type=int,
        default=32,
        help="""Used only when --decoding-method is
        fast_beam_search""",
    )
    parser.add_argument(
        "--context-size",
        type=int,
        default=2,
        help="The context size in the decoder. 1 means bigram; 2 means tri-gram",
    )
    parser.add_argument(
        "--num-decode-streams",
        type=int,
        default=2000,
        help="The number of streams that can be decoded parallel.",
    )
    parser.add_argument(
        "--res-dir",
        type=Path,
        default=None,
        help="The path to save results.",
    )
    add_model_arguments(parser)
    return parser
 def decode_one_chunk(
    params: AttributeDict,
    model: nn.Module,
    decode_streams: List[DecodeStream],
 ) -> List[int]:
    """Decode one chunk frames of features for each decode_streams and
    return the indexes of finished streams in a List.
    Args:
      params:
        It's the return value of :func:`get_params`.
      model:
        The neural model.
      decode_streams:
        A List of DecodeStream, each belonging to a utterance.
    Returns:
      Return a List containing which DecodeStreams are finished.
    """
    device = model.device
    features = []
    feature_lens = []
    states = []
    processed_lens = []
    for stream in decode_streams:
        feat, feat_len = stream.get_feature_frames(params.decode_chunk_len)
        features.append(feat)
        feature_lens.append(feat_len)
        states.append(stream.states)
        processed_lens.append(stream.done_frames)
    feature_lens = torch.tensor(feature_lens, device=device)
    features = pad_sequence(features, batch_first=True, padding_value=LOG_EPS)
    # We subsample features with ((x_len - 7) // 2 + 1) // 2 and the max downsampling
    # factor in encoders is 8.
    # After feature embedding (x_len - 7) // 2, we have (23 - 7) // 2 = 8.
    tail_length = 23
    if features.size(1) < tail_length:
        pad_length = tail_length - features.size(1)
        feature_lens += pad_length
        features = torch.nn.functional.pad(
            features,
            (0, 0, 0, pad_length),
            mode="constant",
            value=LOG_EPS,
        )
    states = stack_states(states)
    processed_lens = torch.tensor(processed_lens, device=device)
    encoder_out, encoder_out_lens, new_states = model.encoder.streaming_forward(
        x=features,
        x_lens=feature_lens,
        states=states,
    )
    encoder_out = model.joiner.encoder_proj(encoder_out)
    if params.decoding_method == "greedy_search":
        greedy_search(model=model, encoder_out=encoder_out, streams=decode_streams)
    elif params.decoding_method == "fast_beam_search":
        processed_lens = processed_lens + encoder_out_lens
        fast_beam_search_one_best(
            model=model,
            encoder_out=encoder_out,
            processed_lens=processed_lens,
            streams=decode_streams,
            beam=params.beam,
            max_states=params.max_states,
            max_contexts=params.max_contexts,
        )
    elif params.decoding_method == "modified_beam_search":
        modified_beam_search(
            model=model,
            streams=decode_streams,
            encoder_out=encoder_out,
            num_active_paths=params.num_active_paths,
        )
    else:
        raise ValueError(f"Unsupported decoding method: {params.decoding_method}")
    states = unstack_states(new_states)
    finished_streams = []
    for i in range(len(decode_streams)):
        decode_streams[i].states = states[i]
        decode_streams[i].done_frames += encoder_out_lens[i]
        if decode_streams[i].done:
            finished_streams.append(i)
    return finished_streams
 def decode_dataset(
    cuts: CutSet,
    params: AttributeDict,
    model: nn.Module,
    sp: Tokenizer,
    decoding_graph: Optional[k2.Fsa] = None,
 ) -> Dict[str, List[Tuple[List[str], List[str]]]]:
    """Decode dataset.
    Args:
      cuts:
        Lhotse Cutset containing the dataset to decode.
      params:
        It is returned by :func:`get_params`.
      model:
        The neural model.
      sp:
        The BPE model.
      decoding_graph:
        The decoding graph. Can be either a `k2.trivial_graph` or HLG, Used
        only when --decoding_method is fast_beam_search.
    Returns:
      Return a dict, whose key may be "greedy_search" if greedy search
      is used, or it may be "beam_7" if beam size of 7 is used.
      Its value is a list of tuples. Each tuple contains two elements:
      The first is the reference transcript, and the second is the
      predicted result.
    """
    device = model.device
    opts = FbankOptions()
    opts.device = device
    opts.frame_opts.dither = 0
    opts.frame_opts.snip_edges = False
    opts.frame_opts.samp_freq = 16000
    opts.mel_opts.num_bins = 80
    log_interval = 50
    decode_results = []
    # Contain decode streams currently running.
    decode_streams = []
    for num, cut in enumerate(cuts):
        # each utterance has a DecodeStream.
        initial_states = model.encoder.get_init_state(device=device)
        decode_stream = DecodeStream(
            params=params,
            cut_id=cut.id,
            initial_states=initial_states,
            decoding_graph=decoding_graph,
            device=device,
        )
        audio: np.ndarray = cut.load_audio()
        # audio.shape: (1, num_samples)
        assert len(audio.shape) == 2
        assert audio.shape[0] == 1, "Should be single channel"
        assert audio.dtype == np.float32, audio.dtype
        # The trained model is using normalized samples
        assert audio.max() <= 1, "Should be normalized to [-1, 1])"
        samples = torch.from_numpy(audio).squeeze(0)
        fbank = Fbank(opts)
        feature = fbank(samples.to(device))
        decode_stream.set_features(feature, tail_pad_len=params.decode_chunk_len)
        decode_stream.ground_truth = cut.supervisions[0].custom[params.transcript_mode]
        decode_streams.append(decode_stream)
        while len(decode_streams) >= params.num_decode_streams:
            finished_streams = decode_one_chunk(
                params=params, model=model, decode_streams=decode_streams
            )
            for i in sorted(finished_streams, reverse=True):
                decode_results.append(
                    (
                        decode_streams[i].id,
                        sp.text2word(decode_streams[i].ground_truth),
                        sp.text2word(sp.decode(decode_streams[i].decoding_result())),
                    )
                )
                del decode_streams[i]
        if num % log_interval == 0:
            logging.info(f"Cuts processed until now is {num}.")
    # decode final chunks of last sequences
    while len(decode_streams):
        finished_streams = decode_one_chunk(
            params=params, model=model, decode_streams=decode_streams
        )
        for i in sorted(finished_streams, reverse=True):
            decode_results.append(
                (
                    decode_streams[i].id,
                    sp.text2word(decode_streams[i].ground_truth),
                    sp.text2word(sp.decode(decode_streams[i].decoding_result())),
                )
            )
            del decode_streams[i]
    if params.decoding_method == "greedy_search":
        key = "greedy_search"
    elif params.decoding_method == "fast_beam_search":
        key = (
            f"beam_{params.beam}_"
            f"max_contexts_{params.max_contexts}_"
            f"max_states_{params.max_states}"
        )
    elif params.decoding_method == "modified_beam_search":
        key = f"num_active_paths_{params.num_active_paths}"
    else:
        raise ValueError(f"Unsupported decoding method: {params.decoding_method}")
    return {key: decode_results}
@torch.no_grad()
 def main():
    parser = get_parser()
    ReazonSpeechAsrDataModule.add_arguments(parser)
    Tokenizer.add_arguments(parser)
    args = parser.parse_args()
    args.exp_dir = Path(args.exp_dir)
    params = get_params()
    params.update(vars(args))
    if not params.res_dir:
        params.res_dir = params.exp_dir / "streaming" / params.decoding_method
    if params.iter > 0:
        params.suffix = f"iter-{params.iter}-avg-{params.avg}"
    else:
        params.suffix = f"epoch-{params.epoch}-avg-{params.avg}"
    # for streaming
    params.suffix += f"-streaming-chunk-size-{params.decode_chunk_len}"
    # for fast_beam_search
    if params.decoding_method == "fast_beam_search":
        params.suffix += f"-beam-{params.beam}"
        params.suffix += f"-max-contexts-{params.max_contexts}"
        params.suffix += f"-max-states-{params.max_states}"
    if params.use_averaged_model:
        params.suffix += "-use-averaged-model"
    setup_logger(f"{params.res_dir}/log-decode-{params.suffix}")
    logging.info("Decoding started")
    device = torch.device("cpu")
    if torch.cuda.is_available():
        device = torch.device("cuda", params.gpu)
    logging.info(f"Device: {device}")
    sp = Tokenizer.load(params.lang, params.lang_type)
    # <blk> and <unk> is defined in local/prepare_lang_char.py
    params.blank_id = sp.piece_to_id("<blk>")
    params.unk_id = sp.piece_to_id("<unk>")
    params.vocab_size = sp.get_piece_size()
    logging.info(params)
    logging.info("About to create model")
    model = get_transducer_model(params)
    if not params.use_averaged_model:
        if params.iter > 0:
            filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
                : params.avg
            ]
            if len(filenames) == 0:
                raise ValueError(
                    f"No checkpoints found for"
                    f" --iter {params.iter}, --avg {params.avg}"
                )
            elif len(filenames) < params.avg:
                raise ValueError(
                    f"Not enough checkpoints ({len(filenames)}) found for"
                    f" --iter {params.iter}, --avg {params.avg}"
                )
            logging.info(f"averaging {filenames}")
            model.to(device)
            model.load_state_dict(average_checkpoints(filenames, device=device))
        elif params.avg == 1:
            load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
        else:
            start = params.epoch - params.avg + 1
            filenames = []
            for i in range(start, params.epoch + 1):
                if start >= 0:
                    filenames.append(f"{params.exp_dir}/epoch-{i}.pt")
            logging.info(f"averaging {filenames}")
            model.to(device)
            model.load_state_dict(average_checkpoints(filenames, device=device))
    else:
        if params.iter > 0:
            filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
                : params.avg + 1
            ]
            if len(filenames) == 0:
                raise ValueError(
                    f"No checkpoints found for"
                    f" --iter {params.iter}, --avg {params.avg}"
                )
            elif len(filenames) < params.avg + 1:
                raise ValueError(
                    f"Not enough checkpoints ({len(filenames)}) found for"
                    f" --iter {params.iter}, --avg {params.avg}"
                )
            filename_start = filenames[-1]
            filename_end = filenames[0]
            logging.info(
                "Calculating the averaged model over iteration checkpoints"
                f" from {filename_start} (excluded) to {filename_end}"
            )
            model.to(device)
            model.load_state_dict(
                average_checkpoints_with_averaged_model(
                    filename_start=filename_start,
                    filename_end=filename_end,
                    device=device,
                )
            )
        else:
            assert params.avg > 0, params.avg
            start = params.epoch - params.avg
            assert start >= 1, start
            filename_start = f"{params.exp_dir}/epoch-{start}.pt"
            filename_end = f"{params.exp_dir}/epoch-{params.epoch}.pt"
            logging.info(
                f"Calculating the averaged model over epoch range from "
                f"{start} (excluded) to {params.epoch}"
            )
            model.to(device)
            model.load_state_dict(
                average_checkpoints_with_averaged_model(
                    filename_start=filename_start,
                    filename_end=filename_end,
                    device=device,
                )
            )
    model.to(device)
    model.eval()
    model.device = device
    decoding_graph = None
    if params.decoding_graph:
        decoding_graph = k2.Fsa.from_dict(
            torch.load(params.decoding_graph, map_location=device)
        )
    elif params.decoding_method == "fast_beam_search":
        decoding_graph = k2.trivial_graph(params.vocab_size - 1, device=device)
    num_param = sum([p.numel() for p in model.parameters()])
    logging.info(f"Number of model parameters: {num_param}")
    args.return_cuts = True
    reazonspeech_corpus = ReazonSpeechAsrDataModule(args)
    for subdir in ["valid"]:
        results_dict = decode_dataset(
            cuts=getattr(reazonspeech_corpus, f"{subdir}_cuts")(),
            params=params,
            model=model,
            sp=sp,
            decoding_graph=decoding_graph,
        )
        tot_err = save_results(
            params=params, test_set_name=subdir, results_dict=results_dict
        )
        with (
            params.res_dir
            / (
                f"{subdir}-{params.decode_chunk_len}"
                f"_{params.avg}_{params.epoch}.cer"
            )
        ).open("w") as fout:
            if len(tot_err) == 1:
                fout.write(f"{tot_err[0][1]}")
            else:
                fout.write("\n".join(f"{k}\t{v}") for k, v in tot_err)
    logging.info("Done!")
 if __name__ == "__main__":
    main()
--- a/egs/reazonspeech/ASR/zipformer/tokenizer.py
+++ b/egs/reazonspeech/ASR/zipformer/tokenizer.py
@ -0,0 +1 @@
 ../local/utils/tokenizer.py
--- a/egs/reazonspeech/ASR/zipformer/train.py
+++ b/egs/reazonspeech/ASR/zipformer/train.py
--- a/egs/reazonspeech/ASR/zipformer/zipformer.py
+++ b/egs/reazonspeech/ASR/zipformer/zipformer.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/zipformer/zipformer.py
		`@ -0,0 +1 @@`
							`../../../librispeech/ASR/zipformer/beam_search.py`
		`@ -0,0 +1 @@`
							`../../../librispeech/ASR/zipformer/decode_stream.py`
		`@ -0,0 +1 @@`
							`../../../librispeech/ASR/zipformer/encoder_interface.py`
		`@ -0,0 +1 @@`
							`../../../librispeech/ASR/zipformer/joiner.py`
		`@ -0,0 +1 @@`
							`../../../librispeech/ASR/zipformer/scaling.py`
		`@ -0,0 +1 @@`
							`../../../librispeech/ASR/zipformer/streaming_beam_search.py`
		`@ -0,0 +1 @@`
							`../../../librispeech/ASR/zipformer/zipformer.py`