diff --git a/egs/librispeech/SSL/hubert/beam_search.py b/egs/librispeech/SSL/hubert/beam_search.py
deleted file mode 100644
index 7fcd242fc..000000000
--- a/egs/librispeech/SSL/hubert/beam_search.py
+++ /dev/null
@@ -1,2942 +0,0 @@
-# Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang
-#                                                  Xiaoyu Yang)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import warnings
-from dataclasses import dataclass, field
-from typing import Dict, List, Optional, Tuple, Union
-
-import k2
-import sentencepiece as spm
-import torch
-from torch import nn
-
-from icefall import ContextGraph, ContextState, NgramLm, NgramLmStateCost
-from icefall.decode import Nbest, one_best_decoding
-from icefall.lm_wrapper import LmScorer
-from icefall.rnn_lm.model import RnnLmModel
-from icefall.transformer_lm.model import TransformerLM
-from icefall.utils import (
-    DecodingResults,
-    add_eos,
-    add_sos,
-    get_texts,
-    get_texts_with_timestamp,
-)
-
-
-def fast_beam_search_one_best(
-    model: nn.Module,
-    decoding_graph: k2.Fsa,
-    encoder_out: torch.Tensor,
-    encoder_out_lens: torch.Tensor,
-    beam: float,
-    max_states: int,
-    max_contexts: int,
-    temperature: float = 1.0,
-    ilme_scale: float = 0.0,
-    blank_penalty: float = 0.0,
-    return_timestamps: bool = False,
-    allow_partial: bool = False,
-) -> Union[List[List[int]], DecodingResults]:
-    """It limits the maximum number of symbols per frame to 1.
-
-    A lattice is first obtained using fast beam search, and then
-    the shortest path within the lattice is used as the final output.
-
-    Args:
-      model:
-        An instance of `Transducer`.
-      decoding_graph:
-        Decoding graph used for decoding, may be a TrivialGraph or a LG.
-      encoder_out:
-        A tensor of shape (N, T, C) from the encoder.
-      encoder_out_lens:
-        A tensor of shape (N,) containing the number of frames in `encoder_out`
-        before padding.
-      beam:
-        Beam value, similar to the beam used in Kaldi..
-      max_states:
-        Max states per stream per frame.
-      max_contexts:
-        Max contexts pre stream per frame.
-      temperature:
-        Softmax temperature.
-      return_timestamps:
-        Whether to return timestamps.
-    Returns:
-      If return_timestamps is False, return the decoded result.
-      Else, return a DecodingResults object containing
-      decoded result and corresponding timestamps.
-    """
-    lattice = fast_beam_search(
-        model=model,
-        decoding_graph=decoding_graph,
-        encoder_out=encoder_out,
-        encoder_out_lens=encoder_out_lens,
-        beam=beam,
-        max_states=max_states,
-        max_contexts=max_contexts,
-        temperature=temperature,
-        ilme_scale=ilme_scale,
-        allow_partial=allow_partial,
-        blank_penalty=blank_penalty,
-    )
-
-    best_path = one_best_decoding(lattice)
-
-    if not return_timestamps:
-        return get_texts(best_path)
-    else:
-        return get_texts_with_timestamp(best_path)
-
-
-def fast_beam_search_nbest_LG(
-    model: nn.Module,
-    decoding_graph: k2.Fsa,
-    encoder_out: torch.Tensor,
-    encoder_out_lens: torch.Tensor,
-    beam: float,
-    max_states: int,
-    max_contexts: int,
-    num_paths: int,
-    nbest_scale: float = 0.5,
-    use_double_scores: bool = True,
-    temperature: float = 1.0,
-    blank_penalty: float = 0.0,
-    ilme_scale: float = 0.0,
-    return_timestamps: bool = False,
-    allow_partial: bool = False,
-) -> Union[List[List[int]], DecodingResults]:
-    """It limits the maximum number of symbols per frame to 1.
-
-    The process to get the results is:
-     - (1) Use fast beam search to get a lattice
-     - (2) Select `num_paths` paths from the lattice using k2.random_paths()
-     - (3) Unique the selected paths
-     - (4) Intersect the selected paths with the lattice and compute the
-           shortest path from the intersection result
-     - (5) The path with the largest score is used as the decoding output.
-
-    Args:
-      model:
-        An instance of `Transducer`.
-      decoding_graph:
-        Decoding graph used for decoding, may be a TrivialGraph or a LG.
-      encoder_out:
-        A tensor of shape (N, T, C) from the encoder.
-      encoder_out_lens:
-        A tensor of shape (N,) containing the number of frames in `encoder_out`
-        before padding.
-      beam:
-        Beam value, similar to the beam used in Kaldi..
-      max_states:
-        Max states per stream per frame.
-      max_contexts:
-        Max contexts pre stream per frame.
-      num_paths:
-        Number of paths to extract from the decoded lattice.
-      nbest_scale:
-        It's the scale applied to the lattice.scores. A smaller value
-        yields more unique paths.
-      use_double_scores:
-        True to use double precision for computation. False to use
-        single precision.
-      temperature:
-        Softmax temperature.
-      return_timestamps:
-        Whether to return timestamps.
-    Returns:
-      If return_timestamps is False, return the decoded result.
-      Else, return a DecodingResults object containing
-      decoded result and corresponding timestamps.
-    """
-    lattice = fast_beam_search(
-        model=model,
-        decoding_graph=decoding_graph,
-        encoder_out=encoder_out,
-        encoder_out_lens=encoder_out_lens,
-        beam=beam,
-        max_states=max_states,
-        max_contexts=max_contexts,
-        temperature=temperature,
-        allow_partial=allow_partial,
-        blank_penalty=blank_penalty,
-        ilme_scale=ilme_scale,
-    )
-
-    nbest = Nbest.from_lattice(
-        lattice=lattice,
-        num_paths=num_paths,
-        use_double_scores=use_double_scores,
-        nbest_scale=nbest_scale,
-    )
-
-    # The following code is modified from nbest.intersect()
-    word_fsa = k2.invert(nbest.fsa)
-    if hasattr(lattice, "aux_labels"):
-        # delete token IDs as it is not needed
-        del word_fsa.aux_labels
-    word_fsa.scores.zero_()
-    word_fsa_with_epsilon_loops = k2.linear_fsa_with_self_loops(word_fsa)
-    path_to_utt_map = nbest.shape.row_ids(1)
-
-    if hasattr(lattice, "aux_labels"):
-        # lattice has token IDs as labels and word IDs as aux_labels.
-        # inv_lattice has word IDs as labels and token IDs as aux_labels
-        inv_lattice = k2.invert(lattice)
-        inv_lattice = k2.arc_sort(inv_lattice)
-    else:
-        inv_lattice = k2.arc_sort(lattice)
-
-    if inv_lattice.shape[0] == 1:
-        path_lattice = k2.intersect_device(
-            inv_lattice,
-            word_fsa_with_epsilon_loops,
-            b_to_a_map=torch.zeros_like(path_to_utt_map),
-            sorted_match_a=True,
-        )
-    else:
-        path_lattice = k2.intersect_device(
-            inv_lattice,
-            word_fsa_with_epsilon_loops,
-            b_to_a_map=path_to_utt_map,
-            sorted_match_a=True,
-        )
-
-    # path_lattice has word IDs as labels and token IDs as aux_labels
-    path_lattice = k2.top_sort(k2.connect(path_lattice))
-    tot_scores = path_lattice.get_tot_scores(
-        use_double_scores=use_double_scores,
-        log_semiring=True,  # Note: we always use True
-    )
-    # See https://github.com/k2-fsa/icefall/pull/420 for why
-    # we always use log_semiring=True
-
-    ragged_tot_scores = k2.RaggedTensor(nbest.shape, tot_scores)
-    best_hyp_indexes = ragged_tot_scores.argmax()
-    best_path = k2.index_fsa(nbest.fsa, best_hyp_indexes)
-
-    if not return_timestamps:
-        return get_texts(best_path)
-    else:
-        return get_texts_with_timestamp(best_path)
-
-
-def fast_beam_search_nbest(
-    model: nn.Module,
-    decoding_graph: k2.Fsa,
-    encoder_out: torch.Tensor,
-    encoder_out_lens: torch.Tensor,
-    beam: float,
-    max_states: int,
-    max_contexts: int,
-    num_paths: int,
-    nbest_scale: float = 0.5,
-    use_double_scores: bool = True,
-    temperature: float = 1.0,
-    blank_penalty: float = 0.0,
-    return_timestamps: bool = False,
-    allow_partial: bool = False,
-) -> Union[List[List[int]], DecodingResults]:
-    """It limits the maximum number of symbols per frame to 1.
-
-    The process to get the results is:
-     - (1) Use fast beam search to get a lattice
-     - (2) Select `num_paths` paths from the lattice using k2.random_paths()
-     - (3) Unique the selected paths
-     - (4) Intersect the selected paths with the lattice and compute the
-           shortest path from the intersection result
-     - (5) The path with the largest score is used as the decoding output.
-
-    Args:
-      model:
-        An instance of `Transducer`.
-      decoding_graph:
-        Decoding graph used for decoding, may be a TrivialGraph or a LG.
-      encoder_out:
-        A tensor of shape (N, T, C) from the encoder.
-      encoder_out_lens:
-        A tensor of shape (N,) containing the number of frames in `encoder_out`
-        before padding.
-      beam:
-        Beam value, similar to the beam used in Kaldi..
-      max_states:
-        Max states per stream per frame.
-      max_contexts:
-        Max contexts pre stream per frame.
-      num_paths:
-        Number of paths to extract from the decoded lattice.
-      nbest_scale:
-        It's the scale applied to the lattice.scores. A smaller value
-        yields more unique paths.
-      use_double_scores:
-        True to use double precision for computation. False to use
-        single precision.
-      temperature:
-        Softmax temperature.
-      return_timestamps:
-        Whether to return timestamps.
-    Returns:
-      If return_timestamps is False, return the decoded result.
-      Else, return a DecodingResults object containing
-      decoded result and corresponding timestamps.
-    """
-    lattice = fast_beam_search(
-        model=model,
-        decoding_graph=decoding_graph,
-        encoder_out=encoder_out,
-        encoder_out_lens=encoder_out_lens,
-        beam=beam,
-        max_states=max_states,
-        max_contexts=max_contexts,
-        blank_penalty=blank_penalty,
-        temperature=temperature,
-        allow_partial=allow_partial,
-    )
-
-    nbest = Nbest.from_lattice(
-        lattice=lattice,
-        num_paths=num_paths,
-        use_double_scores=use_double_scores,
-        nbest_scale=nbest_scale,
-    )
-
-    # at this point, nbest.fsa.scores are all zeros.
-
-    nbest = nbest.intersect(lattice)
-    # Now nbest.fsa.scores contains acoustic scores
-
-    max_indexes = nbest.tot_scores().argmax()
-
-    best_path = k2.index_fsa(nbest.fsa, max_indexes)
-
-    if not return_timestamps:
-        return get_texts(best_path)
-    else:
-        return get_texts_with_timestamp(best_path)
-
-
-def fast_beam_search_nbest_oracle(
-    model: nn.Module,
-    decoding_graph: k2.Fsa,
-    encoder_out: torch.Tensor,
-    encoder_out_lens: torch.Tensor,
-    beam: float,
-    max_states: int,
-    max_contexts: int,
-    num_paths: int,
-    ref_texts: List[List[int]],
-    use_double_scores: bool = True,
-    nbest_scale: float = 0.5,
-    temperature: float = 1.0,
-    blank_penalty: float = 0.0,
-    return_timestamps: bool = False,
-    allow_partial: bool = False,
-) -> Union[List[List[int]], DecodingResults]:
-    """It limits the maximum number of symbols per frame to 1.
-
-    A lattice is first obtained using fast beam search, and then
-    we select `num_paths` linear paths from the lattice. The path
-    that has the minimum edit distance with the given reference transcript
-    is used as the output.
-
-    This is the best result we can achieve for any nbest based rescoring
-    methods.
-
-    Args:
-      model:
-        An instance of `Transducer`.
-      decoding_graph:
-        Decoding graph used for decoding, may be a TrivialGraph or a LG.
-      encoder_out:
-        A tensor of shape (N, T, C) from the encoder.
-      encoder_out_lens:
-        A tensor of shape (N,) containing the number of frames in `encoder_out`
-        before padding.
-      beam:
-        Beam value, similar to the beam used in Kaldi..
-      max_states:
-        Max states per stream per frame.
-      max_contexts:
-        Max contexts pre stream per frame.
-      num_paths:
-        Number of paths to extract from the decoded lattice.
-      ref_texts:
-        A list-of-list of integers containing the reference transcripts.
-        If the decoding_graph is a trivial_graph, the integer ID is the
-        BPE token ID.
-      use_double_scores:
-        True to use double precision for computation. False to use
-        single precision.
-      nbest_scale:
-        It's the scale applied to the lattice.scores. A smaller value
-        yields more unique paths.
-      temperature:
-        Softmax temperature.
-      return_timestamps:
-        Whether to return timestamps.
-    Returns:
-      If return_timestamps is False, return the decoded result.
-      Else, return a DecodingResults object containing
-      decoded result and corresponding timestamps.
-    """
-    lattice = fast_beam_search(
-        model=model,
-        decoding_graph=decoding_graph,
-        encoder_out=encoder_out,
-        encoder_out_lens=encoder_out_lens,
-        beam=beam,
-        max_states=max_states,
-        max_contexts=max_contexts,
-        temperature=temperature,
-        allow_partial=allow_partial,
-        blank_penalty=blank_penalty,
-    )
-
-    nbest = Nbest.from_lattice(
-        lattice=lattice,
-        num_paths=num_paths,
-        use_double_scores=use_double_scores,
-        nbest_scale=nbest_scale,
-    )
-
-    hyps = nbest.build_levenshtein_graphs()
-    refs = k2.levenshtein_graph(ref_texts, device=hyps.device)
-
-    levenshtein_alignment = k2.levenshtein_alignment(
-        refs=refs,
-        hyps=hyps,
-        hyp_to_ref_map=nbest.shape.row_ids(1),
-        sorted_match_ref=True,
-    )
-
-    tot_scores = levenshtein_alignment.get_tot_scores(
-        use_double_scores=False, log_semiring=False
-    )
-    ragged_tot_scores = k2.RaggedTensor(nbest.shape, tot_scores)
-
-    max_indexes = ragged_tot_scores.argmax()
-
-    best_path = k2.index_fsa(nbest.fsa, max_indexes)
-
-    if not return_timestamps:
-        return get_texts(best_path)
-    else:
-        return get_texts_with_timestamp(best_path)
-
-
-def fast_beam_search(
-    model: nn.Module,
-    decoding_graph: k2.Fsa,
-    encoder_out: torch.Tensor,
-    encoder_out_lens: torch.Tensor,
-    beam: float,
-    max_states: int,
-    max_contexts: int,
-    temperature: float = 1.0,
-    subtract_ilme: bool = False,
-    ilme_scale: float = 0.1,
-    allow_partial: bool = False,
-    blank_penalty: float = 0.0,
-) -> k2.Fsa:
-    """It limits the maximum number of symbols per frame to 1.
-
-    Args:
-      model:
-        An instance of `Transducer`.
-      decoding_graph:
-        Decoding graph used for decoding, may be a TrivialGraph or a LG.
-      encoder_out:
-        A tensor of shape (N, T, C) from the encoder.
-      encoder_out_lens:
-        A tensor of shape (N,) containing the number of frames in `encoder_out`
-        before padding.
-      beam:
-        Beam value, similar to the beam used in Kaldi..
-      max_states:
-        Max states per stream per frame.
-      max_contexts:
-        Max contexts pre stream per frame.
-      temperature:
-        Softmax temperature.
-    Returns:
-      Return an FsaVec with axes [utt][state][arc] containing the decoded
-      lattice. Note: When the input graph is a TrivialGraph, the returned
-      lattice is actually an acceptor.
-    """
-    assert encoder_out.ndim == 3
-
-    context_size = model.decoder.context_size
-    vocab_size = model.decoder.vocab_size
-
-    B, T, C = encoder_out.shape
-
-    config = k2.RnntDecodingConfig(
-        vocab_size=vocab_size,
-        decoder_history_len=context_size,
-        beam=beam,
-        max_contexts=max_contexts,
-        max_states=max_states,
-    )
-    individual_streams = []
-    for i in range(B):
-        individual_streams.append(k2.RnntDecodingStream(decoding_graph))
-    decoding_streams = k2.RnntDecodingStreams(individual_streams, config)
-
-    encoder_out = model.joiner.encoder_proj(encoder_out)
-
-    for t in range(T):
-        # shape is a RaggedShape of shape (B, context)
-        # contexts is a Tensor of shape (shape.NumElements(), context_size)
-        shape, contexts = decoding_streams.get_contexts()
-        # `nn.Embedding()` in torch below v1.7.1 supports only torch.int64
-        contexts = contexts.to(torch.int64)
-        # decoder_out is of shape (shape.NumElements(), 1, decoder_out_dim)
-        decoder_out = model.decoder(contexts, need_pad=False)
-        decoder_out = model.joiner.decoder_proj(decoder_out)
-        # current_encoder_out is of shape
-        # (shape.NumElements(), 1, joiner_dim)
-        # fmt: off
-        current_encoder_out = torch.index_select(
-            encoder_out[:, t:t + 1, :], 0, shape.row_ids(1).to(torch.int64)
-        )
-        # fmt: on
-        logits = model.joiner(
-            current_encoder_out.unsqueeze(2),
-            decoder_out.unsqueeze(1),
-            project_input=False,
-        )
-        logits = logits.squeeze(1).squeeze(1)
-
-        if blank_penalty != 0:
-            logits[:, 0] -= blank_penalty
-
-        log_probs = (logits / temperature).log_softmax(dim=-1)
-
-        if ilme_scale != 0:
-            ilme_logits = model.joiner(
-                torch.zeros_like(
-                    current_encoder_out, device=current_encoder_out.device
-                ).unsqueeze(2),
-                decoder_out.unsqueeze(1),
-                project_input=False,
-            )
-            ilme_logits = ilme_logits.squeeze(1).squeeze(1)
-            if blank_penalty != 0:
-                ilme_logits[:, 0] -= blank_penalty
-            ilme_log_probs = (ilme_logits / temperature).log_softmax(dim=-1)
-            log_probs -= ilme_scale * ilme_log_probs
-
-        decoding_streams.advance(log_probs)
-    decoding_streams.terminate_and_flush_to_streams()
-    lattice = decoding_streams.format_output(
-        encoder_out_lens.tolist(), allow_partial=allow_partial
-    )
-
-    return lattice
-
-
-def greedy_search(
-    model: nn.Module,
-    encoder_out: torch.Tensor,
-    max_sym_per_frame: int,
-    blank_penalty: float = 0.0,
-    return_timestamps: bool = False,
-) -> Union[List[int], DecodingResults]:
-    """Greedy search for a single utterance.
-    Args:
-      model:
-        An instance of `Transducer`.
-      encoder_out:
-        A tensor of shape (N, T, C) from the encoder. Support only N==1 for now.
-      max_sym_per_frame:
-        Maximum number of symbols per frame. If it is set to 0, the WER
-        would be 100%.
-      return_timestamps:
-        Whether to return timestamps.
-    Returns:
-      If return_timestamps is False, return the decoded result.
-      Else, return a DecodingResults object containing
-      decoded result and corresponding timestamps.
-    """
-    assert encoder_out.ndim == 3
-
-    # support only batch_size == 1 for now
-    assert encoder_out.size(0) == 1, encoder_out.size(0)
-
-    blank_id = model.decoder.blank_id
-    context_size = model.decoder.context_size
-    unk_id = getattr(model, "unk_id", blank_id)
-
-    device = next(model.parameters()).device
-
-    decoder_input = torch.tensor(
-        [-1] * (context_size - 1) + [blank_id], device=device, dtype=torch.int64
-    ).reshape(1, context_size)
-
-    decoder_out = model.decoder(decoder_input, need_pad=False)
-    decoder_out = model.joiner.decoder_proj(decoder_out)
-
-    encoder_out = model.joiner.encoder_proj(encoder_out)
-
-    T = encoder_out.size(1)
-    t = 0
-    hyp = [blank_id] * context_size
-
-    # timestamp[i] is the frame index after subsampling
-    # on which hyp[i] is decoded
-    timestamp = []
-
-    # Maximum symbols per utterance.
-    max_sym_per_utt = 1000
-
-    # symbols per frame
-    sym_per_frame = 0
-
-    # symbols per utterance decoded so far
-    sym_per_utt = 0
-
-    while t < T and sym_per_utt < max_sym_per_utt:
-        if sym_per_frame >= max_sym_per_frame:
-            sym_per_frame = 0
-            t += 1
-            continue
-
-        # fmt: off
-        current_encoder_out = encoder_out[:, t:t+1, :].unsqueeze(2)
-        # fmt: on
-        logits = model.joiner(
-            current_encoder_out, decoder_out.unsqueeze(1), project_input=False
-        )
-        # logits is (1, 1, 1, vocab_size)
-
-        if blank_penalty != 0:
-            logits[:, :, :, 0] -= blank_penalty
-
-        y = logits.argmax().item()
-        if y not in (blank_id, unk_id):
-            hyp.append(y)
-            timestamp.append(t)
-            decoder_input = torch.tensor([hyp[-context_size:]], device=device).reshape(
-                1, context_size
-            )
-
-            decoder_out = model.decoder(decoder_input, need_pad=False)
-            decoder_out = model.joiner.decoder_proj(decoder_out)
-
-            sym_per_utt += 1
-            sym_per_frame += 1
-        else:
-            sym_per_frame = 0
-            t += 1
-    hyp = hyp[context_size:]  # remove blanks
-
-    if not return_timestamps:
-        return hyp
-    else:
-        return DecodingResults(
-            hyps=[hyp],
-            timestamps=[timestamp],
-        )
-
-
-def greedy_search_batch(
-    model: nn.Module,
-    encoder_out: torch.Tensor,
-    encoder_out_lens: torch.Tensor,
-    blank_penalty: float = 0,
-    return_timestamps: bool = False,
-) -> Union[List[List[int]], DecodingResults]:
-    """Greedy search in batch mode. It hardcodes --max-sym-per-frame=1.
-    Args:
-      model:
-        The transducer model.
-      encoder_out:
-        Output from the encoder. Its shape is (N, T, C), where N >= 1.
-      encoder_out_lens:
-        A 1-D tensor of shape (N,), containing number of valid frames in
-        encoder_out before padding.
-      return_timestamps:
-        Whether to return timestamps.
-    Returns:
-      If return_timestamps is False, return the decoded result.
-      Else, return a DecodingResults object containing
-      decoded result and corresponding timestamps.
-    """
-    assert encoder_out.ndim == 3
-    assert encoder_out.size(0) >= 1, encoder_out.size(0)
-
-    packed_encoder_out = torch.nn.utils.rnn.pack_padded_sequence(
-        input=encoder_out,
-        lengths=encoder_out_lens.cpu(),
-        batch_first=True,
-        enforce_sorted=False,
-    )
-
-    device = next(model.parameters()).device
-
-    blank_id = model.decoder.blank_id
-    unk_id = getattr(model, "unk_id", blank_id)
-    context_size = model.decoder.context_size
-
-    batch_size_list = packed_encoder_out.batch_sizes.tolist()
-    N = encoder_out.size(0)
-    assert torch.all(encoder_out_lens > 0), encoder_out_lens
-    assert N == batch_size_list[0], (N, batch_size_list)
-
-    hyps = [[-1] * (context_size - 1) + [blank_id] for _ in range(N)]
-
-    # timestamp[n][i] is the frame index after subsampling
-    # on which hyp[n][i] is decoded
-    timestamps = [[] for _ in range(N)]
-    # scores[n][i] is the logits on which hyp[n][i] is decoded
-    scores = [[] for _ in range(N)]
-
-    decoder_input = torch.tensor(
-        hyps,
-        device=device,
-        dtype=torch.int64,
-    )  # (N, context_size)
-
-    decoder_out = model.decoder(decoder_input, need_pad=False)
-    decoder_out = model.joiner.decoder_proj(decoder_out)
-    # decoder_out: (N, 1, decoder_out_dim)
-
-    encoder_out = model.joiner.encoder_proj(packed_encoder_out.data)
-
-    offset = 0
-    for t, batch_size in enumerate(batch_size_list):
-        start = offset
-        end = offset + batch_size
-        current_encoder_out = encoder_out.data[start:end]
-        current_encoder_out = current_encoder_out.unsqueeze(1).unsqueeze(1)
-        # current_encoder_out's shape: (batch_size, 1, 1, encoder_out_dim)
-        offset = end
-
-        decoder_out = decoder_out[:batch_size]
-
-        logits = model.joiner(
-            current_encoder_out, decoder_out.unsqueeze(1), project_input=False
-        )
-        # logits'shape (batch_size, 1, 1, vocab_size)
-
-        logits = logits.squeeze(1).squeeze(1)  # (batch_size, vocab_size)
-        assert logits.ndim == 2, logits.shape
-
-        if blank_penalty != 0:
-            logits[:, 0] -= blank_penalty
-
-        y = logits.argmax(dim=1).tolist()
-        emitted = False
-        for i, v in enumerate(y):
-            if v not in (blank_id, unk_id):
-                hyps[i].append(v)
-                timestamps[i].append(t)
-                scores[i].append(logits[i, v].item())
-                emitted = True
-        if emitted:
-            # update decoder output
-            decoder_input = [h[-context_size:] for h in hyps[:batch_size]]
-            decoder_input = torch.tensor(
-                decoder_input,
-                device=device,
-                dtype=torch.int64,
-            )
-            decoder_out = model.decoder(decoder_input, need_pad=False)
-            decoder_out = model.joiner.decoder_proj(decoder_out)
-
-    sorted_ans = [h[context_size:] for h in hyps]
-    ans = []
-    ans_timestamps = []
-    ans_scores = []
-    unsorted_indices = packed_encoder_out.unsorted_indices.tolist()
-    for i in range(N):
-        ans.append(sorted_ans[unsorted_indices[i]])
-        ans_timestamps.append(timestamps[unsorted_indices[i]])
-        ans_scores.append(scores[unsorted_indices[i]])
-
-    if not return_timestamps:
-        return ans
-    else:
-        return DecodingResults(
-            hyps=ans,
-            timestamps=ans_timestamps,
-            scores=ans_scores,
-        )
-
-
-@dataclass
-class Hypothesis:
-    # The predicted tokens so far.
-    # Newly predicted tokens are appended to `ys`.
-    ys: List[int]
-
-    # The log prob of ys.
-    # It contains only one entry.
-    log_prob: torch.Tensor
-
-    # timestamp[i] is the frame index after subsampling
-    # on which ys[i] is decoded
-    timestamp: List[int] = field(default_factory=list)
-
-    # the lm score for next token given the current ys
-    lm_score: Optional[torch.Tensor] = None
-
-    # the RNNLM states (h and c in LSTM)
-    state: Optional[Tuple[torch.Tensor, torch.Tensor]] = None
-
-    # N-gram LM state
-    state_cost: Optional[NgramLmStateCost] = None
-
-    # Context graph state
-    context_state: Optional[ContextState] = None
-
-    @property
-    def key(self) -> str:
-        """Return a string representation of self.ys"""
-        return "_".join(map(str, self.ys))
-
-
-class HypothesisList(object):
-    def __init__(self, data: Optional[Dict[str, Hypothesis]] = None) -> None:
-        """
-        Args:
-          data:
-            A dict of Hypotheses. Its key is its `value.key`.
-        """
-        if data is None:
-            self._data = {}
-        else:
-            self._data = data
-
-    @property
-    def data(self) -> Dict[str, Hypothesis]:
-        return self._data
-
-    def add(self, hyp: Hypothesis) -> None:
-        """Add a Hypothesis to `self`.
-
-        If `hyp` already exists in `self`, its probability is updated using
-        `log-sum-exp` with the existed one.
-
-        Args:
-          hyp:
-            The hypothesis to be added.
-        """
-        key = hyp.key
-        if key in self:
-            old_hyp = self._data[key]  # shallow copy
-            torch.logaddexp(old_hyp.log_prob, hyp.log_prob, out=old_hyp.log_prob)
-        else:
-            self._data[key] = hyp
-
-    def get_most_probable(self, length_norm: bool = False) -> Hypothesis:
-        """Get the most probable hypothesis, i.e., the one with
-        the largest `log_prob`.
-
-        Args:
-          length_norm:
-            If True, the `log_prob` of a hypothesis is normalized by the
-            number of tokens in it.
-        Returns:
-          Return the hypothesis that has the largest `log_prob`.
-        """
-        if length_norm:
-            return max(self._data.values(), key=lambda hyp: hyp.log_prob / len(hyp.ys))
-        else:
-            return max(self._data.values(), key=lambda hyp: hyp.log_prob)
-
-    def remove(self, hyp: Hypothesis) -> None:
-        """Remove a given hypothesis.
-
-        Caution:
-          `self` is modified **in-place**.
-
-        Args:
-          hyp:
-            The hypothesis to be removed from `self`.
-            Note: It must be contained in `self`. Otherwise,
-            an exception is raised.
-        """
-        key = hyp.key
-        assert key in self, f"{key} does not exist"
-        del self._data[key]
-
-    def filter(self, threshold: torch.Tensor) -> "HypothesisList":
-        """Remove all Hypotheses whose log_prob is less than threshold.
-
-        Caution:
-          `self` is not modified. Instead, a new HypothesisList is returned.
-
-        Returns:
-          Return a new HypothesisList containing all hypotheses from `self`
-          with `log_prob` being greater than the given `threshold`.
-        """
-        ans = HypothesisList()
-        for _, hyp in self._data.items():
-            if hyp.log_prob > threshold:
-                ans.add(hyp)  # shallow copy
-        return ans
-
-    def topk(self, k: int, length_norm: bool = False) -> "HypothesisList":
-        """Return the top-k hypothesis.
-
-        Args:
-          length_norm:
-            If True, the `log_prob` of a hypothesis is normalized by the
-            number of tokens in it.
-        """
-        hyps = list(self._data.items())
-
-        if length_norm:
-            hyps = sorted(
-                hyps, key=lambda h: h[1].log_prob / len(h[1].ys), reverse=True
-            )[:k]
-        else:
-            hyps = sorted(hyps, key=lambda h: h[1].log_prob, reverse=True)[:k]
-
-        ans = HypothesisList(dict(hyps))
-        return ans
-
-    def __contains__(self, key: str):
-        return key in self._data
-
-    def __iter__(self):
-        return iter(self._data.values())
-
-    def __len__(self) -> int:
-        return len(self._data)
-
-    def __str__(self) -> str:
-        s = []
-        for key in self:
-            s.append(key)
-        return ", ".join(s)
-
-
-def get_hyps_shape(hyps: List[HypothesisList]) -> k2.RaggedShape:
-    """Return a ragged shape with axes [utt][num_hyps].
-
-    Args:
-      hyps:
-        len(hyps) == batch_size. It contains the current hypothesis for
-        each utterance in the batch.
-    Returns:
-      Return a ragged shape with 2 axes [utt][num_hyps]. Note that
-      the shape is on CPU.
-    """
-    num_hyps = [len(h) for h in hyps]
-
-    # torch.cumsum() is inclusive sum, so we put a 0 at the beginning
-    # to get exclusive sum later.
-    num_hyps.insert(0, 0)
-
-    num_hyps = torch.tensor(num_hyps)
-    row_splits = torch.cumsum(num_hyps, dim=0, dtype=torch.int32)
-    ans = k2.ragged.create_ragged_shape2(
-        row_splits=row_splits, cached_tot_size=row_splits[-1].item()
-    )
-    return ans
-
-
-def modified_beam_search(
-    model: nn.Module,
-    encoder_out: torch.Tensor,
-    encoder_out_lens: torch.Tensor,
-    context_graph: Optional[ContextGraph] = None,
-    beam: int = 4,
-    temperature: float = 1.0,
-    blank_penalty: float = 0.0,
-    return_timestamps: bool = False,
-) -> Union[List[List[int]], DecodingResults]:
-    """Beam search in batch mode with --max-sym-per-frame=1 being hardcoded.
-
-    Args:
-      model:
-        The transducer model.
-      encoder_out:
-        Output from the encoder. Its shape is (N, T, C).
-      encoder_out_lens:
-        A 1-D tensor of shape (N,), containing number of valid frames in
-        encoder_out before padding.
-      beam:
-        Number of active paths during the beam search.
-      temperature:
-        Softmax temperature.
-      return_timestamps:
-        Whether to return timestamps.
-    Returns:
-      If return_timestamps is False, return the decoded result.
-      Else, return a DecodingResults object containing
-      decoded result and corresponding timestamps.
-    """
-    assert encoder_out.ndim == 3, encoder_out.shape
-    assert encoder_out.size(0) >= 1, encoder_out.size(0)
-
-    packed_encoder_out = torch.nn.utils.rnn.pack_padded_sequence(
-        input=encoder_out,
-        lengths=encoder_out_lens.cpu(),
-        batch_first=True,
-        enforce_sorted=False,
-    )
-
-    blank_id = model.decoder.blank_id
-    unk_id = getattr(model, "unk_id", blank_id)
-    context_size = model.decoder.context_size
-    device = next(model.parameters()).device
-
-    batch_size_list = packed_encoder_out.batch_sizes.tolist()
-    N = encoder_out.size(0)
-    assert torch.all(encoder_out_lens > 0), encoder_out_lens
-    assert N == batch_size_list[0], (N, batch_size_list)
-
-    B = [HypothesisList() for _ in range(N)]
-    for i in range(N):
-        B[i].add(
-            Hypothesis(
-                ys=[-1] * (context_size - 1) + [blank_id],
-                log_prob=torch.zeros(1, dtype=torch.float32, device=device),
-                context_state=None if context_graph is None else context_graph.root,
-                timestamp=[],
-            )
-        )
-
-    encoder_out = model.joiner.encoder_proj(packed_encoder_out.data)
-
-    offset = 0
-    finalized_B = []
-    for t, batch_size in enumerate(batch_size_list):
-        start = offset
-        end = offset + batch_size
-        current_encoder_out = encoder_out.data[start:end]
-        current_encoder_out = current_encoder_out.unsqueeze(1).unsqueeze(1)
-        # current_encoder_out's shape is (batch_size, 1, 1, encoder_out_dim)
-        offset = end
-
-        finalized_B = B[batch_size:] + finalized_B
-        B = B[:batch_size]
-
-        hyps_shape = get_hyps_shape(B).to(device)
-
-        A = [list(b) for b in B]
-
-        B = [HypothesisList() for _ in range(batch_size)]
-
-        ys_log_probs = torch.cat(
-            [hyp.log_prob.reshape(1, 1) for hyps in A for hyp in hyps]
-        )  # (num_hyps, 1)
-
-        decoder_input = torch.tensor(
-            [hyp.ys[-context_size:] for hyps in A for hyp in hyps],
-            device=device,
-            dtype=torch.int64,
-        )  # (num_hyps, context_size)
-
-        decoder_out = model.decoder(decoder_input, need_pad=False).unsqueeze(1)
-        decoder_out = model.joiner.decoder_proj(decoder_out)
-        # decoder_out is of shape (num_hyps, 1, 1, joiner_dim)
-
-        # Note: For torch 1.7.1 and below, it requires a torch.int64 tensor
-        # as index, so we use `to(torch.int64)` below.
-        current_encoder_out = torch.index_select(
-            current_encoder_out,
-            dim=0,
-            index=hyps_shape.row_ids(1).to(torch.int64),
-        )  # (num_hyps, 1, 1, encoder_out_dim)
-
-        logits = model.joiner(
-            current_encoder_out,
-            decoder_out,
-            project_input=False,
-        )  # (num_hyps, 1, 1, vocab_size)
-
-        logits = logits.squeeze(1).squeeze(1)  # (num_hyps, vocab_size)
-
-        if blank_penalty != 0:
-            logits[:, 0] -= blank_penalty
-
-        log_probs = (logits / temperature).log_softmax(dim=-1)  # (num_hyps, vocab_size)
-
-        log_probs.add_(ys_log_probs)
-
-        vocab_size = log_probs.size(-1)
-
-        log_probs = log_probs.reshape(-1)
-
-        row_splits = hyps_shape.row_splits(1) * vocab_size
-        log_probs_shape = k2.ragged.create_ragged_shape2(
-            row_splits=row_splits, cached_tot_size=log_probs.numel()
-        )
-        ragged_log_probs = k2.RaggedTensor(shape=log_probs_shape, value=log_probs)
-
-        for i in range(batch_size):
-            topk_log_probs, topk_indexes = ragged_log_probs[i].topk(beam)
-
-            with warnings.catch_warnings():
-                warnings.simplefilter("ignore")
-                topk_hyp_indexes = (topk_indexes // vocab_size).tolist()
-                topk_token_indexes = (topk_indexes % vocab_size).tolist()
-
-            for k in range(len(topk_hyp_indexes)):
-                hyp_idx = topk_hyp_indexes[k]
-                hyp = A[i][hyp_idx]
-                new_ys = hyp.ys[:]
-                new_token = topk_token_indexes[k]
-                new_timestamp = hyp.timestamp[:]
-                context_score = 0
-                new_context_state = None if context_graph is None else hyp.context_state
-                if new_token not in (blank_id, unk_id):
-                    new_ys.append(new_token)
-                    new_timestamp.append(t)
-                    if context_graph is not None:
-                        (
-                            context_score,
-                            new_context_state,
-                        ) = context_graph.forward_one_step(hyp.context_state, new_token)
-
-                new_log_prob = topk_log_probs[k] + context_score
-
-                new_hyp = Hypothesis(
-                    ys=new_ys,
-                    log_prob=new_log_prob,
-                    timestamp=new_timestamp,
-                    context_state=new_context_state,
-                )
-                B[i].add(new_hyp)
-
-    B = B + finalized_B
-
-    # finalize context_state, if the matched contexts do not reach final state
-    # we need to add the score on the corresponding backoff arc
-    if context_graph is not None:
-        finalized_B = [HypothesisList() for _ in range(len(B))]
-        for i, hyps in enumerate(B):
-            for hyp in list(hyps):
-                context_score, new_context_state = context_graph.finalize(
-                    hyp.context_state
-                )
-                finalized_B[i].add(
-                    Hypothesis(
-                        ys=hyp.ys,
-                        log_prob=hyp.log_prob + context_score,
-                        timestamp=hyp.timestamp,
-                        context_state=new_context_state,
-                    )
-                )
-        B = finalized_B
-
-    best_hyps = [b.get_most_probable(length_norm=True) for b in B]
-
-    sorted_ans = [h.ys[context_size:] for h in best_hyps]
-    sorted_timestamps = [h.timestamp for h in best_hyps]
-    ans = []
-    ans_timestamps = []
-    unsorted_indices = packed_encoder_out.unsorted_indices.tolist()
-    for i in range(N):
-        ans.append(sorted_ans[unsorted_indices[i]])
-        ans_timestamps.append(sorted_timestamps[unsorted_indices[i]])
-
-    if not return_timestamps:
-        return ans
-    else:
-        return DecodingResults(
-            hyps=ans,
-            timestamps=ans_timestamps,
-        )
-
-
-def modified_beam_search_lm_rescore(
-    model: nn.Module,
-    encoder_out: torch.Tensor,
-    encoder_out_lens: torch.Tensor,
-    LM: LmScorer,
-    lm_scale_list: List[int],
-    beam: int = 4,
-    temperature: float = 1.0,
-    return_timestamps: bool = False,
-) -> Union[List[List[int]], DecodingResults]:
-    """Beam search in batch mode with --max-sym-per-frame=1 being hardcoded.
-    Rescore the final results with RNNLM and return the one with the highest score
-
-    Args:
-      model:
-        The transducer model.
-      encoder_out:
-        Output from the encoder. Its shape is (N, T, C).
-      encoder_out_lens:
-        A 1-D tensor of shape (N,), containing number of valid frames in
-        encoder_out before padding.
-      beam:
-        Number of active paths during the beam search.
-      temperature:
-        Softmax temperature.
-      LM:
-        A neural network language model
-      return_timestamps:
-        Whether to return timestamps.
-    Returns:
-      If return_timestamps is False, return the decoded result.
-      Else, return a DecodingResults object containing
-      decoded result and corresponding timestamps.
-    """
-    assert encoder_out.ndim == 3, encoder_out.shape
-    assert encoder_out.size(0) >= 1, encoder_out.size(0)
-
-    packed_encoder_out = torch.nn.utils.rnn.pack_padded_sequence(
-        input=encoder_out,
-        lengths=encoder_out_lens.cpu(),
-        batch_first=True,
-        enforce_sorted=False,
-    )
-
-    blank_id = model.decoder.blank_id
-    unk_id = getattr(model, "unk_id", blank_id)
-    context_size = model.decoder.context_size
-    device = next(model.parameters()).device
-
-    batch_size_list = packed_encoder_out.batch_sizes.tolist()
-    N = encoder_out.size(0)
-    assert torch.all(encoder_out_lens > 0), encoder_out_lens
-    assert N == batch_size_list[0], (N, batch_size_list)
-
-    B = [HypothesisList() for _ in range(N)]
-    for i in range(N):
-        B[i].add(
-            Hypothesis(
-                ys=[-1] * (context_size - 1) + [blank_id],
-                log_prob=torch.zeros(1, dtype=torch.float32, device=device),
-                timestamp=[],
-            )
-        )
-
-    encoder_out = model.joiner.encoder_proj(packed_encoder_out.data)
-
-    offset = 0
-    finalized_B = []
-    for t, batch_size in enumerate(batch_size_list):
-        start = offset
-        end = offset + batch_size
-        current_encoder_out = encoder_out.data[start:end]
-        current_encoder_out = current_encoder_out.unsqueeze(1).unsqueeze(1)
-        # current_encoder_out's shape is (batch_size, 1, 1, encoder_out_dim)
-        offset = end
-
-        finalized_B = B[batch_size:] + finalized_B
-        B = B[:batch_size]
-
-        hyps_shape = get_hyps_shape(B).to(device)
-
-        A = [list(b) for b in B]
-        B = [HypothesisList() for _ in range(batch_size)]
-
-        ys_log_probs = torch.cat(
-            [hyp.log_prob.reshape(1, 1) for hyps in A for hyp in hyps]
-        )  # (num_hyps, 1)
-
-        decoder_input = torch.tensor(
-            [hyp.ys[-context_size:] for hyps in A for hyp in hyps],
-            device=device,
-            dtype=torch.int64,
-        )  # (num_hyps, context_size)
-
-        decoder_out = model.decoder(decoder_input, need_pad=False).unsqueeze(1)
-        decoder_out = model.joiner.decoder_proj(decoder_out)
-        # decoder_out is of shape (num_hyps, 1, 1, joiner_dim)
-
-        # Note: For torch 1.7.1 and below, it requires a torch.int64 tensor
-        # as index, so we use `to(torch.int64)` below.
-        current_encoder_out = torch.index_select(
-            current_encoder_out,
-            dim=0,
-            index=hyps_shape.row_ids(1).to(torch.int64),
-        )  # (num_hyps, 1, 1, encoder_out_dim)
-
-        logits = model.joiner(
-            current_encoder_out,
-            decoder_out,
-            project_input=False,
-        )  # (num_hyps, 1, 1, vocab_size)
-
-        logits = logits.squeeze(1).squeeze(1)  # (num_hyps, vocab_size)
-
-        log_probs = (logits / temperature).log_softmax(dim=-1)  # (num_hyps, vocab_size)
-
-        log_probs.add_(ys_log_probs)
-
-        vocab_size = log_probs.size(-1)
-
-        log_probs = log_probs.reshape(-1)
-
-        row_splits = hyps_shape.row_splits(1) * vocab_size
-        log_probs_shape = k2.ragged.create_ragged_shape2(
-            row_splits=row_splits, cached_tot_size=log_probs.numel()
-        )
-        ragged_log_probs = k2.RaggedTensor(shape=log_probs_shape, value=log_probs)
-
-        for i in range(batch_size):
-            topk_log_probs, topk_indexes = ragged_log_probs[i].topk(beam)
-
-            with warnings.catch_warnings():
-                warnings.simplefilter("ignore")
-                topk_hyp_indexes = (topk_indexes // vocab_size).tolist()
-                topk_token_indexes = (topk_indexes % vocab_size).tolist()
-
-            for k in range(len(topk_hyp_indexes)):
-                hyp_idx = topk_hyp_indexes[k]
-                hyp = A[i][hyp_idx]
-
-                new_ys = hyp.ys[:]
-                new_token = topk_token_indexes[k]
-                new_timestamp = hyp.timestamp[:]
-                if new_token not in (blank_id, unk_id):
-                    new_ys.append(new_token)
-                    new_timestamp.append(t)
-
-                new_log_prob = topk_log_probs[k]
-                new_hyp = Hypothesis(
-                    ys=new_ys, log_prob=new_log_prob, timestamp=new_timestamp
-                )
-                B[i].add(new_hyp)
-
-    B = B + finalized_B
-
-    # get the am_scores for n-best list
-    hyps_shape = get_hyps_shape(B)
-    am_scores = torch.tensor([hyp.log_prob.item() for b in B for hyp in b])
-    am_scores = k2.RaggedTensor(value=am_scores, shape=hyps_shape).to(device)
-
-    # now LM rescore
-    # prepare input data to LM
-    candidate_seqs = [hyp.ys[context_size:] for b in B for hyp in b]
-    possible_seqs = k2.RaggedTensor(candidate_seqs)
-    row_splits = possible_seqs.shape.row_splits(1)
-    sentence_token_lengths = row_splits[1:] - row_splits[:-1]
-    possible_seqs_with_sos = add_sos(possible_seqs, sos_id=1)
-    possible_seqs_with_eos = add_eos(possible_seqs, eos_id=1)
-    sentence_token_lengths += 1
-
-    x = possible_seqs_with_sos.pad(mode="constant", padding_value=blank_id)
-    y = possible_seqs_with_eos.pad(mode="constant", padding_value=blank_id)
-    x = x.to(device).to(torch.int64)
-    y = y.to(device).to(torch.int64)
-    sentence_token_lengths = sentence_token_lengths.to(device).to(torch.int64)
-
-    lm_scores = LM.lm(x=x, y=y, lengths=sentence_token_lengths)
-    assert lm_scores.ndim == 2
-    lm_scores = -1 * lm_scores.sum(dim=1)
-
-    ans = {}
-    unsorted_indices = packed_encoder_out.unsorted_indices.tolist()
-
-    # get the best hyp with different lm_scale
-    for lm_scale in lm_scale_list:
-        key = f"nnlm_scale_{lm_scale:.2f}"
-        tot_scores = am_scores.values + lm_scores * lm_scale
-        ragged_tot_scores = k2.RaggedTensor(shape=am_scores.shape, value=tot_scores)
-        max_indexes = ragged_tot_scores.argmax().tolist()
-        unsorted_hyps = [candidate_seqs[idx] for idx in max_indexes]
-        hyps = []
-        for idx in unsorted_indices:
-            hyps.append(unsorted_hyps[idx])
-
-        ans[key] = hyps
-    return ans
-
-
-def modified_beam_search_lm_rescore_LODR(
-    model: nn.Module,
-    encoder_out: torch.Tensor,
-    encoder_out_lens: torch.Tensor,
-    LM: LmScorer,
-    LODR_lm: NgramLm,
-    sp: spm.SentencePieceProcessor,
-    lm_scale_list: List[int],
-    beam: int = 4,
-    temperature: float = 1.0,
-    return_timestamps: bool = False,
-) -> Union[List[List[int]], DecodingResults]:
-    """Beam search in batch mode with --max-sym-per-frame=1 being hardcoded.
-    Rescore the final results with RNNLM and return the one with the highest score
-
-    Args:
-      model:
-        The transducer model.
-      encoder_out:
-        Output from the encoder. Its shape is (N, T, C).
-      encoder_out_lens:
-        A 1-D tensor of shape (N,), containing number of valid frames in
-        encoder_out before padding.
-      beam:
-        Number of active paths during the beam search.
-      temperature:
-        Softmax temperature.
-      LM:
-        A neural network language model
-      return_timestamps:
-        Whether to return timestamps.
-    Returns:
-      If return_timestamps is False, return the decoded result.
-      Else, return a DecodingResults object containing
-      decoded result and corresponding timestamps.
-    """
-    assert encoder_out.ndim == 3, encoder_out.shape
-    assert encoder_out.size(0) >= 1, encoder_out.size(0)
-
-    packed_encoder_out = torch.nn.utils.rnn.pack_padded_sequence(
-        input=encoder_out,
-        lengths=encoder_out_lens.cpu(),
-        batch_first=True,
-        enforce_sorted=False,
-    )
-
-    blank_id = model.decoder.blank_id
-    unk_id = getattr(model, "unk_id", blank_id)
-    context_size = model.decoder.context_size
-    device = next(model.parameters()).device
-
-    batch_size_list = packed_encoder_out.batch_sizes.tolist()
-    N = encoder_out.size(0)
-    assert torch.all(encoder_out_lens > 0), encoder_out_lens
-    assert N == batch_size_list[0], (N, batch_size_list)
-
-    B = [HypothesisList() for _ in range(N)]
-    for i in range(N):
-        B[i].add(
-            Hypothesis(
-                ys=[-1] * (context_size - 1) + [blank_id],
-                log_prob=torch.zeros(1, dtype=torch.float32, device=device),
-                timestamp=[],
-            )
-        )
-
-    encoder_out = model.joiner.encoder_proj(packed_encoder_out.data)
-
-    offset = 0
-    finalized_B = []
-    for t, batch_size in enumerate(batch_size_list):
-        start = offset
-        end = offset + batch_size
-        current_encoder_out = encoder_out.data[start:end]
-        current_encoder_out = current_encoder_out.unsqueeze(1).unsqueeze(1)
-        # current_encoder_out's shape is (batch_size, 1, 1, encoder_out_dim)
-        offset = end
-
-        finalized_B = B[batch_size:] + finalized_B
-        B = B[:batch_size]
-
-        hyps_shape = get_hyps_shape(B).to(device)
-
-        A = [list(b) for b in B]
-        B = [HypothesisList() for _ in range(batch_size)]
-
-        ys_log_probs = torch.cat(
-            [hyp.log_prob.reshape(1, 1) for hyps in A for hyp in hyps]
-        )  # (num_hyps, 1)
-
-        decoder_input = torch.tensor(
-            [hyp.ys[-context_size:] for hyps in A for hyp in hyps],
-            device=device,
-            dtype=torch.int64,
-        )  # (num_hyps, context_size)
-
-        decoder_out = model.decoder(decoder_input, need_pad=False).unsqueeze(1)
-        decoder_out = model.joiner.decoder_proj(decoder_out)
-        # decoder_out is of shape (num_hyps, 1, 1, joiner_dim)
-
-        # Note: For torch 1.7.1 and below, it requires a torch.int64 tensor
-        # as index, so we use `to(torch.int64)` below.
-        current_encoder_out = torch.index_select(
-            current_encoder_out,
-            dim=0,
-            index=hyps_shape.row_ids(1).to(torch.int64),
-        )  # (num_hyps, 1, 1, encoder_out_dim)
-
-        logits = model.joiner(
-            current_encoder_out,
-            decoder_out,
-            project_input=False,
-        )  # (num_hyps, 1, 1, vocab_size)
-
-        logits = logits.squeeze(1).squeeze(1)  # (num_hyps, vocab_size)
-
-        log_probs = (logits / temperature).log_softmax(dim=-1)  # (num_hyps, vocab_size)
-
-        log_probs.add_(ys_log_probs)
-
-        vocab_size = log_probs.size(-1)
-
-        log_probs = log_probs.reshape(-1)
-
-        row_splits = hyps_shape.row_splits(1) * vocab_size
-        log_probs_shape = k2.ragged.create_ragged_shape2(
-            row_splits=row_splits, cached_tot_size=log_probs.numel()
-        )
-        ragged_log_probs = k2.RaggedTensor(shape=log_probs_shape, value=log_probs)
-
-        for i in range(batch_size):
-            topk_log_probs, topk_indexes = ragged_log_probs[i].topk(beam)
-
-            with warnings.catch_warnings():
-                warnings.simplefilter("ignore")
-                topk_hyp_indexes = (topk_indexes // vocab_size).tolist()
-                topk_token_indexes = (topk_indexes % vocab_size).tolist()
-
-            for k in range(len(topk_hyp_indexes)):
-                hyp_idx = topk_hyp_indexes[k]
-                hyp = A[i][hyp_idx]
-
-                new_ys = hyp.ys[:]
-                new_token = topk_token_indexes[k]
-                new_timestamp = hyp.timestamp[:]
-                if new_token not in (blank_id, unk_id):
-                    new_ys.append(new_token)
-                    new_timestamp.append(t)
-
-                new_log_prob = topk_log_probs[k]
-                new_hyp = Hypothesis(
-                    ys=new_ys, log_prob=new_log_prob, timestamp=new_timestamp
-                )
-                B[i].add(new_hyp)
-
-    B = B + finalized_B
-
-    # get the am_scores for n-best list
-    hyps_shape = get_hyps_shape(B)
-    am_scores = torch.tensor([hyp.log_prob.item() for b in B for hyp in b])
-    am_scores = k2.RaggedTensor(value=am_scores, shape=hyps_shape).to(device)
-
-    # now LM rescore
-    # prepare input data to LM
-    candidate_seqs = [hyp.ys[context_size:] for b in B for hyp in b]
-    possible_seqs = k2.RaggedTensor(candidate_seqs)
-    row_splits = possible_seqs.shape.row_splits(1)
-    sentence_token_lengths = row_splits[1:] - row_splits[:-1]
-    possible_seqs_with_sos = add_sos(possible_seqs, sos_id=1)
-    possible_seqs_with_eos = add_eos(possible_seqs, eos_id=1)
-    sentence_token_lengths += 1
-
-    x = possible_seqs_with_sos.pad(mode="constant", padding_value=blank_id)
-    y = possible_seqs_with_eos.pad(mode="constant", padding_value=blank_id)
-    x = x.to(device).to(torch.int64)
-    y = y.to(device).to(torch.int64)
-    sentence_token_lengths = sentence_token_lengths.to(device).to(torch.int64)
-
-    lm_scores = LM.lm(x=x, y=y, lengths=sentence_token_lengths)
-    assert lm_scores.ndim == 2
-    lm_scores = -1 * lm_scores.sum(dim=1)
-
-    # now LODR scores
-    import math
-
-    LODR_scores = []
-    for seq in candidate_seqs:
-        tokens = " ".join(sp.id_to_piece(seq))
-        LODR_scores.append(LODR_lm.score(tokens))
-    LODR_scores = torch.tensor(LODR_scores).to(device) * math.log(
-        10
-    )  # arpa scores are 10-based
-    assert lm_scores.shape == LODR_scores.shape
-
-    ans = {}
-    unsorted_indices = packed_encoder_out.unsorted_indices.tolist()
-
-    LODR_scale_list = [0.05 * i for i in range(1, 20)]
-    # get the best hyp with different lm_scale and lodr_scale
-    for lm_scale in lm_scale_list:
-        for lodr_scale in LODR_scale_list:
-            key = f"nnlm_scale_{lm_scale:.2f}_lodr_scale_{lodr_scale:.2f}"
-            tot_scores = (
-                am_scores.values / lm_scale + lm_scores - LODR_scores * lodr_scale
-            )
-            ragged_tot_scores = k2.RaggedTensor(shape=am_scores.shape, value=tot_scores)
-            max_indexes = ragged_tot_scores.argmax().tolist()
-            unsorted_hyps = [candidate_seqs[idx] for idx in max_indexes]
-            hyps = []
-            for idx in unsorted_indices:
-                hyps.append(unsorted_hyps[idx])
-
-            ans[key] = hyps
-    return ans
-
-
-def _deprecated_modified_beam_search(
-    model: nn.Module,
-    encoder_out: torch.Tensor,
-    beam: int = 4,
-    return_timestamps: bool = False,
-) -> Union[List[int], DecodingResults]:
-    """It limits the maximum number of symbols per frame to 1.
-
-    It decodes only one utterance at a time. We keep it only for reference.
-    The function :func:`modified_beam_search` should be preferred as it
-    supports batch decoding.
-
-
-    Args:
-      model:
-        An instance of `Transducer`.
-      encoder_out:
-        A tensor of shape (N, T, C) from the encoder. Support only N==1 for now.
-      beam:
-        Beam size.
-      return_timestamps:
-        Whether to return timestamps.
-
-    Returns:
-      If return_timestamps is False, return the decoded result.
-      Else, return a DecodingResults object containing
-      decoded result and corresponding timestamps.
-    """
-
-    assert encoder_out.ndim == 3
-
-    # support only batch_size == 1 for now
-    assert encoder_out.size(0) == 1, encoder_out.size(0)
-    blank_id = model.decoder.blank_id
-    unk_id = getattr(model, "unk_id", blank_id)
-    context_size = model.decoder.context_size
-
-    device = next(model.parameters()).device
-
-    T = encoder_out.size(1)
-
-    B = HypothesisList()
-    B.add(
-        Hypothesis(
-            ys=[-1] * (context_size - 1) + [blank_id],
-            log_prob=torch.zeros(1, dtype=torch.float32, device=device),
-            timestamp=[],
-        )
-    )
-    encoder_out = model.joiner.encoder_proj(encoder_out)
-
-    for t in range(T):
-        # fmt: off
-        current_encoder_out = encoder_out[:, t:t+1, :].unsqueeze(2)
-        # current_encoder_out is of shape (1, 1, 1, encoder_out_dim)
-        # fmt: on
-        A = list(B)
-        B = HypothesisList()
-
-        ys_log_probs = torch.cat([hyp.log_prob.reshape(1, 1) for hyp in A])
-        # ys_log_probs is of shape (num_hyps, 1)
-
-        decoder_input = torch.tensor(
-            [hyp.ys[-context_size:] for hyp in A],
-            device=device,
-            dtype=torch.int64,
-        )
-        # decoder_input is of shape (num_hyps, context_size)
-
-        decoder_out = model.decoder(decoder_input, need_pad=False).unsqueeze(1)
-        decoder_out = model.joiner.decoder_proj(decoder_out)
-        # decoder_output is of shape (num_hyps, 1, 1, joiner_dim)
-
-        current_encoder_out = current_encoder_out.expand(
-            decoder_out.size(0), 1, 1, -1
-        )  # (num_hyps, 1, 1, encoder_out_dim)
-
-        logits = model.joiner(
-            current_encoder_out,
-            decoder_out,
-            project_input=False,
-        )
-        # logits is of shape (num_hyps, 1, 1, vocab_size)
-        logits = logits.squeeze(1).squeeze(1)
-
-        # now logits is of shape (num_hyps, vocab_size)
-        log_probs = logits.log_softmax(dim=-1)
-
-        log_probs.add_(ys_log_probs)
-
-        log_probs = log_probs.reshape(-1)
-        topk_log_probs, topk_indexes = log_probs.topk(beam)
-
-        # topk_hyp_indexes are indexes into `A`
-        topk_hyp_indexes = topk_indexes // logits.size(-1)
-        topk_token_indexes = topk_indexes % logits.size(-1)
-
-        with warnings.catch_warnings():
-            warnings.simplefilter("ignore")
-            topk_hyp_indexes = topk_hyp_indexes.tolist()
-            topk_token_indexes = topk_token_indexes.tolist()
-
-        for i in range(len(topk_hyp_indexes)):
-            hyp = A[topk_hyp_indexes[i]]
-            new_ys = hyp.ys[:]
-            new_timestamp = hyp.timestamp[:]
-            new_token = topk_token_indexes[i]
-            if new_token not in (blank_id, unk_id):
-                new_ys.append(new_token)
-                new_timestamp.append(t)
-            new_log_prob = topk_log_probs[i]
-            new_hyp = Hypothesis(
-                ys=new_ys, log_prob=new_log_prob, timestamp=new_timestamp
-            )
-            B.add(new_hyp)
-
-    best_hyp = B.get_most_probable(length_norm=True)
-    ys = best_hyp.ys[context_size:]  # [context_size:] to remove blanks
-
-    if not return_timestamps:
-        return ys
-    else:
-        return DecodingResults(hyps=[ys], timestamps=[best_hyp.timestamp])
-
-
-def beam_search(
-    model: nn.Module,
-    encoder_out: torch.Tensor,
-    beam: int = 4,
-    temperature: float = 1.0,
-    blank_penalty: float = 0.0,
-    return_timestamps: bool = False,
-) -> Union[List[int], DecodingResults]:
-    """
-    It implements Algorithm 1 in https://arxiv.org/pdf/1211.3711.pdf
-
-    espnet/nets/beam_search_transducer.py#L247 is used as a reference.
-
-    Args:
-      model:
-        An instance of `Transducer`.
-      encoder_out:
-        A tensor of shape (N, T, C) from the encoder. Support only N==1 for now.
-      beam:
-        Beam size.
-      temperature:
-        Softmax temperature.
-      return_timestamps:
-        Whether to return timestamps.
-
-    Returns:
-      If return_timestamps is False, return the decoded result.
-      Else, return a DecodingResults object containing
-      decoded result and corresponding timestamps.
-    """
-    assert encoder_out.ndim == 3
-
-    # support only batch_size == 1 for now
-    assert encoder_out.size(0) == 1, encoder_out.size(0)
-    blank_id = model.decoder.blank_id
-    unk_id = getattr(model, "unk_id", blank_id)
-    context_size = model.decoder.context_size
-
-    device = next(model.parameters()).device
-
-    decoder_input = torch.tensor(
-        [blank_id] * context_size,
-        device=device,
-        dtype=torch.int64,
-    ).reshape(1, context_size)
-
-    decoder_out = model.decoder(decoder_input, need_pad=False)
-    decoder_out = model.joiner.decoder_proj(decoder_out)
-
-    encoder_out = model.joiner.encoder_proj(encoder_out)
-
-    T = encoder_out.size(1)
-    t = 0
-
-    B = HypothesisList()
-    B.add(
-        Hypothesis(
-            ys=[-1] * (context_size - 1) + [blank_id], log_prob=0.0, timestamp=[]
-        )
-    )
-
-    max_sym_per_utt = 20000
-
-    sym_per_utt = 0
-
-    decoder_cache: Dict[str, torch.Tensor] = {}
-
-    while t < T and sym_per_utt < max_sym_per_utt:
-        # fmt: off
-        current_encoder_out = encoder_out[:, t:t+1, :].unsqueeze(2)
-        # fmt: on
-        A = B
-        B = HypothesisList()
-
-        joint_cache: Dict[str, torch.Tensor] = {}
-
-        # TODO(fangjun): Implement prefix search to update the `log_prob`
-        # of hypotheses in A
-
-        while True:
-            y_star = A.get_most_probable()
-            A.remove(y_star)
-
-            cached_key = y_star.key
-
-            if cached_key not in decoder_cache:
-                decoder_input = torch.tensor(
-                    [y_star.ys[-context_size:]],
-                    device=device,
-                    dtype=torch.int64,
-                ).reshape(1, context_size)
-
-                decoder_out = model.decoder(decoder_input, need_pad=False)
-                decoder_out = model.joiner.decoder_proj(decoder_out)
-                decoder_cache[cached_key] = decoder_out
-            else:
-                decoder_out = decoder_cache[cached_key]
-
-            cached_key += f"-t-{t}"
-            if cached_key not in joint_cache:
-                logits = model.joiner(
-                    current_encoder_out,
-                    decoder_out.unsqueeze(1),
-                    project_input=False,
-                )
-
-                if blank_penalty != 0:
-                    logits[:, :, :, 0] -= blank_penalty
-
-                # TODO(fangjun): Scale the blank posterior
-                log_prob = (logits / temperature).log_softmax(dim=-1)
-                # log_prob is (1, 1, 1, vocab_size)
-                log_prob = log_prob.squeeze()
-                # Now log_prob is (vocab_size,)
-                joint_cache[cached_key] = log_prob
-            else:
-                log_prob = joint_cache[cached_key]
-
-            # First, process the blank symbol
-            skip_log_prob = log_prob[blank_id]
-            new_y_star_log_prob = y_star.log_prob + skip_log_prob
-
-            # ys[:] returns a copy of ys
-            B.add(
-                Hypothesis(
-                    ys=y_star.ys[:],
-                    log_prob=new_y_star_log_prob,
-                    timestamp=y_star.timestamp[:],
-                )
-            )
-
-            # Second, process other non-blank labels
-            values, indices = log_prob.topk(beam + 1)
-            for i, v in zip(indices.tolist(), values.tolist()):
-                if i in (blank_id, unk_id):
-                    continue
-                new_ys = y_star.ys + [i]
-                new_log_prob = y_star.log_prob + v
-                new_timestamp = y_star.timestamp + [t]
-                A.add(
-                    Hypothesis(
-                        ys=new_ys,
-                        log_prob=new_log_prob,
-                        timestamp=new_timestamp,
-                    )
-                )
-
-            # Check whether B contains more than "beam" elements more probable
-            # than the most probable in A
-            A_most_probable = A.get_most_probable()
-
-            kept_B = B.filter(A_most_probable.log_prob)
-
-            if len(kept_B) >= beam:
-                B = kept_B.topk(beam)
-                break
-
-        t += 1
-
-    best_hyp = B.get_most_probable(length_norm=True)
-    ys = best_hyp.ys[context_size:]  # [context_size:] to remove blanks
-
-    if not return_timestamps:
-        return ys
-    else:
-        return DecodingResults(hyps=[ys], timestamps=[best_hyp.timestamp])
-
-
-def fast_beam_search_with_nbest_rescoring(
-    model: nn.Module,
-    decoding_graph: k2.Fsa,
-    encoder_out: torch.Tensor,
-    encoder_out_lens: torch.Tensor,
-    beam: float,
-    max_states: int,
-    max_contexts: int,
-    ngram_lm_scale_list: List[float],
-    num_paths: int,
-    G: k2.Fsa,
-    sp: spm.SentencePieceProcessor,
-    word_table: k2.SymbolTable,
-    oov_word: str = "<UNK>",
-    use_double_scores: bool = True,
-    nbest_scale: float = 0.5,
-    temperature: float = 1.0,
-    return_timestamps: bool = False,
-) -> Dict[str, Union[List[List[int]], DecodingResults]]:
-    """It limits the maximum number of symbols per frame to 1.
-    A lattice is first obtained using fast beam search, num_path are selected
-    and rescored using a given language model. The shortest path within the
-    lattice is used as the final output.
-
-    Args:
-      model:
-        An instance of `Transducer`.
-      decoding_graph:
-        Decoding graph used for decoding, may be a TrivialGraph or a LG.
-      encoder_out:
-        A tensor of shape (N, T, C) from the encoder.
-      encoder_out_lens:
-        A tensor of shape (N,) containing the number of frames in `encoder_out`
-        before padding.
-      beam:
-        Beam value, similar to the beam used in Kaldi.
-      max_states:
-        Max states per stream per frame.
-      max_contexts:
-        Max contexts pre stream per frame.
-      ngram_lm_scale_list:
-        A list of floats representing LM score scales.
-      num_paths:
-        Number of paths to extract from the decoded lattice.
-      G:
-        An FsaVec containing only a single FSA. It is an n-gram LM.
-      sp:
-        The BPE model.
-      word_table:
-        The word symbol table.
-      oov_word:
-        OOV words are replaced with this word.
-      use_double_scores:
-        True to use double precision for computation. False to use
-        single precision.
-      nbest_scale:
-        It's the scale applied to the lattice.scores. A smaller value
-        yields more unique paths.
-      temperature:
-        Softmax temperature.
-      return_timestamps:
-        Whether to return timestamps.
-    Returns:
-      Return the decoded result in a dict, where the key has the form
-      'ngram_lm_scale_xx' and the value is the decoded results
-      optionally with timestamps. `xx` is the ngram LM scale value
-      used during decoding, i.e., 0.1.
-    """
-    lattice = fast_beam_search(
-        model=model,
-        decoding_graph=decoding_graph,
-        encoder_out=encoder_out,
-        encoder_out_lens=encoder_out_lens,
-        beam=beam,
-        max_states=max_states,
-        max_contexts=max_contexts,
-        temperature=temperature,
-    )
-
-    nbest = Nbest.from_lattice(
-        lattice=lattice,
-        num_paths=num_paths,
-        use_double_scores=use_double_scores,
-        nbest_scale=nbest_scale,
-    )
-    # at this point, nbest.fsa.scores are all zeros.
-
-    nbest = nbest.intersect(lattice)
-    # Now nbest.fsa.scores contains acoustic scores
-
-    am_scores = nbest.tot_scores()
-
-    # Now we need to compute the LM scores of each path.
-    # (1) Get the token IDs of each Path. We assume the decoding_graph
-    # is an acceptor, i.e., lattice is also an acceptor
-    tokens_shape = nbest.fsa.arcs.shape().remove_axis(1)  # [path][arc]
-
-    tokens = k2.RaggedTensor(tokens_shape, nbest.fsa.labels.contiguous())
-    tokens = tokens.remove_values_leq(0)  # remove -1 and 0
-
-    token_list: List[List[int]] = tokens.tolist()
-    word_list: List[List[str]] = sp.decode(token_list)
-
-    assert isinstance(oov_word, str), oov_word
-    assert oov_word in word_table, oov_word
-    oov_word_id = word_table[oov_word]
-
-    word_ids_list: List[List[int]] = []
-
-    for words in word_list:
-        this_word_ids = []
-        for w in words.split():
-            if w in word_table:
-                this_word_ids.append(word_table[w])
-            else:
-                this_word_ids.append(oov_word_id)
-        word_ids_list.append(this_word_ids)
-
-    word_fsas = k2.linear_fsa(word_ids_list, device=lattice.device)
-    word_fsas_with_self_loops = k2.add_epsilon_self_loops(word_fsas)
-
-    num_unique_paths = len(word_ids_list)
-
-    b_to_a_map = torch.zeros(
-        num_unique_paths,
-        dtype=torch.int32,
-        device=lattice.device,
-    )
-
-    rescored_word_fsas = k2.intersect_device(
-        a_fsas=G,
-        b_fsas=word_fsas_with_self_loops,
-        b_to_a_map=b_to_a_map,
-        sorted_match_a=True,
-        ret_arc_maps=False,
-    )
-
-    rescored_word_fsas = k2.remove_epsilon_self_loops(rescored_word_fsas)
-    rescored_word_fsas = k2.top_sort(k2.connect(rescored_word_fsas))
-    ngram_lm_scores = rescored_word_fsas.get_tot_scores(
-        use_double_scores=True,
-        log_semiring=False,
-    )
-
-    ans: Dict[str, Union[List[List[int]], DecodingResults]] = {}
-    for s in ngram_lm_scale_list:
-        key = f"ngram_lm_scale_{s}"
-        tot_scores = am_scores.values + s * ngram_lm_scores
-        ragged_tot_scores = k2.RaggedTensor(nbest.shape, tot_scores)
-        max_indexes = ragged_tot_scores.argmax()
-        best_path = k2.index_fsa(nbest.fsa, max_indexes)
-
-        if not return_timestamps:
-            ans[key] = get_texts(best_path)
-        else:
-            ans[key] = get_texts_with_timestamp(best_path)
-
-    return ans
-
-
-def fast_beam_search_with_nbest_rnn_rescoring(
-    model: nn.Module,
-    decoding_graph: k2.Fsa,
-    encoder_out: torch.Tensor,
-    encoder_out_lens: torch.Tensor,
-    beam: float,
-    max_states: int,
-    max_contexts: int,
-    ngram_lm_scale_list: List[float],
-    num_paths: int,
-    G: k2.Fsa,
-    sp: spm.SentencePieceProcessor,
-    word_table: k2.SymbolTable,
-    rnn_lm_model: torch.nn.Module,
-    rnn_lm_scale_list: List[float],
-    oov_word: str = "<UNK>",
-    use_double_scores: bool = True,
-    nbest_scale: float = 0.5,
-    temperature: float = 1.0,
-    return_timestamps: bool = False,
-) -> Dict[str, Union[List[List[int]], DecodingResults]]:
-    """It limits the maximum number of symbols per frame to 1.
-    A lattice is first obtained using fast beam search, num_path are selected
-    and rescored using a given language model and a rnn-lm.
-    The shortest path within the lattice is used as the final output.
-
-    Args:
-      model:
-        An instance of `Transducer`.
-      decoding_graph:
-        Decoding graph used for decoding, may be a TrivialGraph or a LG.
-      encoder_out:
-        A tensor of shape (N, T, C) from the encoder.
-      encoder_out_lens:
-        A tensor of shape (N,) containing the number of frames in `encoder_out`
-        before padding.
-      beam:
-        Beam value, similar to the beam used in Kaldi.
-      max_states:
-        Max states per stream per frame.
-      max_contexts:
-        Max contexts pre stream per frame.
-      ngram_lm_scale_list:
-        A list of floats representing LM score scales.
-      num_paths:
-        Number of paths to extract from the decoded lattice.
-      G:
-        An FsaVec containing only a single FSA. It is an n-gram LM.
-      sp:
-        The BPE model.
-      word_table:
-        The word symbol table.
-      rnn_lm_model:
-        A rnn-lm model used for LM rescoring
-      rnn_lm_scale_list:
-        A list of floats representing RNN score scales.
-      oov_word:
-        OOV words are replaced with this word.
-      use_double_scores:
-        True to use double precision for computation. False to use
-        single precision.
-      nbest_scale:
-        It's the scale applied to the lattice.scores. A smaller value
-        yields more unique paths.
-      temperature:
-        Softmax temperature.
-      return_timestamps:
-        Whether to return timestamps.
-    Returns:
-      Return the decoded result in a dict, where the key has the form
-      'ngram_lm_scale_xx' and the value is the decoded results
-      optionally with timestamps. `xx` is the ngram LM scale value
-      used during decoding, i.e., 0.1.
-    """
-    lattice = fast_beam_search(
-        model=model,
-        decoding_graph=decoding_graph,
-        encoder_out=encoder_out,
-        encoder_out_lens=encoder_out_lens,
-        beam=beam,
-        max_states=max_states,
-        max_contexts=max_contexts,
-        temperature=temperature,
-    )
-
-    nbest = Nbest.from_lattice(
-        lattice=lattice,
-        num_paths=num_paths,
-        use_double_scores=use_double_scores,
-        nbest_scale=nbest_scale,
-    )
-    # at this point, nbest.fsa.scores are all zeros.
-
-    nbest = nbest.intersect(lattice)
-    # Now nbest.fsa.scores contains acoustic scores
-
-    am_scores = nbest.tot_scores()
-
-    # Now we need to compute the LM scores of each path.
-    # (1) Get the token IDs of each Path. We assume the decoding_graph
-    # is an acceptor, i.e., lattice is also an acceptor
-    tokens_shape = nbest.fsa.arcs.shape().remove_axis(1)  # [path][arc]
-
-    tokens = k2.RaggedTensor(tokens_shape, nbest.fsa.labels.contiguous())
-    tokens = tokens.remove_values_leq(0)  # remove -1 and 0
-
-    token_list: List[List[int]] = tokens.tolist()
-    word_list: List[List[str]] = sp.decode(token_list)
-
-    assert isinstance(oov_word, str), oov_word
-    assert oov_word in word_table, oov_word
-    oov_word_id = word_table[oov_word]
-
-    word_ids_list: List[List[int]] = []
-
-    for words in word_list:
-        this_word_ids = []
-        for w in words.split():
-            if w in word_table:
-                this_word_ids.append(word_table[w])
-            else:
-                this_word_ids.append(oov_word_id)
-        word_ids_list.append(this_word_ids)
-
-    word_fsas = k2.linear_fsa(word_ids_list, device=lattice.device)
-    word_fsas_with_self_loops = k2.add_epsilon_self_loops(word_fsas)
-
-    num_unique_paths = len(word_ids_list)
-
-    b_to_a_map = torch.zeros(
-        num_unique_paths,
-        dtype=torch.int32,
-        device=lattice.device,
-    )
-
-    rescored_word_fsas = k2.intersect_device(
-        a_fsas=G,
-        b_fsas=word_fsas_with_self_loops,
-        b_to_a_map=b_to_a_map,
-        sorted_match_a=True,
-        ret_arc_maps=False,
-    )
-
-    rescored_word_fsas = k2.remove_epsilon_self_loops(rescored_word_fsas)
-    rescored_word_fsas = k2.top_sort(k2.connect(rescored_word_fsas))
-    ngram_lm_scores = rescored_word_fsas.get_tot_scores(
-        use_double_scores=True,
-        log_semiring=False,
-    )
-
-    # Now RNN-LM
-    blank_id = model.decoder.blank_id
-    sos_id = sp.piece_to_id("sos_id")
-    eos_id = sp.piece_to_id("eos_id")
-
-    sos_tokens = add_sos(tokens, sos_id)
-    tokens_eos = add_eos(tokens, eos_id)
-    sos_tokens_row_splits = sos_tokens.shape.row_splits(1)
-    sentence_lengths = sos_tokens_row_splits[1:] - sos_tokens_row_splits[:-1]
-
-    x_tokens = sos_tokens.pad(mode="constant", padding_value=blank_id)
-    y_tokens = tokens_eos.pad(mode="constant", padding_value=blank_id)
-
-    x_tokens = x_tokens.to(torch.int64)
-    y_tokens = y_tokens.to(torch.int64)
-    sentence_lengths = sentence_lengths.to(torch.int64)
-
-    rnn_lm_nll = rnn_lm_model(x=x_tokens, y=y_tokens, lengths=sentence_lengths)
-    assert rnn_lm_nll.ndim == 2
-    assert rnn_lm_nll.shape[0] == len(token_list)
-    rnn_lm_scores = -1 * rnn_lm_nll.sum(dim=1)
-
-    ans: Dict[str, List[List[int]]] = {}
-    for n_scale in ngram_lm_scale_list:
-        for rnn_scale in rnn_lm_scale_list:
-            key = f"ngram_lm_scale_{n_scale}_rnn_lm_scale_{rnn_scale}"
-            tot_scores = (
-                am_scores.values + n_scale * ngram_lm_scores + rnn_scale * rnn_lm_scores
-            )
-            ragged_tot_scores = k2.RaggedTensor(nbest.shape, tot_scores)
-            max_indexes = ragged_tot_scores.argmax()
-            best_path = k2.index_fsa(nbest.fsa, max_indexes)
-
-            if not return_timestamps:
-                ans[key] = get_texts(best_path)
-            else:
-                ans[key] = get_texts_with_timestamp(best_path)
-
-    return ans
-
-
-def modified_beam_search_ngram_rescoring(
-    model: nn.Module,
-    encoder_out: torch.Tensor,
-    encoder_out_lens: torch.Tensor,
-    ngram_lm: NgramLm,
-    ngram_lm_scale: float,
-    beam: int = 4,
-    temperature: float = 1.0,
-) -> List[List[int]]:
-    """Beam search in batch mode with --max-sym-per-frame=1 being hardcoded.
-
-    Args:
-      model:
-        The transducer model.
-      encoder_out:
-        Output from the encoder. Its shape is (N, T, C).
-      encoder_out_lens:
-        A 1-D tensor of shape (N,), containing number of valid frames in
-        encoder_out before padding.
-      beam:
-        Number of active paths during the beam search.
-      temperature:
-        Softmax temperature.
-    Returns:
-      Return a list-of-list of token IDs. ans[i] is the decoding results
-      for the i-th utterance.
-    """
-    assert encoder_out.ndim == 3, encoder_out.shape
-    assert encoder_out.size(0) >= 1, encoder_out.size(0)
-
-    packed_encoder_out = torch.nn.utils.rnn.pack_padded_sequence(
-        input=encoder_out,
-        lengths=encoder_out_lens.cpu(),
-        batch_first=True,
-        enforce_sorted=False,
-    )
-
-    blank_id = model.decoder.blank_id
-    unk_id = getattr(model, "unk_id", blank_id)
-    context_size = model.decoder.context_size
-    device = next(model.parameters()).device
-    lm_scale = ngram_lm_scale
-
-    batch_size_list = packed_encoder_out.batch_sizes.tolist()
-    N = encoder_out.size(0)
-    assert torch.all(encoder_out_lens > 0), encoder_out_lens
-    assert N == batch_size_list[0], (N, batch_size_list)
-
-    B = [HypothesisList() for _ in range(N)]
-    for i in range(N):
-        B[i].add(
-            Hypothesis(
-                ys=[-1] * (context_size - 1) + [blank_id],
-                log_prob=torch.zeros(1, dtype=torch.float32, device=device),
-                state_cost=NgramLmStateCost(ngram_lm),
-            )
-        )
-
-    encoder_out = model.joiner.encoder_proj(packed_encoder_out.data)
-
-    offset = 0
-    finalized_B = []
-    for batch_size in batch_size_list:
-        start = offset
-        end = offset + batch_size
-        current_encoder_out = encoder_out.data[start:end]
-        current_encoder_out = current_encoder_out.unsqueeze(1).unsqueeze(1)
-        # current_encoder_out's shape is (batch_size, 1, 1, encoder_out_dim)
-        offset = end
-
-        finalized_B = B[batch_size:] + finalized_B
-        B = B[:batch_size]
-
-        hyps_shape = get_hyps_shape(B).to(device)
-
-        A = [list(b) for b in B]
-        B = [HypothesisList() for _ in range(batch_size)]
-
-        ys_log_probs = torch.cat(
-            [
-                hyp.log_prob.reshape(1, 1) + hyp.state_cost.lm_score * lm_scale
-                for hyps in A
-                for hyp in hyps
-            ]
-        )  # (num_hyps, 1)
-
-        decoder_input = torch.tensor(
-            [hyp.ys[-context_size:] for hyps in A for hyp in hyps],
-            device=device,
-            dtype=torch.int64,
-        )  # (num_hyps, context_size)
-
-        decoder_out = model.decoder(decoder_input, need_pad=False).unsqueeze(1)
-        decoder_out = model.joiner.decoder_proj(decoder_out)
-        # decoder_out is of shape (num_hyps, 1, 1, joiner_dim)
-
-        # Note: For torch 1.7.1 and below, it requires a torch.int64 tensor
-        # as index, so we use `to(torch.int64)` below.
-        current_encoder_out = torch.index_select(
-            current_encoder_out,
-            dim=0,
-            index=hyps_shape.row_ids(1).to(torch.int64),
-        )  # (num_hyps, 1, 1, encoder_out_dim)
-
-        logits = model.joiner(
-            current_encoder_out,
-            decoder_out,
-            project_input=False,
-        )  # (num_hyps, 1, 1, vocab_size)
-
-        logits = logits.squeeze(1).squeeze(1)  # (num_hyps, vocab_size)
-
-        log_probs = (logits / temperature).log_softmax(dim=-1)  # (num_hyps, vocab_size)
-
-        log_probs.add_(ys_log_probs)
-        vocab_size = log_probs.size(-1)
-        log_probs = log_probs.reshape(-1)
-
-        row_splits = hyps_shape.row_splits(1) * vocab_size
-        log_probs_shape = k2.ragged.create_ragged_shape2(
-            row_splits=row_splits, cached_tot_size=log_probs.numel()
-        )
-        ragged_log_probs = k2.RaggedTensor(shape=log_probs_shape, value=log_probs)
-
-        for i in range(batch_size):
-            topk_log_probs, topk_indexes = ragged_log_probs[i].topk(beam)
-
-            with warnings.catch_warnings():
-                warnings.simplefilter("ignore")
-                topk_hyp_indexes = (topk_indexes // vocab_size).tolist()
-                topk_token_indexes = (topk_indexes % vocab_size).tolist()
-
-            for k in range(len(topk_hyp_indexes)):
-                hyp_idx = topk_hyp_indexes[k]
-                hyp = A[i][hyp_idx]
-
-                new_ys = hyp.ys[:]
-                new_token = topk_token_indexes[k]
-                if new_token not in (blank_id, unk_id):
-                    new_ys.append(new_token)
-                    state_cost = hyp.state_cost.forward_one_step(new_token)
-                else:
-                    state_cost = hyp.state_cost
-
-                # We only keep AM scores in new_hyp.log_prob
-                new_log_prob = topk_log_probs[k] - hyp.state_cost.lm_score * lm_scale
-
-                new_hyp = Hypothesis(
-                    ys=new_ys, log_prob=new_log_prob, state_cost=state_cost
-                )
-                B[i].add(new_hyp)
-
-    B = B + finalized_B
-    best_hyps = [b.get_most_probable(length_norm=True) for b in B]
-
-    sorted_ans = [h.ys[context_size:] for h in best_hyps]
-    ans = []
-    unsorted_indices = packed_encoder_out.unsorted_indices.tolist()
-    for i in range(N):
-        ans.append(sorted_ans[unsorted_indices[i]])
-
-    return ans
-
-
-def modified_beam_search_LODR(
-    model: nn.Module,
-    encoder_out: torch.Tensor,
-    encoder_out_lens: torch.Tensor,
-    LODR_lm: NgramLm,
-    LODR_lm_scale: float,
-    LM: LmScorer,
-    beam: int = 4,
-    context_graph: Optional[ContextGraph] = None,
-) -> List[List[int]]:
-    """This function implements LODR (https://arxiv.org/abs/2203.16776) with
-    `modified_beam_search`. It uses a bi-gram language model as the estimate
-    of the internal language model and subtracts its score during shallow fusion
-    with an external language model. This implementation uses a RNNLM as the
-    external language model.
-
-    Args:
-        model (Transducer):
-            The transducer model
-        encoder_out (torch.Tensor):
-            Encoder output in (N,T,C)
-        encoder_out_lens (torch.Tensor):
-            A 1-D tensor of shape (N,), containing the number of
-            valid frames in encoder_out before padding.
-        LODR_lm:
-            A low order n-gram LM, whose score will be subtracted during shallow fusion
-        LODR_lm_scale:
-            The scale of the LODR_lm
-        LM:
-            A neural net LM, e.g an RNNLM or transformer LM
-        beam (int, optional):
-            Beam size. Defaults to 4.
-
-    Returns:
-      Return a list-of-list of token IDs. ans[i] is the decoding results
-      for the i-th utterance.
-
-    """
-    assert encoder_out.ndim == 3, encoder_out.shape
-    assert encoder_out.size(0) >= 1, encoder_out.size(0)
-    assert LM is not None
-    lm_scale = LM.lm_scale
-
-    packed_encoder_out = torch.nn.utils.rnn.pack_padded_sequence(
-        input=encoder_out,
-        lengths=encoder_out_lens.cpu(),
-        batch_first=True,
-        enforce_sorted=False,
-    )
-
-    blank_id = model.decoder.blank_id
-    sos_id = getattr(LM, "sos_id", 1)
-    unk_id = getattr(model, "unk_id", blank_id)
-    context_size = model.decoder.context_size
-    device = next(model.parameters()).device
-
-    batch_size_list = packed_encoder_out.batch_sizes.tolist()
-    N = encoder_out.size(0)
-    assert torch.all(encoder_out_lens > 0), encoder_out_lens
-    assert N == batch_size_list[0], (N, batch_size_list)
-
-    # get initial lm score and lm state by scoring the "sos" token
-    sos_token = torch.tensor([[sos_id]]).to(torch.int64).to(device)
-    lens = torch.tensor([1]).to(device)
-    init_score, init_states = LM.score_token(sos_token, lens)
-
-    B = [HypothesisList() for _ in range(N)]
-    for i in range(N):
-        B[i].add(
-            Hypothesis(
-                ys=[-1] * (context_size - 1) + [blank_id],
-                log_prob=torch.zeros(1, dtype=torch.float32, device=device),
-                state=init_states,  # state of the NN LM
-                lm_score=init_score.reshape(-1),
-                state_cost=NgramLmStateCost(
-                    LODR_lm
-                ),  # state of the source domain ngram
-                context_state=None if context_graph is None else context_graph.root,
-            )
-        )
-
-    encoder_out = model.joiner.encoder_proj(packed_encoder_out.data)
-
-    offset = 0
-    finalized_B = []
-    for batch_size in batch_size_list:
-        start = offset
-        end = offset + batch_size
-        current_encoder_out = encoder_out.data[start:end]  # get batch
-        current_encoder_out = current_encoder_out.unsqueeze(1).unsqueeze(1)
-        # current_encoder_out's shape is (batch_size, 1, 1, encoder_out_dim)
-        offset = end
-
-        finalized_B = B[batch_size:] + finalized_B
-        B = B[:batch_size]
-
-        hyps_shape = get_hyps_shape(B).to(device)
-
-        A = [list(b) for b in B]
-        B = [HypothesisList() for _ in range(batch_size)]
-
-        ys_log_probs = torch.cat(
-            [hyp.log_prob.reshape(1, 1) for hyps in A for hyp in hyps]
-        )
-
-        decoder_input = torch.tensor(
-            [hyp.ys[-context_size:] for hyps in A for hyp in hyps],
-            device=device,
-            dtype=torch.int64,
-        )  # (num_hyps, context_size)
-
-        decoder_out = model.decoder(decoder_input, need_pad=False).unsqueeze(1)
-        decoder_out = model.joiner.decoder_proj(decoder_out)
-
-        current_encoder_out = torch.index_select(
-            current_encoder_out,
-            dim=0,
-            index=hyps_shape.row_ids(1).to(torch.int64),
-        )  # (num_hyps, 1, 1, encoder_out_dim)
-
-        logits = model.joiner(
-            current_encoder_out,
-            decoder_out,
-            project_input=False,
-        )  # (num_hyps, 1, 1, vocab_size)
-
-        logits = logits.squeeze(1).squeeze(1)  # (num_hyps, vocab_size)
-
-        log_probs = logits.log_softmax(dim=-1)  # (num_hyps, vocab_size)
-
-        log_probs.add_(ys_log_probs)
-
-        vocab_size = log_probs.size(-1)
-
-        log_probs = log_probs.reshape(-1)
-
-        row_splits = hyps_shape.row_splits(1) * vocab_size
-        log_probs_shape = k2.ragged.create_ragged_shape2(
-            row_splits=row_splits, cached_tot_size=log_probs.numel()
-        )
-        ragged_log_probs = k2.RaggedTensor(shape=log_probs_shape, value=log_probs)
-        """
-        for all hyps with a non-blank new token, score this token.
-        It is a little confusing here because this for-loop
-        looks very similar to the one below. Here, we go through all
-        top-k tokens and only add the non-blanks ones to the token_list.
-        LM will score those tokens given the LM states. Note that
-        the variable `scores` is the LM score after seeing the new
-        non-blank token.
-        """
-        token_list = []
-        hs = []
-        cs = []
-        for i in range(batch_size):
-            topk_log_probs, topk_indexes = ragged_log_probs[i].topk(beam)
-
-            with warnings.catch_warnings():
-                warnings.simplefilter("ignore")
-                topk_hyp_indexes = (topk_indexes // vocab_size).tolist()
-                topk_token_indexes = (topk_indexes % vocab_size).tolist()
-            for k in range(len(topk_hyp_indexes)):
-                hyp_idx = topk_hyp_indexes[k]
-                hyp = A[i][hyp_idx]
-
-                new_token = topk_token_indexes[k]
-                if new_token not in (blank_id, unk_id):
-                    if LM.lm_type == "rnn":
-                        token_list.append([new_token])
-                        # store the LSTM states
-                        hs.append(hyp.state[0])
-                        cs.append(hyp.state[1])
-                    else:
-                        # for transformer LM
-                        token_list.append(
-                            [sos_id] + hyp.ys[context_size:] + [new_token]
-                        )
-
-        # forward NN LM to get new states and scores
-        if len(token_list) != 0:
-            x_lens = torch.tensor([len(tokens) for tokens in token_list]).to(device)
-            if LM.lm_type == "rnn":
-                tokens_to_score = (
-                    torch.tensor(token_list).to(torch.int64).to(device).reshape(-1, 1)
-                )
-                hs = torch.cat(hs, dim=1).to(device)
-                cs = torch.cat(cs, dim=1).to(device)
-                state = (hs, cs)
-            else:
-                # for transformer LM
-                tokens_list = [torch.tensor(tokens) for tokens in token_list]
-                tokens_to_score = (
-                    torch.nn.utils.rnn.pad_sequence(
-                        tokens_list, batch_first=True, padding_value=0.0
-                    )
-                    .to(device)
-                    .to(torch.int64)
-                )
-
-                state = None
-
-            scores, lm_states = LM.score_token(tokens_to_score, x_lens, state)
-
-        count = 0  # index, used to locate score and lm states
-        for i in range(batch_size):
-            topk_log_probs, topk_indexes = ragged_log_probs[i].topk(beam)
-
-            with warnings.catch_warnings():
-                warnings.simplefilter("ignore")
-                topk_hyp_indexes = (topk_indexes // vocab_size).tolist()
-                topk_token_indexes = (topk_indexes % vocab_size).tolist()
-
-            for k in range(len(topk_hyp_indexes)):
-                hyp_idx = topk_hyp_indexes[k]
-                hyp = A[i][hyp_idx]
-
-                ys = hyp.ys[:]
-
-                # current score of hyp
-                lm_score = hyp.lm_score
-                state = hyp.state
-
-                hyp_log_prob = topk_log_probs[k]  # get score of current hyp
-                new_token = topk_token_indexes[k]
-
-                context_score = 0
-                new_context_state = None if context_graph is None else hyp.context_state
-                if new_token not in (blank_id, unk_id):
-                    if context_graph is not None:
-                        (
-                            context_score,
-                            new_context_state,
-                        ) = context_graph.forward_one_step(hyp.context_state, new_token)
-
-                    ys.append(new_token)
-                    state_cost = hyp.state_cost.forward_one_step(new_token)
-
-                    # calculate the score of the latest token
-                    current_ngram_score = state_cost.lm_score - hyp.state_cost.lm_score
-
-                    assert current_ngram_score <= 0.0, (
-                        state_cost.lm_score,
-                        hyp.state_cost.lm_score,
-                    )
-                    # score = score + TDLM_score - LODR_score
-                    # LODR_LM_scale should be a negative number here
-                    hyp_log_prob += (
-                        lm_score[new_token] * lm_scale
-                        + LODR_lm_scale * current_ngram_score
-                        + context_score
-                    )  # add the lm score
-
-                    lm_score = scores[count]
-                    if LM.lm_type == "rnn":
-                        state = (
-                            lm_states[0][:, count, :].unsqueeze(1),
-                            lm_states[1][:, count, :].unsqueeze(1),
-                        )
-                    count += 1
-                else:
-                    state_cost = hyp.state_cost
-
-                new_hyp = Hypothesis(
-                    ys=ys,
-                    log_prob=hyp_log_prob,
-                    state=state,
-                    lm_score=lm_score,
-                    state_cost=state_cost,
-                    context_state=new_context_state,
-                )
-                B[i].add(new_hyp)
-
-    B = B + finalized_B
-
-    # finalize context_state, if the matched contexts do not reach final state
-    # we need to add the score on the corresponding backoff arc
-    if context_graph is not None:
-        finalized_B = [HypothesisList() for _ in range(len(B))]
-        for i, hyps in enumerate(B):
-            for hyp in list(hyps):
-                context_score, new_context_state = context_graph.finalize(
-                    hyp.context_state
-                )
-                finalized_B[i].add(
-                    Hypothesis(
-                        ys=hyp.ys,
-                        log_prob=hyp.log_prob + context_score,
-                        timestamp=hyp.timestamp,
-                        context_state=new_context_state,
-                    )
-                )
-        B = finalized_B
-
-    best_hyps = [b.get_most_probable(length_norm=True) for b in B]
-
-    sorted_ans = [h.ys[context_size:] for h in best_hyps]
-    ans = []
-    unsorted_indices = packed_encoder_out.unsorted_indices.tolist()
-    for i in range(N):
-        ans.append(sorted_ans[unsorted_indices[i]])
-
-    return ans
-
-
-def modified_beam_search_lm_shallow_fusion(
-    model: nn.Module,
-    encoder_out: torch.Tensor,
-    encoder_out_lens: torch.Tensor,
-    LM: LmScorer,
-    beam: int = 4,
-    return_timestamps: bool = False,
-) -> List[List[int]]:
-    """Modified_beam_search + NN LM shallow fusion
-
-    Args:
-        model (Transducer):
-            The transducer model
-        encoder_out (torch.Tensor):
-            Encoder output in (N,T,C)
-        encoder_out_lens (torch.Tensor):
-            A 1-D tensor of shape (N,), containing the number of
-            valid frames in encoder_out before padding.
-        sp:
-            Sentence piece generator.
-        LM (LmScorer):
-            A neural net LM, e.g RNN or Transformer
-        beam (int, optional):
-            Beam size. Defaults to 4.
-
-    Returns:
-      Return a list-of-list of token IDs. ans[i] is the decoding results
-      for the i-th utterance.
-    """
-    assert encoder_out.ndim == 3, encoder_out.shape
-    assert encoder_out.size(0) >= 1, encoder_out.size(0)
-    assert LM is not None
-    lm_scale = LM.lm_scale
-
-    packed_encoder_out = torch.nn.utils.rnn.pack_padded_sequence(
-        input=encoder_out,
-        lengths=encoder_out_lens.cpu(),
-        batch_first=True,
-        enforce_sorted=False,
-    )
-
-    blank_id = model.decoder.blank_id
-    sos_id = getattr(LM, "sos_id", 1)
-    unk_id = getattr(model, "unk_id", blank_id)
-    context_size = model.decoder.context_size
-    device = next(model.parameters()).device
-
-    batch_size_list = packed_encoder_out.batch_sizes.tolist()
-    N = encoder_out.size(0)
-    assert torch.all(encoder_out_lens > 0), encoder_out_lens
-    assert N == batch_size_list[0], (N, batch_size_list)
-
-    # get initial lm score and lm state by scoring the "sos" token
-    sos_token = torch.tensor([[sos_id]]).to(torch.int64).to(device)
-    lens = torch.tensor([1]).to(device)
-    init_score, init_states = LM.score_token(sos_token, lens)
-
-    B = [HypothesisList() for _ in range(N)]
-    for i in range(N):
-        B[i].add(
-            Hypothesis(
-                ys=[-1] * (context_size - 1) + [blank_id],
-                log_prob=torch.zeros(1, dtype=torch.float32, device=device),
-                state=init_states,
-                lm_score=init_score.reshape(-1),
-                timestamp=[],
-            )
-        )
-
-    encoder_out = model.joiner.encoder_proj(packed_encoder_out.data)
-
-    offset = 0
-    finalized_B = []
-    for t, batch_size in enumerate(batch_size_list):
-        start = offset
-        end = offset + batch_size
-        current_encoder_out = encoder_out.data[start:end]  # get batch
-        current_encoder_out = current_encoder_out.unsqueeze(1).unsqueeze(1)
-        # current_encoder_out's shape is (batch_size, 1, 1, encoder_out_dim)
-        offset = end
-
-        finalized_B = B[batch_size:] + finalized_B
-        B = B[:batch_size]
-
-        hyps_shape = get_hyps_shape(B).to(device)
-
-        A = [list(b) for b in B]
-        B = [HypothesisList() for _ in range(batch_size)]
-
-        ys_log_probs = torch.cat(
-            [hyp.log_prob.reshape(1, 1) for hyps in A for hyp in hyps]
-        )
-
-        lm_scores = torch.cat(
-            [hyp.lm_score.reshape(1, -1) for hyps in A for hyp in hyps]
-        )
-
-        decoder_input = torch.tensor(
-            [hyp.ys[-context_size:] for hyps in A for hyp in hyps],
-            device=device,
-            dtype=torch.int64,
-        )  # (num_hyps, context_size)
-
-        decoder_out = model.decoder(decoder_input, need_pad=False).unsqueeze(1)
-        decoder_out = model.joiner.decoder_proj(decoder_out)
-
-        current_encoder_out = torch.index_select(
-            current_encoder_out,
-            dim=0,
-            index=hyps_shape.row_ids(1).to(torch.int64),
-        )  # (num_hyps, 1, 1, encoder_out_dim)
-
-        logits = model.joiner(
-            current_encoder_out,
-            decoder_out,
-            project_input=False,
-        )  # (num_hyps, 1, 1, vocab_size)
-
-        logits = logits.squeeze(1).squeeze(1)  # (num_hyps, vocab_size)
-
-        log_probs = logits.log_softmax(dim=-1)  # (num_hyps, vocab_size)
-
-        log_probs.add_(ys_log_probs)
-
-        vocab_size = log_probs.size(-1)
-
-        log_probs = log_probs.reshape(-1)
-
-        row_splits = hyps_shape.row_splits(1) * vocab_size
-        log_probs_shape = k2.ragged.create_ragged_shape2(
-            row_splits=row_splits, cached_tot_size=log_probs.numel()
-        )
-        ragged_log_probs = k2.RaggedTensor(shape=log_probs_shape, value=log_probs)
-        """
-        for all hyps with a non-blank new token, score this token.
-        It is a little confusing here because this for-loop
-        looks very similar to the one below. Here, we go through all
-        top-k tokens and only add the non-blanks ones to the token_list.
-        `LM` will score those tokens given the LM states. Note that
-        the variable `scores` is the LM score after seeing the new
-        non-blank token.
-        """
-        token_list = []  # a list of list
-        hs = []
-        cs = []
-        for i in range(batch_size):
-            topk_log_probs, topk_indexes = ragged_log_probs[i].topk(beam)
-
-            with warnings.catch_warnings():
-                warnings.simplefilter("ignore")
-                topk_hyp_indexes = (topk_indexes // vocab_size).tolist()
-                topk_token_indexes = (topk_indexes % vocab_size).tolist()
-            for k in range(len(topk_hyp_indexes)):
-                hyp_idx = topk_hyp_indexes[k]
-                hyp = A[i][hyp_idx]
-
-                new_token = topk_token_indexes[k]
-                if new_token not in (blank_id, unk_id):
-                    if LM.lm_type == "rnn":
-                        token_list.append([new_token])
-                        # store the LSTM states
-                        hs.append(hyp.state[0])
-                        cs.append(hyp.state[1])
-                    else:
-                        # for transformer LM
-                        token_list.append(
-                            [sos_id] + hyp.ys[context_size:] + [new_token]
-                        )
-
-        if len(token_list) != 0:
-            x_lens = torch.tensor([len(tokens) for tokens in token_list]).to(device)
-            if LM.lm_type == "rnn":
-                tokens_to_score = (
-                    torch.tensor(token_list).to(torch.int64).to(device).reshape(-1, 1)
-                )
-                hs = torch.cat(hs, dim=1).to(device)
-                cs = torch.cat(cs, dim=1).to(device)
-                state = (hs, cs)
-            else:
-                # for transformer LM
-                tokens_list = [torch.tensor(tokens) for tokens in token_list]
-                tokens_to_score = (
-                    torch.nn.utils.rnn.pad_sequence(
-                        tokens_list, batch_first=True, padding_value=0.0
-                    )
-                    .to(device)
-                    .to(torch.int64)
-                )
-
-                state = None
-
-            scores, lm_states = LM.score_token(tokens_to_score, x_lens, state)
-
-        count = 0  # index, used to locate score and lm states
-        for i in range(batch_size):
-            topk_log_probs, topk_indexes = ragged_log_probs[i].topk(beam)
-
-            with warnings.catch_warnings():
-                warnings.simplefilter("ignore")
-                topk_hyp_indexes = (topk_indexes // vocab_size).tolist()
-                topk_token_indexes = (topk_indexes % vocab_size).tolist()
-
-            for k in range(len(topk_hyp_indexes)):
-                hyp_idx = topk_hyp_indexes[k]
-                hyp = A[i][hyp_idx]
-
-                ys = hyp.ys[:]
-
-                lm_score = hyp.lm_score
-                state = hyp.state
-
-                hyp_log_prob = topk_log_probs[k]  # get score of current hyp
-                new_token = topk_token_indexes[k]
-                new_timestamp = hyp.timestamp[:]
-                if new_token not in (blank_id, unk_id):
-                    ys.append(new_token)
-                    new_timestamp.append(t)
-
-                    hyp_log_prob += lm_score[new_token] * lm_scale  # add the lm score
-
-                    lm_score = scores[count]
-                    if LM.lm_type == "rnn":
-                        state = (
-                            lm_states[0][:, count, :].unsqueeze(1),
-                            lm_states[1][:, count, :].unsqueeze(1),
-                        )
-                    count += 1
-
-                new_hyp = Hypothesis(
-                    ys=ys,
-                    log_prob=hyp_log_prob,
-                    state=state,
-                    lm_score=lm_score,
-                    timestamp=new_timestamp,
-                )
-                B[i].add(new_hyp)
-
-    B = B + finalized_B
-    best_hyps = [b.get_most_probable(length_norm=True) for b in B]
-
-    sorted_ans = [h.ys[context_size:] for h in best_hyps]
-    sorted_timestamps = [h.timestamp for h in best_hyps]
-    ans = []
-    ans_timestamps = []
-    unsorted_indices = packed_encoder_out.unsorted_indices.tolist()
-    for i in range(N):
-        ans.append(sorted_ans[unsorted_indices[i]])
-        ans_timestamps.append(sorted_timestamps[unsorted_indices[i]])
-
-    if not return_timestamps:
-        return ans
-    else:
-        return DecodingResults(
-            hyps=ans,
-            timestamps=ans_timestamps,
-        )
diff --git a/egs/librispeech/SSL/hubert/beam_search.py b/egs/librispeech/SSL/hubert/beam_search.py
new file mode 120000
index 000000000..f4d4b5732
--- /dev/null
+++ b/egs/librispeech/SSL/hubert/beam_search.py
@@ -0,0 +1 @@
+../../ASR/zipformer/beam_search.py
\ No newline at end of file
diff --git a/egs/librispeech/SSL/hubert/ctc_decode.py b/egs/librispeech/SSL/hubert/ctc_decode.py
index 1f0f9bfac..f3a17be2a 100644
--- a/egs/librispeech/SSL/hubert/ctc_decode.py
+++ b/egs/librispeech/SSL/hubert/ctc_decode.py
@@ -22,39 +22,39 @@
 Usage:
 
 (1) ctc-decoding
-./zipformer/ctc_decode.py \
+./hubert/ctc_decode.py \
     --epoch 30 \
     --avg 15 \
-    --exp-dir ./zipformer/exp \
+    --exp-dir ./hubert/exp \
     --use-ctc 1 \
     --max-duration 600 \
     --decoding-method ctc-decoding
 
 (2) 1best
-./zipformer/ctc_decode.py \
+./hubert/ctc_decode.py \
     --epoch 30 \
     --avg 15 \
-    --exp-dir ./zipformer/exp \
+    --exp-dir ./hubert/exp \
     --use-ctc 1 \
     --max-duration 600 \
     --hlg-scale 0.6 \
     --decoding-method 1best
 
 (3) nbest
-./zipformer/ctc_decode.py \
+./hubert/ctc_decode.py \
     --epoch 30 \
     --avg 15 \
-    --exp-dir ./zipformer/exp \
+    --exp-dir ./hubert/exp \
     --use-ctc 1 \
     --max-duration 600 \
     --hlg-scale 0.6 \
     --decoding-method nbest
 
 (4) nbest-rescoring
-./zipformer/ctc_decode.py \
+./hubert/ctc_decode.py \
     --epoch 30 \
     --avg 15 \
-    --exp-dir ./zipformer/exp \
+    --exp-dir ./hubert/exp \
     --use-ctc 1 \
     --max-duration 600 \
     --hlg-scale 0.6 \
@@ -63,10 +63,10 @@ Usage:
     --decoding-method nbest-rescoring
 
 (5) whole-lattice-rescoring
-./zipformer/ctc_decode.py \
+./hubert/ctc_decode.py \
     --epoch 30 \
     --avg 15 \
-    --exp-dir ./zipformer/exp \
+    --exp-dir ./hubert/exp \
     --use-ctc 1 \
     --max-duration 600 \
     --hlg-scale 0.6 \
@@ -164,7 +164,7 @@ def get_parser():
     parser.add_argument(
         "--exp-dir",
         type=str,
-        default="zipformer/exp",
+        default="hubert/exp",
         help="The experiment dir",
     )
 
@@ -340,7 +340,7 @@ def decode_one_batch(
     feature_lens = supervisions["num_frames"].to(device)
 
     if params.causal:
-        # this seems to cause insertions at the end of the utterance if used with zipformer.
+        # this seems to cause insertions at the end of the utterance if used with hubert.
         pad_len = 30
         feature_lens += pad_len
         feature = torch.nn.functional.pad(
diff --git a/egs/librispeech/SSL/hubert/dataset.py b/egs/librispeech/SSL/hubert/dataset.py
index 106b27a2c..c3442df51 100644
--- a/egs/librispeech/SSL/hubert/dataset.py
+++ b/egs/librispeech/SSL/hubert/dataset.py
@@ -92,9 +92,9 @@ class HubertAsrDataset(torch.utils.data.Dataset):
             feature_size=1,
             sampling_rate=16000,
             padding_side="right",
-            padding_value=0.0,
+            padding_value=0,
             do_normalize=True,
-            return_attention_mask=True,
+            return_attention_mask=False,
         )
 
     def __getitem__(self, cuts: CutSet) -> Dict[str, Any]:
@@ -148,7 +148,7 @@ if __name__ == "__main__":
     )
 
     for batch_idx, batch in enumerate(dl):
-        import pdb
-
-        pdb.set_trace()
-        pass
+        print(batch["audio"])
+        print(batch["audio_lens"])
+        print(batch["supervisions"]["text"])
+        print(batch["cuts"])
diff --git a/egs/librispeech/SSL/hubert/decoder.py b/egs/librispeech/SSL/hubert/decoder.py
deleted file mode 100644
index 7ce44495b..000000000
--- a/egs/librispeech/SSL/hubert/decoder.py
+++ /dev/null
@@ -1,134 +0,0 @@
-# Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from scaling import Balancer
-
-
-class Decoder(nn.Module):
-    """This class modifies the stateless decoder from the following paper:
-
-        RNN-transducer with stateless prediction network
-        https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=9054419
-
-    It removes the recurrent connection from the decoder, i.e., the prediction
-    network. Different from the above paper, it adds an extra Conv1d
-    right after the embedding layer.
-
-    TODO: Implement https://arxiv.org/pdf/2109.07513.pdf
-    """
-
-    def __init__(
-        self,
-        vocab_size: int,
-        decoder_dim: int,
-        blank_id: int,
-        context_size: int,
-    ):
-        """
-        Args:
-          vocab_size:
-            Number of tokens of the modeling unit including blank.
-          decoder_dim:
-            Dimension of the input embedding, and of the decoder output.
-          blank_id:
-            The ID of the blank symbol.
-          context_size:
-            Number of previous words to use to predict the next word.
-            1 means bigram; 2 means trigram. n means (n+1)-gram.
-        """
-        super().__init__()
-
-        self.embedding = nn.Embedding(
-            num_embeddings=vocab_size,
-            embedding_dim=decoder_dim,
-        )
-        # the balancers are to avoid any drift in the magnitude of the
-        # embeddings, which would interact badly with parameter averaging.
-        self.balancer = Balancer(
-            decoder_dim,
-            channel_dim=-1,
-            min_positive=0.0,
-            max_positive=1.0,
-            min_abs=0.5,
-            max_abs=1.0,
-            prob=0.05,
-        )
-
-        self.blank_id = blank_id
-
-        assert context_size >= 1, context_size
-        self.context_size = context_size
-        self.vocab_size = vocab_size
-
-        if context_size > 1:
-            self.conv = nn.Conv1d(
-                in_channels=decoder_dim,
-                out_channels=decoder_dim,
-                kernel_size=context_size,
-                padding=0,
-                groups=decoder_dim // 4,  # group size == 4
-                bias=False,
-            )
-            self.balancer2 = Balancer(
-                decoder_dim,
-                channel_dim=-1,
-                min_positive=0.0,
-                max_positive=1.0,
-                min_abs=0.5,
-                max_abs=1.0,
-                prob=0.05,
-            )
-        else:
-            # To avoid `RuntimeError: Module 'Decoder' has no attribute 'conv'`
-            # when inference with torch.jit.script and context_size == 1
-            self.conv = nn.Identity()
-            self.balancer2 = nn.Identity()
-
-    def forward(self, y: torch.Tensor, need_pad: bool = True) -> torch.Tensor:
-        """
-        Args:
-          y:
-            A 2-D tensor of shape (N, U).
-          need_pad:
-            True to left pad the input. Should be True during training.
-            False to not pad the input. Should be False during inference.
-        Returns:
-          Return a tensor of shape (N, U, decoder_dim).
-        """
-        y = y.to(torch.int64)
-        # this stuff about clamp() is a temporary fix for a mismatch
-        # at utterance start, we use negative ids in beam_search.py
-        embedding_out = self.embedding(y.clamp(min=0)) * (y >= 0).unsqueeze(-1)
-
-        embedding_out = self.balancer(embedding_out)
-
-        if self.context_size > 1:
-            embedding_out = embedding_out.permute(0, 2, 1)
-            if need_pad is True:
-                embedding_out = F.pad(embedding_out, pad=(self.context_size - 1, 0))
-            else:
-                # During inference time, there is no need to do extra padding
-                # as we only need one output
-                assert embedding_out.size(-1) == self.context_size
-            embedding_out = self.conv(embedding_out)
-            embedding_out = embedding_out.permute(0, 2, 1)
-            embedding_out = F.relu(embedding_out)
-            embedding_out = self.balancer2(embedding_out)
-
-        return embedding_out
diff --git a/egs/librispeech/SSL/hubert/decoder.py b/egs/librispeech/SSL/hubert/decoder.py
new file mode 120000
index 000000000..a2138e5da
--- /dev/null
+++ b/egs/librispeech/SSL/hubert/decoder.py
@@ -0,0 +1 @@
+../../ASR/zipformer/decoder.py
\ No newline at end of file
diff --git a/egs/librispeech/SSL/hubert/finetune.py b/egs/librispeech/SSL/hubert/finetune.py
index 612a8a235..0c0095f9f 100644
--- a/egs/librispeech/SSL/hubert/finetune.py
+++ b/egs/librispeech/SSL/hubert/finetune.py
@@ -64,7 +64,6 @@ from lhotse.utils import fix_random_seed
 from model import AsrModel
 from optim import Eden, ScaledAdam
 from scaling import ScheduledFloat
-from subsampling import Conv2dSubsampling
 from torch import Tensor
 from torch.cuda.amp import GradScaler
 from torch.nn.parallel import DistributedDataParallel as DDP
@@ -152,7 +151,7 @@ def add_model_arguments(parser: argparse.ArgumentParser):
     parser.add_argument(
         "--do-stable-layer-norm",
         type=str2bool,
-        default=True,
+        default=False,
     )
     parser.add_argument(
         "--feat-extract-activation",
@@ -162,12 +161,12 @@ def add_model_arguments(parser: argparse.ArgumentParser):
     parser.add_argument(
         "--feat-extract-norm",
         type=str,
-        default="layer",
+        default="group",
     )
     parser.add_argument(
         "--feat-proj-dropout",
         type=float,
-        default=0.0,
+        default=0.1,
     )
     parser.add_argument(
         "--feat-proj-layer-norm",
@@ -192,7 +191,7 @@ def add_model_arguments(parser: argparse.ArgumentParser):
     parser.add_argument(
         "--hidden-size",
         type=int,
-        default=1024,
+        default=768,
     )
     parser.add_argument(
         "--initializer-range",
@@ -202,7 +201,7 @@ def add_model_arguments(parser: argparse.ArgumentParser):
     parser.add_argument(
         "--intermediate-size",
         type=int,
-        default=4096,
+        default=3072,
     )
     parser.add_argument(
         "--layer-norm-eps",
@@ -247,7 +246,7 @@ def add_model_arguments(parser: argparse.ArgumentParser):
     parser.add_argument(
         "--num-attention-heads",
         type=int,
-        default=16,
+        default=12,
     )
     parser.add_argument(
         "--num-conv-pos-embedding-groups",
@@ -262,14 +261,7 @@ def add_model_arguments(parser: argparse.ArgumentParser):
     parser.add_argument(
         "--num-hidden-layers",
         type=int,
-        default=24,
-    )
-
-    parser.add_argument(
-        "--encoder-dim",
-        type=int,
-        default=1024,
-        help="Embedding dimension in encoder model.",
+        default=12,
     )
 
     parser.add_argument(
@@ -366,6 +358,14 @@ def get_parser():
         """,
     )
 
+    parser.add_argument(
+        "--pretrained-dir",
+        type=str,
+        default="download/hubert-base-ls960",
+        help="""The pretrained model dir.
+        It specifies the directory where the pretrained checkpoint is saved.""",
+    )
+
     parser.add_argument(
         "--bpe-model",
         type=str,
@@ -657,7 +657,7 @@ def get_decoder_model(params: AttributeDict) -> nn.Module:
 
 def get_joiner_model(params: AttributeDict) -> nn.Module:
     joiner = Joiner(
-        encoder_dim=params.encoder_dim,
+        encoder_dim=params.hidden_size,
         decoder_dim=params.decoder_dim,
         joiner_dim=params.joiner_dim,
         vocab_size=params.vocab_size,
@@ -685,7 +685,7 @@ def get_model(params: AttributeDict) -> nn.Module:
         encoder=encoder,
         decoder=decoder,
         joiner=joiner,
-        encoder_dim=params.encoder_dim,
+        encoder_dim=params.hidden_size,
         decoder_dim=params.decoder_dim,
         vocab_size=params.vocab_size,
         use_transducer=params.use_transducer,
@@ -731,6 +731,8 @@ def load_checkpoint_if_available(
     elif params.start_epoch > 1:
         filename = params.exp_dir / f"epoch-{params.start_epoch-1}.pt"
     else:
+        logging.info(f"Loading {params.pretrained_dir}")
+        model.encoder = HubertModel.from_pretrained(params.pretrained_dir)
         return None
 
     assert filename.is_file(), f"{filename} does not exist!"
diff --git a/egs/librispeech/SSL/hubert/joiner.py b/egs/librispeech/SSL/hubert/joiner.py
deleted file mode 100644
index dfb0a0057..000000000
--- a/egs/librispeech/SSL/hubert/joiner.py
+++ /dev/null
@@ -1,67 +0,0 @@
-# Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import torch.nn as nn
-from scaling import ScaledLinear
-
-
-class Joiner(nn.Module):
-    def __init__(
-        self,
-        encoder_dim: int,
-        decoder_dim: int,
-        joiner_dim: int,
-        vocab_size: int,
-    ):
-        super().__init__()
-
-        self.encoder_proj = ScaledLinear(encoder_dim, joiner_dim, initial_scale=0.25)
-        self.decoder_proj = ScaledLinear(decoder_dim, joiner_dim, initial_scale=0.25)
-        self.output_linear = nn.Linear(joiner_dim, vocab_size)
-
-    def forward(
-        self,
-        encoder_out: torch.Tensor,
-        decoder_out: torch.Tensor,
-        project_input: bool = True,
-    ) -> torch.Tensor:
-        """
-        Args:
-          encoder_out:
-            Output from the encoder. Its shape is (N, T, s_range, C).
-          decoder_out:
-            Output from the decoder. Its shape is (N, T, s_range, C).
-           project_input:
-            If true, apply input projections encoder_proj and decoder_proj.
-            If this is false, it is the user's responsibility to do this
-            manually.
-        Returns:
-          Return a tensor of shape (N, T, s_range, C).
-        """
-        assert encoder_out.ndim == decoder_out.ndim, (
-            encoder_out.shape,
-            decoder_out.shape,
-        )
-
-        if project_input:
-            logit = self.encoder_proj(encoder_out) + self.decoder_proj(decoder_out)
-        else:
-            logit = encoder_out + decoder_out
-
-        logit = self.output_linear(torch.tanh(logit))
-
-        return logit
diff --git a/egs/librispeech/SSL/hubert/joiner.py b/egs/librispeech/SSL/hubert/joiner.py
new file mode 120000
index 000000000..aa3362cda
--- /dev/null
+++ b/egs/librispeech/SSL/hubert/joiner.py
@@ -0,0 +1 @@
+../../ASR/zipformer/joiner.py
\ No newline at end of file
diff --git a/egs/librispeech/SSL/hubert/optim.py b/egs/librispeech/SSL/hubert/optim.py
deleted file mode 100644
index b83359a1a..000000000
--- a/egs/librispeech/SSL/hubert/optim.py
+++ /dev/null
@@ -1,1244 +0,0 @@
-#      Copyright      2022  Xiaomi Corp.        (authors: Daniel Povey)
-#
-# See ../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import contextlib
-import logging
-import random
-from collections import defaultdict
-from typing import Dict, List, Optional, Tuple, Union
-
-import torch
-from lhotse.utils import fix_random_seed
-from torch import Tensor, nn
-from torch.optim import Optimizer
-
-
-class BatchedOptimizer(Optimizer):
-    """
-    This class adds to class Optimizer the capability to optimize parameters in batches:
-    it will stack the parameters and their grads for you so the optimizer can work
-    on tensors with an extra leading dimension.  This is intended for speed with GPUs,
-    as it reduces the number of kernels launched in the optimizer.
-
-    Args:
-      params:
-    """
-
-    def __init__(self, params, defaults):
-        super(BatchedOptimizer, self).__init__(params, defaults)
-
-    @contextlib.contextmanager
-    def batched_params(self, param_group, group_params_names):
-        """
-        This function returns (technically, yields) a list of
-          of tuples (p, state), where
-        p is a `fake` parameter that is stacked (over axis 0) from real parameters
-        that share the same shape, and its gradient is also stacked;
-        `state` is the state corresponding to this batch of parameters
-        (it will be physically located in the "state" for one of the real
-        parameters, the last one that has any particular shape and dtype).
-
-        This function is decorated as a context manager so that it can
-        write parameters back to their "real" locations.
-
-        The idea is, instead of doing:
-        <code>
-          for p in group["params"]:
-             state = self.state[p]
-             ...
-        </code>
-        you can do:
-        <code>
-          with self.batched_params(group["params"]) as batches:
-             for p, state, p_names in batches:
-                 ...
-        </code>
-
-        Args:
-          group: a parameter group, which is a list of parameters; should be
-                one of self.param_groups.
-          group_params_names: name for each parameter in group,
-                which is List[str].
-        """
-        batches = defaultdict(
-            list
-        )  # `batches` maps from tuple (dtype_as_str,*shape) to list of nn.Parameter
-        batches_names = defaultdict(
-            list
-        )  # `batches` maps from tuple (dtype_as_str,*shape) to list of str
-
-        assert len(param_group) == len(group_params_names)
-        for p, named_p in zip(param_group, group_params_names):
-            key = (str(p.dtype), *p.shape)
-            batches[key].append(p)
-            batches_names[key].append(named_p)
-
-        batches_names_keys = list(batches_names.keys())
-        sorted_idx = sorted(
-            range(len(batches_names)), key=lambda i: batches_names_keys[i]
-        )
-        batches_names = [batches_names[batches_names_keys[idx]] for idx in sorted_idx]
-        batches = [batches[batches_names_keys[idx]] for idx in sorted_idx]
-
-        stacked_params_dict = dict()
-
-        # turn batches into a list, in deterministic order.
-        # tuples will contain tuples of (stacked_param, state, stacked_params_names),
-        # one for each batch in `batches`.
-        tuples = []
-
-        for batch, batch_names in zip(batches, batches_names):
-            p = batch[0]
-            # we arbitrarily store the state in the
-            # state corresponding to the 1st parameter in the
-            # group.  class Optimizer will take care of saving/loading state.
-            state = self.state[p]
-            p_stacked = torch.stack(batch)
-            grad = torch.stack(
-                [torch.zeros_like(p) if p.grad is None else p.grad for p in batch]
-            )
-            p_stacked.grad = grad
-            stacked_params_dict[key] = p_stacked
-            tuples.append((p_stacked, state, batch_names))
-
-        yield tuples  # <-- calling code will do the actual optimization here!
-
-        for (stacked_params, _state, _names), batch in zip(tuples, batches):
-            for i, p in enumerate(batch):  # batch is list of Parameter
-                p.copy_(stacked_params[i])
-
-
-class ScaledAdam(BatchedOptimizer):
-    """
-     Implements 'Scaled Adam', a variant of Adam where we scale each parameter's update
-     proportional to the norm of that parameter; and also learn the scale of the parameter,
-     in log space, subject to upper and lower limits (as if we had factored each parameter as
-     param = underlying_param * log_scale.exp())
-
-
-     Args:
-          params:  The parameters or param_groups to optimize (like other Optimizer subclasses)
-                   Unlike common optimizers, which accept model.parameters() or groups of parameters(),
-                   this optimizer could accept model.named_parameters() or groups of named_parameters().
-                   See comments of function _get_names_of_parameters for its 4 possible cases.
-              lr:  The learning rate.  We will typically use a learning rate schedule that starts
-                   at 0.03 and decreases over time, i.e. much higher than other common
-                   optimizers.
-     clipping_scale: (e.g. 2.0)
-                   A scale for gradient-clipping: if specified, the normalized gradients
-                   over the whole model will be clipped to have 2-norm equal to
-                   `clipping_scale` times the median 2-norm over the most recent period
-                   of `clipping_update_period` minibatches.  By "normalized gradients",
-                   we mean after multiplying by the rms parameter value for this tensor
-                   [for non-scalars]; this is appropriate because our update is scaled
-                   by this quantity.
-            betas: beta1,beta2 are momentum constants for regular momentum, and moving sum-sq grad.
-                   Must satisfy 0 < beta <= beta2 < 1.
-     scalar_lr_scale: A scaling factor on the learning rate, that we use to update the
-                   scale of each parameter tensor and scalar parameters of the mode..
-                   If each parameter were decomposed
-                   as p * p_scale.exp(), where (p**2).mean().sqrt() == 1.0, scalar_lr_scale
-                   would be a the scaling factor on the learning rate of p_scale.
-              eps:  A general-purpose epsilon to prevent division by zero
-    param_min_rms: Minimum root-mean-square value of parameter tensor, for purposes of
-                   learning the scale on the parameters (we'll constrain the rms of each non-scalar
-                   parameter tensor to be >= this value)
-    param_max_rms: Maximum root-mean-square value of parameter tensor, for purposes of
-                   learning the scale on the parameters (we'll constrain the rms of each non-scalar
-                   parameter tensor to be <= this value)
-       scalar_max: Maximum absolute value for scalar parameters (applicable if your
-                   model has any parameters with numel() == 1).
-    size_update_period: The periodicity, in steps, with which we update the size (scale)
-                   of the parameter tensor.  This is provided to save a little time
-                   in the update.
-     clipping_update_period: if clipping_scale is specified, this is the period
-    """
-
-    def __init__(
-        self,
-        params,
-        lr=3e-02,
-        clipping_scale=None,
-        betas=(0.9, 0.98),
-        scalar_lr_scale=0.1,
-        eps=1.0e-08,
-        param_min_rms=1.0e-05,
-        param_max_rms=3.0,
-        scalar_max=10.0,
-        size_update_period=4,
-        clipping_update_period=100,
-    ):
-        defaults = dict(
-            lr=lr,
-            clipping_scale=clipping_scale,
-            betas=betas,
-            scalar_lr_scale=scalar_lr_scale,
-            eps=eps,
-            param_min_rms=param_min_rms,
-            param_max_rms=param_max_rms,
-            scalar_max=scalar_max,
-            size_update_period=size_update_period,
-            clipping_update_period=clipping_update_period,
-        )
-
-        # If params only contains parameters or group of parameters,
-        # i.e when parameter names are not given,
-        # this flag will be set to False in funciton _get_names_of_parameters.
-        self.show_dominant_parameters = True
-        param_groups, parameters_names = self._get_names_of_parameters(params)
-        super(ScaledAdam, self).__init__(param_groups, defaults)
-        assert len(self.param_groups) == len(parameters_names)
-        self.parameters_names = parameters_names
-
-    def _get_names_of_parameters(
-        self, params_or_named_params
-    ) -> Tuple[List[Dict], List[List[str]]]:
-        """
-        Args:
-          params_or_named_params: according to the way ScaledAdam is initialized in train.py,
-            this argument could be one of following 4 cases,
-            case 1, a generator of parameter, e.g.:
-              optimizer = ScaledAdam(model.parameters(), lr=params.base_lr, clipping_scale=3.0)
-
-            case 2, a list of parameter groups with different config, e.g.:
-              model_param_groups = [
-                      {'params': model.encoder.parameters(), 'lr': 0.05},
-                      {'params': model.decoder.parameters(), 'lr': 0.01},
-                      {'params': model.joiner.parameters(), 'lr': 0.03},
-                      ]
-              optimizer = ScaledAdam(model_param_groups, lr=params.base_lr, clipping_scale=3.0)
-
-            case 3, a generator of named_parameter, e.g.:
-              optimizer = ScaledAdam(model.named_parameters(), lr=params.base_lr, clipping_scale=3.0)
-
-            case 4, a list of named_parameter groups with different config, e.g.:
-              model_named_param_groups = [
-                      {'named_params': model.encoder.named_parameters(), 'lr': 0.05},
-                      {'named_params': model.decoder.named_parameters(), 'lr': 0.01},
-                      {'named_params': model.joiner.named_parameters(), 'lr': 0.03},
-                      ]
-              optimizer = ScaledAdam(model_named_param_groups, lr=params.base_lr, clipping_scale=3.0)
-
-          For case 1 and case 2, input params is used to initialize the underlying torch.optimizer.
-          For case 3 and case 4, firstly, names and params are extracted from input named_params,
-            then, these extracted params are used to initialize the underlying torch.optimizer,
-            and these extracted names are mainly used by function
-            `_show_gradient_dominating_parameter`
-
-        Returns:
-          Returns a tuple containing 2 elements:
-            - `param_groups` with type List[Dict], each Dict element is a parameter group.
-              An example of `param_groups` could be:
-              [
-                  {'params': `one iterable of Parameter`, 'lr': 0.05},
-                  {'params': `another iterable of Parameter`, 'lr': 0.08},
-                  {'params': `a third iterable of Parameter`, 'lr': 0.1},
-              ]
-            - `param_gruops_names` with type List[List[str]],
-               each `List[str]` is for a group['params'] in param_groups,
-               and each `str` is the name of a parameter.
-               A dummy name "foo" is related to each parameter,
-               if input are params without names, i.e. case 1 or case 2.
-        """
-        # variable naming convention in this function:
-        #   p is short for param.
-        #   np is short for named_param.
-        #   p_or_np is short for param_or_named_param.
-        #   cur is short for current.
-        #   group is a dict, e.g. {'params': iterable of parameter, 'lr': 0.05, other fields}.
-        #   groups is a List[group]
-
-        iterable_or_groups = list(params_or_named_params)
-        if len(iterable_or_groups) == 0:
-            raise ValueError("optimizer got an empty parameter list")
-
-        # The first value of returned tuple.  A list of dicts containing at
-        # least 'params' as a key.
-        param_groups = []
-
-        # The second value of returned tuple,
-        # a List[List[str]], each sub-List is for a group.
-        param_groups_names = []
-
-        if not isinstance(iterable_or_groups[0], dict):
-            # case 1 or case 3,
-            # the input is an iterable of parameter or named parameter.
-            param_iterable_cur_group = []
-            param_names_cur_group = []
-            for p_or_np in iterable_or_groups:
-                if isinstance(p_or_np, tuple):
-                    # case 3
-                    name, param = p_or_np
-                else:
-                    # case 1
-                    assert isinstance(p_or_np, torch.Tensor)
-                    param = p_or_np
-                    # Assign a dummy name as a placeholder
-                    name = "foo"
-                    self.show_dominant_parameters = False
-                param_iterable_cur_group.append(param)
-                param_names_cur_group.append(name)
-            param_groups.append({"params": param_iterable_cur_group})
-            param_groups_names.append(param_names_cur_group)
-        else:
-            # case 2 or case 4
-            # the input is groups of parameter or named parameter.
-            for cur_group in iterable_or_groups:
-                assert "named_params" in cur_group
-                name_list = [x[0] for x in cur_group["named_params"]]
-                p_list = [x[1] for x in cur_group["named_params"]]
-                del cur_group["named_params"]
-                cur_group["params"] = p_list
-                param_groups.append(cur_group)
-                param_groups_names.append(name_list)
-
-        return param_groups, param_groups_names
-
-    def __setstate__(self, state):
-        super(ScaledAdam, self).__setstate__(state)
-
-    @torch.no_grad()
-    def step(self, closure=None):
-        """Performs a single optimization step.
-
-        Arguments:
-            closure (callable, optional): A closure that reevaluates the model
-                and returns the loss.
-        """
-        loss = None
-        if closure is not None:
-            with torch.enable_grad():
-                loss = closure()
-
-        batch = True
-
-        for group, group_params_names in zip(self.param_groups, self.parameters_names):
-            with self.batched_params(group["params"], group_params_names) as batches:
-                # batches is list of pairs (stacked_param, state).  stacked_param is like
-                # a regular parameter, and will have a .grad, but the 1st dim corresponds to
-                # a stacking dim, it is not a real dim.
-
-                if (
-                    len(batches[0][1]) == 0
-                ):  # if len(first state) == 0: not yet initialized
-                    clipping_scale = 1
-                else:
-                    clipping_scale = self._get_clipping_scale(group, batches)
-
-                for p, state, _ in batches:
-                    # Perform optimization step.
-                    # grad is not going to be None, we handled that when creating the batches.
-                    grad = p.grad
-                    if grad.is_sparse:
-                        raise RuntimeError(
-                            "ScaledAdam optimizer does not support sparse gradients"
-                        )
-                    # State initialization
-                    if len(state) == 0:
-                        self._init_state(group, p, state)
-
-                    self._step_one_batch(group, p, state, clipping_scale)
-
-        return loss
-
-    def _init_state(self, group: dict, p: Tensor, state: dict):
-        """
-        Initializes state dict for parameter 'p'.  Assumes that dim 0 of tensor p
-        is actually the batch dimension, corresponding to batched-together
-        parameters of a given shape.
-
-
-        Args:
-           group:   Dict to look up configuration values.
-               p: The parameter that we are initializing the state for
-           state: Dict from string to whatever state we are initializing
-        """
-        size_update_period = group["size_update_period"]
-
-        state["step"] = 0
-
-        kwargs = {"device": p.device, "dtype": p.dtype}
-
-        # 'delta' implements conventional momentum.  There are
-        # several different kinds of update going on, so rather than
-        # compute "exp_avg" like in Adam, we store and decay a
-        # parameter-change "delta", which combines all forms of
-        # update.  this is equivalent to how it's done in Adam,
-        # except for the first few steps.
-        state["delta"] = torch.zeros_like(p, memory_format=torch.preserve_format)
-
-        batch_size = p.shape[0]
-        numel = p.numel() // batch_size
-
-        if numel > 1:
-            # "param_rms" just periodically records the scalar root-mean-square value of
-            # the parameter tensor.
-            # it has a shape like (batch_size, 1, 1, 1, 1)
-            param_rms = (p**2).mean(dim=list(range(1, p.ndim)), keepdim=True).sqrt()
-            state["param_rms"] = param_rms
-
-            state["scale_exp_avg_sq"] = torch.zeros_like(param_rms)
-            state["scale_grads"] = torch.zeros(
-                size_update_period, *param_rms.shape, **kwargs
-            )
-
-        # exp_avg_sq is the weighted sum of scaled gradients. as in Adam.
-        state["exp_avg_sq"] = torch.zeros_like(p, memory_format=torch.preserve_format)
-
-    def _get_clipping_scale(
-        self, group: dict, tuples: List[Tuple[Tensor, dict, List[str]]]
-    ) -> float:
-        """
-        Returns a scalar factor <= 1.0 that dictates gradient clipping, i.e. we will scale the gradients
-        by this amount before applying the rest of the update.
-
-        Args:
-           group: the parameter group, an item in self.param_groups
-           tuples: a list of tuples of (param, state, param_names)
-                where param is a batched set of parameters,
-                with a .grad (1st dim is batch dim)
-                and state is the state-dict where optimization parameters are kept.
-                param_names is a List[str] while each str is name for a parameter
-                in batched set of parameters "param".
-        """
-        assert len(tuples) >= 1
-        clipping_scale = group["clipping_scale"]
-        (first_p, first_state, _) = tuples[0]
-        step = first_state["step"]
-        if clipping_scale is None or step == 0:
-            # no clipping.  return early on step == 0 because the other
-            # parameters' state won't have been initialized yet.
-            return 1.0
-        clipping_update_period = group["clipping_update_period"]
-        scalar_lr_scale = group["scalar_lr_scale"]
-
-        tot_sumsq = torch.tensor(0.0, device=first_p.device)
-        for p, state, param_names in tuples:
-            grad = p.grad
-            if grad.is_sparse:
-                raise RuntimeError(
-                    "ScaledAdam optimizer does not support sparse gradients"
-                )
-            if p.numel() == p.shape[0]:  # a batch of scalars
-                tot_sumsq += (grad**2).sum() * (
-                    scalar_lr_scale**2
-                )  # sum() to change shape [1] to []
-            else:
-                tot_sumsq += ((grad * state["param_rms"]) ** 2).sum()
-
-        tot_norm = tot_sumsq.sqrt()
-        if "model_norms" not in first_state:
-            first_state["model_norms"] = torch.zeros(
-                clipping_update_period, device=p.device
-            )
-        first_state["model_norms"][step % clipping_update_period] = tot_norm
-
-        irregular_estimate_steps = [
-            i for i in [10, 20, 40] if i < clipping_update_period
-        ]
-        if step % clipping_update_period == 0 or step in irregular_estimate_steps:
-            # Print some stats.
-            # We don't reach here if step == 0 because we would have returned
-            # above.
-            sorted_norms = first_state["model_norms"].sort()[0].to("cpu")
-            if step in irregular_estimate_steps:
-                sorted_norms = sorted_norms[-step:]
-            num_norms = sorted_norms.numel()
-            quartiles = []
-            for n in range(0, 5):
-                index = min(num_norms - 1, (num_norms // 4) * n)
-                quartiles.append(sorted_norms[index].item())
-
-            median = quartiles[2]
-            if median - median != 0:
-                raise RuntimeError("Too many grads were not finite")
-            threshold = clipping_scale * median
-            if step in irregular_estimate_steps:
-                # use larger thresholds on first few steps of estimating threshold,
-                # as norm may be changing rapidly.
-                threshold = threshold * 2.0
-            first_state["model_norm_threshold"] = threshold
-            percent_clipped = (
-                first_state["num_clipped"] * 100.0 / num_norms
-                if "num_clipped" in first_state
-                else 0.0
-            )
-            first_state["num_clipped"] = 0
-            quartiles = " ".join(["%.3e" % x for x in quartiles])
-            logging.warn(
-                f"Clipping_scale={clipping_scale}, grad-norm quartiles {quartiles}, "
-                f"threshold={threshold:.3e}, percent-clipped={percent_clipped:.1f}"
-            )
-
-        try:
-            model_norm_threshold = first_state["model_norm_threshold"]
-        except KeyError:
-            return 1.0  # threshold has not yet been set.
-
-        ans = min(1.0, (model_norm_threshold / (tot_norm + 1.0e-20)).item())
-        if ans != ans:  # e.g. ans is nan
-            ans = 0.0
-        if ans < 1.0:
-            first_state["num_clipped"] += 1
-        if ans < 0.1:
-            logging.warn(
-                f"Scaling gradients by {ans}, model_norm_threshold={model_norm_threshold}"
-            )
-            if self.show_dominant_parameters:
-                assert p.shape[0] == len(param_names)
-                self._show_gradient_dominating_parameter(
-                    tuples, tot_sumsq, group["scalar_lr_scale"]
-                )
-
-        if ans == 0.0:
-            for p, state, param_names in tuples:
-                p.grad.zero_()  # get rid of infinity()
-
-        return ans
-
-    def _show_gradient_dominating_parameter(
-        self,
-        tuples: List[Tuple[Tensor, dict, List[str]]],
-        tot_sumsq: Tensor,
-        scalar_lr_scale: float,
-    ):
-        """
-        Show information of parameter which dominates tot_sumsq.
-
-        Args:
-           tuples: a list of tuples of (param, state, param_names)
-                where param is a batched set of parameters,
-                with a .grad (1st dim is batch dim)
-                and state is the state-dict where optimization parameters are kept.
-                param_names is a List[str] while each str is name for a parameter
-                in batched set of parameters "param".
-            tot_sumsq: sumsq of all parameters. Though it's could be calculated
-                from tuples, we still pass it to save some time.
-        """
-        all_sumsq_orig = {}
-        for p, state, batch_param_names in tuples:
-            # p is a stacked batch parameters.
-            batch_grad = p.grad
-            if p.numel() == p.shape[0]:  # a batch of scalars
-                # Dummy values used by following `zip` statement.
-                batch_rms_orig = torch.full(
-                    p.shape, scalar_lr_scale, device=batch_grad.device
-                )
-            else:
-                batch_rms_orig = state["param_rms"]
-            batch_sumsq_orig = (batch_grad * batch_rms_orig) ** 2
-            if batch_grad.ndim > 1:
-                # need to guard it with if-statement because sum() sums over
-                # all dims if dim == ().
-                batch_sumsq_orig = batch_sumsq_orig.sum(
-                    dim=list(range(1, batch_grad.ndim))
-                )
-            for name, sumsq_orig, rms, grad in zip(
-                batch_param_names, batch_sumsq_orig, batch_rms_orig, batch_grad
-            ):
-                proportion_orig = sumsq_orig / tot_sumsq
-                all_sumsq_orig[name] = (proportion_orig, sumsq_orig, rms, grad)
-
-        sorted_by_proportion = {
-            k: v
-            for k, v in sorted(
-                all_sumsq_orig.items(), key=lambda item: item[1][0], reverse=True
-            )
-        }
-        dominant_param_name = next(iter(sorted_by_proportion))
-        (
-            dominant_proportion,
-            dominant_sumsq,
-            dominant_rms,
-            dominant_grad,
-        ) = sorted_by_proportion[dominant_param_name]
-        logging.warn(
-            f"Parameter dominating tot_sumsq {dominant_param_name}"
-            f" with proportion {dominant_proportion:.2f},"
-            f" where dominant_sumsq=(grad_sumsq*orig_rms_sq)"
-            f"={dominant_sumsq:.3e},"
-            f" grad_sumsq={(dominant_grad**2).sum():.3e},"
-            f" orig_rms_sq={(dominant_rms**2).item():.3e}"
-        )
-
-    def _step_one_batch(
-        self, group: dict, p: Tensor, state: dict, clipping_scale: float
-    ):
-        """
-        Do the step for one parameter, which is actually going to be a batch of
-        `real` parameters, with dim 0 as the batch dim.
-        Args:
-                  group:  dict to look up configuration values
-                    p: parameter to update (actually multiple parameters stacked together
-                       as a batch)
-                  state: state-dict for p, to look up the optimizer state
-        """
-        lr = group["lr"]
-        size_update_period = group["size_update_period"]
-        beta1 = group["betas"][0]
-
-        grad = p.grad
-        if clipping_scale != 1.0:
-            grad *= clipping_scale
-        step = state["step"]
-        delta = state["delta"]
-
-        delta.mul_(beta1)
-        batch_size = p.shape[0]
-        numel = p.numel() // batch_size
-        if numel > 1:
-            # Update the size/scale of p, and set param_rms
-            scale_grads = state["scale_grads"]
-            scale_grads[step % size_update_period] = (p * grad).sum(
-                dim=list(range(1, p.ndim)), keepdim=True
-            )
-            if step % size_update_period == size_update_period - 1:
-                param_rms = state["param_rms"]  # shape: (batch_size, 1, 1, ..)
-                param_rms.copy_(
-                    (p**2).mean(dim=list(range(1, p.ndim)), keepdim=True).sqrt()
-                )
-                if step > 0:
-                    # self._size_update() learns the overall scale on the
-                    # parameter, by shrinking or expanding it.
-                    self._size_update(group, scale_grads, p, state)
-
-        if numel == 1:
-            # For parameters with 1 element we just use regular Adam.
-            # Updates delta.
-            self._step_scalar(group, p, state)
-        else:
-            self._step(group, p, state)
-
-        state["step"] = step + 1
-
-    def _size_update(
-        self, group: dict, scale_grads: Tensor, p: Tensor, state: dict
-    ) -> None:
-        """
-               Called only where p.numel() > 1, this updates the scale of the parameter.
-               If we imagine: p =  underlying_param * scale.exp(), and we are doing
-               gradient descent on underlying param and on scale, this function does the update
-               on `scale`.
-
-               Args:
-              group: dict to look up configuration values
-        scale_grads: a tensor of shape (size_update_period, batch_size, 1, 1,...) containing
-                      grads w.r.t. the scales.
-                  p:  The parameter to update
-               state: The state-dict of p
-        """
-
-        param_rms = state["param_rms"]
-        beta1, beta2 = group["betas"]
-        size_lr = group["lr"] * group["scalar_lr_scale"]
-        param_min_rms = group["param_min_rms"]
-        param_max_rms = group["param_max_rms"]
-        eps = group["eps"]
-        step = state["step"]
-        batch_size = p.shape[0]
-
-        size_update_period = scale_grads.shape[0]
-        # correct beta2 for the size update period: we will have
-        # faster decay at this level.
-        beta2_corr = beta2**size_update_period
-
-        scale_exp_avg_sq = state["scale_exp_avg_sq"]  # shape: (batch_size, 1, 1, ..)
-        scale_exp_avg_sq.mul_(beta2_corr).add_(
-            (scale_grads**2).mean(dim=0),  # mean over dim `size_update_period`
-            alpha=1 - beta2_corr,
-        )  # shape is (batch_size, 1, 1, ...)
-
-        # The 1st time we reach here is when size_step == 1.
-        size_step = (step + 1) // size_update_period
-        bias_correction2 = 1 - beta2_corr**size_step
-        # we don't bother with bias_correction1; this will help prevent divergence
-        # at the start of training.
-
-        denom = scale_exp_avg_sq.sqrt() + eps
-
-        scale_step = (
-            -size_lr * (bias_correction2**0.5) * scale_grads.sum(dim=0) / denom
-        )
-
-        is_too_small = param_rms < param_min_rms
-
-        # when the param gets too small, just don't shrink it any further.
-        scale_step.masked_fill_(is_too_small, 0.0)
-
-        # and ensure the parameter rms after update never exceeds param_max_rms.
-        # We have to look at the trained model for parameters at or around the
-        # param_max_rms, because sometimes they can indicate a problem with the
-        # topology or settings.
-        scale_step = torch.minimum(scale_step, (param_max_rms - param_rms) / param_rms)
-
-        delta = state["delta"]
-        # the factor of (1-beta1) relates to momentum.
-        delta.add_(p * scale_step, alpha=(1 - beta1))
-
-    def _step(self, group: dict, p: Tensor, state: dict):
-        """
-        This function does the core update of self.step(), in the case where the members of
-        the batch have more than 1 element.
-
-        Args:
-            group: A dict which will be used to look up configuration values
-                p: The parameter to be updated
-             grad: The grad of p
-            state: The state-dict corresponding to parameter p
-
-        This function modifies p.
-        """
-        grad = p.grad
-        lr = group["lr"]
-        beta1, beta2 = group["betas"]
-        eps = group["eps"]
-        param_min_rms = group["param_min_rms"]
-        step = state["step"]
-
-        exp_avg_sq = state["exp_avg_sq"]
-        exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=(1 - beta2))
-
-        this_step = state["step"] - (state["zero_step"] if "zero_step" in state else 0)
-        bias_correction2 = 1 - beta2 ** (this_step + 1)
-        if bias_correction2 < 0.99:
-            # note: not in-place.
-            exp_avg_sq = exp_avg_sq * (1.0 / bias_correction2)
-
-        denom = exp_avg_sq.sqrt()
-        denom += eps
-        grad = grad / denom
-
-        alpha = -lr * (1 - beta1) * state["param_rms"].clamp(min=param_min_rms)
-
-        delta = state["delta"]
-        delta.add_(grad * alpha)
-        p.add_(delta)
-
-    def _step_scalar(self, group: dict, p: Tensor, state: dict):
-        """
-        A simplified form of the core update for scalar tensors, where we cannot get a good
-        estimate of the parameter rms.
-        """
-        beta1, beta2 = group["betas"]
-        scalar_max = group["scalar_max"]
-        eps = group["eps"]
-        lr = group["lr"] * group["scalar_lr_scale"]
-        grad = p.grad
-
-        exp_avg_sq = state["exp_avg_sq"]  # shape: (batch_size,)
-        exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
-
-        # bias_correction2 is like in Adam.  Don't bother with bias_correction1;
-        # slower update at the start will help stability anyway.
-        bias_correction2 = 1 - beta2 ** (state["step"] + 1)
-        denom = (exp_avg_sq / bias_correction2).sqrt() + eps
-
-        delta = state["delta"]
-        delta.add_(grad / denom, alpha=-lr * (1 - beta1))
-        p.clamp_(min=-scalar_max, max=scalar_max)
-        p.add_(delta)
-
-
-class LRScheduler(object):
-    """
-    Base-class for learning rate schedulers where the learning-rate depends on both the
-    batch and the epoch.
-    """
-
-    def __init__(self, optimizer: Optimizer, verbose: bool = False):
-        # Attach optimizer
-        if not isinstance(optimizer, Optimizer):
-            raise TypeError("{} is not an Optimizer".format(type(optimizer).__name__))
-        self.optimizer = optimizer
-        self.verbose = verbose
-
-        for group in optimizer.param_groups:
-            group.setdefault("base_lr", group["lr"])
-
-        self.base_lrs = [group["base_lr"] for group in optimizer.param_groups]
-
-        self.epoch = 0
-        self.batch = 0
-
-    def state_dict(self):
-        """Returns the state of the scheduler as a :class:`dict`.
-
-        It contains an entry for every variable in self.__dict__ which
-        is not the optimizer.
-        """
-        return {
-            "base_lrs": self.base_lrs,
-            "epoch": self.epoch,
-            "batch": self.batch,
-        }
-
-    def load_state_dict(self, state_dict):
-        """Loads the schedulers state.
-
-        Args:
-            state_dict (dict): scheduler state. Should be an object returned
-                from a call to :meth:`state_dict`.
-        """
-        self.__dict__.update(state_dict)
-
-    def get_last_lr(self) -> List[float]:
-        """Return last computed learning rate by current scheduler.  Will be a list of float."""
-        return self._last_lr
-
-    def get_lr(self):
-        # Compute list of learning rates from self.epoch and self.batch and
-        # self.base_lrs; this must be overloaded by the user.
-        # e.g. return [some_formula(self.batch, self.epoch, base_lr) for base_lr in self.base_lrs ]
-        raise NotImplementedError
-
-    def step_batch(self, batch: Optional[int] = None) -> None:
-        # Step the batch index, or just set it.  If `batch` is specified, it
-        # must be the batch index from the start of training, i.e. summed over
-        # all epochs.
-        # You can call this in any order; if you don't provide 'batch', it should
-        # of course be called once per batch.
-        if batch is not None:
-            self.batch = batch
-        else:
-            self.batch = self.batch + 1
-        self._set_lrs()
-
-    def step_epoch(self, epoch: Optional[int] = None):
-        # Step the epoch index, or just set it.  If you provide the 'epoch' arg,
-        # you should call this at the start of the epoch; if you don't provide the 'epoch'
-        # arg, you should call it at the end of the epoch.
-        if epoch is not None:
-            self.epoch = epoch
-        else:
-            self.epoch = self.epoch + 1
-        self._set_lrs()
-
-    def _set_lrs(self):
-        values = self.get_lr()
-        assert len(values) == len(self.optimizer.param_groups)
-
-        for i, data in enumerate(zip(self.optimizer.param_groups, values)):
-            param_group, lr = data
-            param_group["lr"] = lr
-            self.print_lr(self.verbose, i, lr)
-        self._last_lr = [group["lr"] for group in self.optimizer.param_groups]
-
-    def print_lr(self, is_verbose, group, lr):
-        """Display the current learning rate."""
-        if is_verbose:
-            logging.warn(
-                f"Epoch={self.epoch}, batch={self.batch}: adjusting learning rate"
-                f" of group {group} to {lr:.4e}."
-            )
-
-
-class Eden(LRScheduler):
-    """
-    Eden scheduler.
-    The basic formula (before warmup) is:
-      lr = base_lr * (((batch**2 + lr_batches**2) / lr_batches**2) ** -0.25 *
-                     (((epoch**2 + lr_epochs**2) / lr_epochs**2) ** -0.25)) * warmup
-    where `warmup` increases from linearly 0.5 to 1 over `warmup_batches` batches
-    and then stays constant at 1.
-
-    If you don't have the concept of epochs, or one epoch takes a very long time,
-    you can replace the notion of 'epoch' with some measure of the amount of data
-    processed, e.g. hours of data or frames of data, with 'lr_epochs' being set to
-    some measure representing "quite a lot of data": say, one fifth or one third
-    of an entire training run, but it doesn't matter much.  You could also use
-    Eden2 which has only the notion of batches.
-
-    We suggest base_lr = 0.04 (passed to optimizer) if used with ScaledAdam
-
-    Args:
-        optimizer: the optimizer to change the learning rates on
-        lr_batches: the number of batches after which we start significantly
-              decreasing the learning rate, suggest 5000.
-        lr_epochs: the number of epochs after which we start significantly
-              decreasing the learning rate, suggest 6 if you plan to do e.g.
-              20 to 40 epochs, but may need smaller number if dataset is huge
-              and you will do few epochs.
-    """
-
-    def __init__(
-        self,
-        optimizer: Optimizer,
-        lr_batches: Union[int, float],
-        lr_epochs: Union[int, float],
-        warmup_batches: Union[int, float] = 500.0,
-        warmup_start: float = 0.5,
-        verbose: bool = False,
-    ):
-        super(Eden, self).__init__(optimizer, verbose)
-        self.lr_batches = lr_batches
-        self.lr_epochs = lr_epochs
-        self.warmup_batches = warmup_batches
-
-        assert 0.0 <= warmup_start <= 1.0, warmup_start
-        self.warmup_start = warmup_start
-
-    def get_lr(self):
-        factor = (
-            (self.batch**2 + self.lr_batches**2) / self.lr_batches**2
-        ) ** -0.25 * (
-            ((self.epoch**2 + self.lr_epochs**2) / self.lr_epochs**2) ** -0.25
-        )
-        warmup_factor = (
-            1.0
-            if self.batch >= self.warmup_batches
-            else self.warmup_start
-            + (1.0 - self.warmup_start) * (self.batch / self.warmup_batches)
-            # else 0.5 + 0.5 * (self.batch / self.warmup_batches)
-        )
-
-        return [x * factor * warmup_factor for x in self.base_lrs]
-
-
-class Eden2(LRScheduler):
-    """
-    Eden2 scheduler, simpler than Eden because it does not use the notion of epoch,
-    only batches.
-
-    The basic formula (before warmup) is:
-      lr = base_lr * ((batch**2 + lr_batches**2) / lr_batches**2) ** -0.5) * warmup
-
-    where `warmup` increases from linearly 0.5 to 1 over `warmup_batches` batches
-    and then stays constant at 1.
-
-
-     E.g. suggest base_lr = 0.04 (passed to optimizer) if used with ScaledAdam
-
-    Args:
-        optimizer: the optimizer to change the learning rates on
-        lr_batches: the number of batches after which we start significantly
-              decreasing the learning rate, suggest 5000.
-    """
-
-    def __init__(
-        self,
-        optimizer: Optimizer,
-        lr_batches: Union[int, float],
-        warmup_batches: Union[int, float] = 500.0,
-        warmup_start: float = 0.5,
-        verbose: bool = False,
-    ):
-        super().__init__(optimizer, verbose)
-        self.lr_batches = lr_batches
-        self.warmup_batches = warmup_batches
-
-        assert 0.0 <= warmup_start <= 1.0, warmup_start
-        self.warmup_start = warmup_start
-
-    def get_lr(self):
-        factor = (
-            (self.batch**2 + self.lr_batches**2) / self.lr_batches**2
-        ) ** -0.5
-        warmup_factor = (
-            1.0
-            if self.batch >= self.warmup_batches
-            else self.warmup_start
-            + (1.0 - self.warmup_start) * (self.batch / self.warmup_batches)
-            # else 0.5 + 0.5 * (self.batch / self.warmup_batches)
-        )
-
-        return [x * factor * warmup_factor for x in self.base_lrs]
-
-
-def _test_eden():
-    m = torch.nn.Linear(100, 100)
-    optim = ScaledAdam(m.parameters(), lr=0.03)
-
-    scheduler = Eden(optim, lr_batches=100, lr_epochs=2, verbose=True)
-
-    for epoch in range(10):
-        scheduler.step_epoch(epoch)  # sets epoch to `epoch`
-
-        for step in range(20):
-            x = torch.randn(200, 100).detach()
-            x.requires_grad = True
-            y = m(x)
-            dy = torch.randn(200, 100).detach()
-            f = (y * dy).sum()
-            f.backward()
-
-            optim.step()
-            scheduler.step_batch()
-            optim.zero_grad()
-
-    logging.info(f"last lr = {scheduler.get_last_lr()}")
-    logging.info(f"state dict = {scheduler.state_dict()}")
-
-
-# This is included mostly as a baseline for ScaledAdam.
-class Eve(Optimizer):
-    """
-    Implements Eve algorithm.  This is a modified version of AdamW with a special
-    way of setting the weight-decay / shrinkage-factor, which is designed to make the
-    rms of the parameters approach a particular target_rms (default: 0.1).  This is
-    for use with networks with 'scaled' versions of modules (see scaling.py), which
-    will be close to invariant to the absolute scale on the parameter matrix.
-
-    The original Adam algorithm was proposed in `Adam: A Method for Stochastic Optimization`_.
-    The AdamW variant was proposed in `Decoupled Weight Decay Regularization`_.
-    Eve is unpublished so far.
-
-    Arguments:
-        params (iterable): iterable of parameters to optimize or dicts defining
-            parameter groups
-        lr (float, optional): learning rate (default: 1e-3)
-        betas (Tuple[float, float], optional): coefficients used for computing
-            running averages of gradient and its square (default: (0.9, 0.999))
-        eps (float, optional): term added to the denominator to improve
-            numerical stability (default: 1e-8)
-        weight_decay (float, optional): weight decay coefficient (default: 3e-4;
-            this value means that the weight would decay significantly after
-            about 3k minibatches.  Is not multiplied by learning rate, but
-            is conditional on RMS-value of parameter being > target_rms.
-        target_rms (float, optional): target root-mean-square value of
-           parameters, if they fall below this we will stop applying weight decay.
-
-
-    .. _Adam: A Method for Stochastic Optimization:
-        https://arxiv.org/abs/1412.6980
-    .. _Decoupled Weight Decay Regularization:
-        https://arxiv.org/abs/1711.05101
-    .. _On the Convergence of Adam and Beyond:
-        https://openreview.net/forum?id=ryQu7f-RZ
-    """
-
-    def __init__(
-        self,
-        params,
-        lr=1e-3,
-        betas=(0.9, 0.98),
-        eps=1e-8,
-        weight_decay=1e-3,
-        target_rms=0.1,
-    ):
-        if not 0.0 <= lr:
-            raise ValueError("Invalid learning rate: {}".format(lr))
-        if not 0.0 <= eps:
-            raise ValueError("Invalid epsilon value: {}".format(eps))
-        if not 0.0 <= betas[0] < 1.0:
-            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
-        if not 0.0 <= betas[1] < 1.0:
-            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
-        if not 0 <= weight_decay <= 0.1:
-            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
-        if not 0 < target_rms <= 10.0:
-            raise ValueError("Invalid target_rms value: {}".format(target_rms))
-        defaults = dict(
-            lr=lr,
-            betas=betas,
-            eps=eps,
-            weight_decay=weight_decay,
-            target_rms=target_rms,
-        )
-        super(Eve, self).__init__(params, defaults)
-
-    def __setstate__(self, state):
-        super(Eve, self).__setstate__(state)
-
-    @torch.no_grad()
-    def step(self, closure=None):
-        """Performs a single optimization step.
-
-        Arguments:
-            closure (callable, optional): A closure that reevaluates the model
-                and returns the loss.
-        """
-        loss = None
-        if closure is not None:
-            with torch.enable_grad():
-                loss = closure()
-
-        for group in self.param_groups:
-            for p in group["params"]:
-                if p.grad is None:
-                    continue
-
-                # Perform optimization step
-                grad = p.grad
-                if grad.is_sparse:
-                    raise RuntimeError("AdamW does not support sparse gradients")
-
-                state = self.state[p]
-
-                # State initialization
-                if len(state) == 0:
-                    state["step"] = 0
-                    # Exponential moving average of gradient values
-                    state["exp_avg"] = torch.zeros_like(
-                        p, memory_format=torch.preserve_format
-                    )
-                    # Exponential moving average of squared gradient values
-                    state["exp_avg_sq"] = torch.zeros_like(
-                        p, memory_format=torch.preserve_format
-                    )
-
-                exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"]
-
-                beta1, beta2 = group["betas"]
-
-                state["step"] += 1
-                bias_correction1 = 1 - beta1 ** state["step"]
-                bias_correction2 = 1 - beta2 ** state["step"]
-
-                # Decay the first and second moment running average coefficient
-                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
-                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
-                denom = (exp_avg_sq.sqrt() * (bias_correction2**-0.5)).add_(
-                    group["eps"]
-                )
-
-                step_size = group["lr"] / bias_correction1
-                target_rms = group["target_rms"]
-                weight_decay = group["weight_decay"]
-
-                if p.numel() > 1:
-                    # avoid applying this weight-decay on "scaling factors"
-                    # (which are scalar).
-                    is_above_target_rms = p.norm() > (target_rms * (p.numel() ** 0.5))
-                    p.mul_(1 - (weight_decay * is_above_target_rms))
-
-                p.addcdiv_(exp_avg, denom, value=-step_size)
-
-                if random.random() < 0.0005:
-                    step = (exp_avg / denom) * step_size
-                    logging.info(
-                        f"Delta rms = {(step**2).mean().item()}, shape = {step.shape}"
-                    )
-
-        return loss
-
-
-def _test_scaled_adam(hidden_dim: int):
-    import timeit
-
-    from scaling import ScaledLinear
-
-    E = 100
-    B = 4
-    T = 2
-    logging.info("in test_eve_cain")
-    # device = torch.device('cuda')
-    device = torch.device("cpu")
-    dtype = torch.float32
-
-    fix_random_seed(42)
-    # these input_magnitudes and output_magnitudes are to test that
-    # Abel is working as we expect and is able to adjust scales of
-    # different dims differently.
-    input_magnitudes = (1.0 * torch.randn(E, dtype=dtype, device=device)).exp()
-    output_magnitudes = (1.0 * torch.randn(E, dtype=dtype, device=device)).exp()
-
-    for iter in [1, 0]:
-        fix_random_seed(42)
-        Linear = torch.nn.Linear if iter == 0 else ScaledLinear
-
-        m = torch.nn.Sequential(
-            Linear(E, hidden_dim),
-            torch.nn.PReLU(),
-            Linear(hidden_dim, hidden_dim),
-            torch.nn.PReLU(),
-            Linear(hidden_dim, E),
-        ).to(device)
-
-        train_pairs = [
-            (
-                100.0
-                * torch.randn(B, T, E, device=device, dtype=dtype)
-                * input_magnitudes,
-                torch.randn(B, T, E, device=device, dtype=dtype) * output_magnitudes,
-            )
-            for _ in range(20)
-        ]
-
-        if iter == 0:
-            optim = Eve(m.parameters(), lr=0.003)
-        elif iter == 1:
-            optim = ScaledAdam(m.parameters(), lr=0.03, clipping_scale=2.0)
-        scheduler = Eden(optim, lr_batches=200, lr_epochs=5, verbose=False)
-
-        start = timeit.default_timer()
-        avg_loss = 0.0
-        for epoch in range(180):
-            scheduler.step_epoch()
-            # if epoch == 100 and iter in [2,3]:
-            #    optim.reset_speedup()  # check it doesn't crash.
-
-            # if epoch == 130:
-            #    opts = diagnostics.TensorDiagnosticOptions(
-            #        512
-            #    )  # allow 4 megabytes per sub-module
-            #    diagnostic = diagnostics.attach_diagnostics(m, opts)
-
-            for n, (x, y) in enumerate(train_pairs):
-                y_out = m(x)
-                loss = ((y_out - y) ** 2).mean() * 100.0
-                if epoch == 0 and n == 0:
-                    avg_loss = loss.item()
-                else:
-                    avg_loss = 0.98 * avg_loss + 0.02 * loss.item()
-                if n == 0 and epoch % 5 == 0:
-                    # norm1 = '%.2e' % (m[0].weight**2).mean().sqrt().item()
-                    # norm1b = '%.2e' % (m[0].bias**2).mean().sqrt().item()
-                    # norm2 = '%.2e' % (m[2].weight**2).mean().sqrt().item()
-                    # norm2b = '%.2e' % (m[2].bias**2).mean().sqrt().item()
-                    # scale1 = '%.2e' % (m[0].weight_scale.exp().item())
-                    # scale1b = '%.2e' % (m[0].bias_scale.exp().item())
-                    # scale2 = '%.2e' % (m[2].weight_scale.exp().item())
-                    # scale2b = '%.2e' % (m[2].bias_scale.exp().item())
-                    lr = scheduler.get_last_lr()[0]
-                    logging.info(
-                        f"Iter {iter}, epoch {epoch}, batch {n}, avg_loss {avg_loss:.4g}, lr={lr:.4e}"
-                    )  # , norms={norm1,norm1b,norm2,norm2b}") # scales={scale1,scale1b,scale2,scale2b}
-                loss.log().backward()
-                optim.step()
-                optim.zero_grad()
-                scheduler.step_batch()
-
-        # diagnostic.print_diagnostics()
-
-        stop = timeit.default_timer()
-        logging.info(f"Iter={iter}, Time taken: {stop - start}")
-
-        logging.info(f"last lr = {scheduler.get_last_lr()}")
-        # logging.info("state dict = ", scheduler.state_dict())
-        # logging.info("optim state_dict = ", optim.state_dict())
-        logging.info(f"input_magnitudes = {input_magnitudes}")
-        logging.info(f"output_magnitudes = {output_magnitudes}")
-
-
-if __name__ == "__main__":
-    torch.set_num_threads(1)
-    torch.set_num_interop_threads(1)
-    logging.getLogger().setLevel(logging.INFO)
-    import subprocess
-
-    s = subprocess.check_output(
-        "git status -uno .; git log -1; git diff HEAD .", shell=True
-    )
-    logging.info(s)
-    import sys
-
-    if len(sys.argv) > 1:
-        hidden_dim = int(sys.argv[1])
-    else:
-        hidden_dim = 200
-
-    _test_scaled_adam(hidden_dim)
-    _test_eden()
diff --git a/egs/librispeech/SSL/hubert/optim.py b/egs/librispeech/SSL/hubert/optim.py
new file mode 120000
index 000000000..56b827b8a
--- /dev/null
+++ b/egs/librispeech/SSL/hubert/optim.py
@@ -0,0 +1 @@
+../../ASR/zipformer/optim.py
\ No newline at end of file
diff --git a/egs/librispeech/SSL/hubert/scaling.py b/egs/librispeech/SSL/hubert/scaling.py
deleted file mode 100644
index 29ac33c02..000000000
--- a/egs/librispeech/SSL/hubert/scaling.py
+++ /dev/null
@@ -1,1908 +0,0 @@
-# Copyright    2022-2023  Xiaomi Corp.        (authors: Daniel Povey)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import logging
-import math
-import random
-from typing import Optional, Tuple, Union
-
-import k2
-import torch
-import torch.nn as nn
-from torch import Tensor
-from torch.cuda.amp import custom_bwd, custom_fwd
-
-
-def logaddexp_onnx(x: Tensor, y: Tensor) -> Tensor:
-    max_value = torch.max(x, y)
-    diff = torch.abs(x - y)
-    return max_value + torch.log1p(torch.exp(-diff))
-
-
-# RuntimeError: Exporting the operator logaddexp to ONNX opset version
-# 14 is not supported. Please feel free to request support or submit
-# a pull request on PyTorch GitHub.
-#
-# The following function is to solve the above error when exporting
-# models to ONNX via torch.jit.trace()
-def logaddexp(x: Tensor, y: Tensor) -> Tensor:
-    # Caution(fangjun): Put torch.jit.is_scripting() before
-    # torch.onnx.is_in_onnx_export();
-    # otherwise, it will cause errors for torch.jit.script().
-    #
-    # torch.logaddexp() works for both torch.jit.script() and
-    # torch.jit.trace() but it causes errors for ONNX export.
-    #
-    if torch.jit.is_scripting():
-        # Note: We cannot use torch.jit.is_tracing() here as it also
-        # matches torch.onnx.export().
-        return torch.logaddexp(x, y)
-    elif torch.onnx.is_in_onnx_export():
-        return logaddexp_onnx(x, y)
-    else:
-        # for torch.jit.trace()
-        return torch.logaddexp(x, y)
-
-
-class PiecewiseLinear(object):
-    """
-    Piecewise linear function, from float to float, specified as nonempty list of (x,y) pairs with
-    the x values in order.  x values <[initial x] or >[final x] are map to [initial y], [final y]
-    respectively.
-    """
-
-    def __init__(self, *args):
-        assert len(args) >= 1, len(args)
-        if len(args) == 1 and isinstance(args[0], PiecewiseLinear):
-            self.pairs = list(args[0].pairs)
-        else:
-            self.pairs = [(float(x), float(y)) for x, y in args]
-        for x, y in self.pairs:
-            assert isinstance(x, (float, int)), type(x)
-            assert isinstance(y, (float, int)), type(y)
-
-        for i in range(len(self.pairs) - 1):
-            assert self.pairs[i + 1][0] > self.pairs[i][0], (
-                i,
-                self.pairs[i],
-                self.pairs[i + 1],
-            )
-
-    def __str__(self):
-        # e.g. 'PiecewiseLinear((0., 10.), (100., 0.))'
-        return f"PiecewiseLinear({str(self.pairs)[1:-1]})"
-
-    def __call__(self, x):
-        if x <= self.pairs[0][0]:
-            return self.pairs[0][1]
-        elif x >= self.pairs[-1][0]:
-            return self.pairs[-1][1]
-        else:
-            cur_x, cur_y = self.pairs[0]
-            for i in range(1, len(self.pairs)):
-                next_x, next_y = self.pairs[i]
-                if x >= cur_x and x <= next_x:
-                    return cur_y + (next_y - cur_y) * (x - cur_x) / (next_x - cur_x)
-                cur_x, cur_y = next_x, next_y
-            assert False
-
-    def __mul__(self, alpha):
-        return PiecewiseLinear(*[(x, y * alpha) for x, y in self.pairs])
-
-    def __add__(self, x):
-        if isinstance(x, (float, int)):
-            return PiecewiseLinear(*[(p[0], p[1] + x) for p in self.pairs])
-        s, x = self.get_common_basis(x)
-        return PiecewiseLinear(
-            *[(sp[0], sp[1] + xp[1]) for sp, xp in zip(s.pairs, x.pairs)]
-        )
-
-    def max(self, x):
-        if isinstance(x, (float, int)):
-            x = PiecewiseLinear((0, x))
-        s, x = self.get_common_basis(x, include_crossings=True)
-        return PiecewiseLinear(
-            *[(sp[0], max(sp[1], xp[1])) for sp, xp in zip(s.pairs, x.pairs)]
-        )
-
-    def min(self, x):
-        if isinstance(x, float) or isinstance(x, int):
-            x = PiecewiseLinear((0, x))
-        s, x = self.get_common_basis(x, include_crossings=True)
-        return PiecewiseLinear(
-            *[(sp[0], min(sp[1], xp[1])) for sp, xp in zip(s.pairs, x.pairs)]
-        )
-
-    def __eq__(self, other):
-        return self.pairs == other.pairs
-
-    def get_common_basis(self, p: "PiecewiseLinear", include_crossings: bool = False):
-        """
-        Returns (self_mod, p_mod) which are equivalent piecewise linear
-        functions to self and p, but with the same x values.
-
-          p: the other piecewise linear function
-          include_crossings: if true, include in the x values positions
-              where the functions indicate by this and p crosss.
-        """
-        assert isinstance(p, PiecewiseLinear), type(p)
-
-        # get sorted x-values without repetition.
-        x_vals = sorted(set([x for x, _ in self.pairs] + [x for x, _ in p.pairs]))
-        y_vals1 = [self(x) for x in x_vals]
-        y_vals2 = [p(x) for x in x_vals]
-
-        if include_crossings:
-            extra_x_vals = []
-            for i in range(len(x_vals) - 1):
-                if (y_vals1[i] > y_vals2[i]) != (y_vals1[i + 1] > y_vals2[i + 1]):
-                    # if the two lines in this subsegment potentially cross each other..
-                    diff_cur = abs(y_vals1[i] - y_vals2[i])
-                    diff_next = abs(y_vals1[i + 1] - y_vals2[i + 1])
-                    # `pos`, between 0 and 1, gives the relative x position,
-                    # with 0 being x_vals[i] and 1 being x_vals[i+1].
-                    pos = diff_cur / (diff_cur + diff_next)
-                    extra_x_val = x_vals[i] + pos * (x_vals[i + 1] - x_vals[i])
-                    extra_x_vals.append(extra_x_val)
-            if len(extra_x_vals) > 0:
-                x_vals = sorted(set(x_vals + extra_x_vals))
-        y_vals1 = [self(x) for x in x_vals]
-        y_vals2 = [p(x) for x in x_vals]
-        return (
-            PiecewiseLinear(*zip(x_vals, y_vals1)),
-            PiecewiseLinear(*zip(x_vals, y_vals2)),
-        )
-
-
-class ScheduledFloat(torch.nn.Module):
-    """
-    This object is a torch.nn.Module only because we want it to show up in [top_level module].modules();
-    it does not have a working forward() function.  You are supposed to cast it to float, as
-    in, float(parent_module.whatever), and use it as something like a dropout prob.
-
-    It is a floating point value whose value changes depending on the batch count of the
-    training loop.  It is a piecewise linear function where you specify the (x,y) pairs
-    in sorted order on x; x corresponds to the batch index.  For batch-index values before the
-    first x or after the last x, we just use the first or last y value.
-
-    Example:
-       self.dropout = ScheduledFloat((0.0, 0.2), (4000.0, 0.0), default=0.0)
-
-    `default` is used when self.batch_count is not set or not in training mode or in
-     torch.jit scripting mode.
-    """
-
-    def __init__(self, *args, default: float = 0.0):
-        super().__init__()
-        # self.batch_count and self.name will be written to in the training loop.
-        self.batch_count = None
-        self.name = None
-        self.default = default
-        self.schedule = PiecewiseLinear(*args)
-
-    def extra_repr(self) -> str:
-        return (
-            f"batch_count={self.batch_count}, schedule={str(self.schedule.pairs[1:-1])}"
-        )
-
-    def __float__(self):
-        batch_count = self.batch_count
-        if (
-            batch_count is None
-            or not self.training
-            or torch.jit.is_scripting()
-            or torch.jit.is_tracing()
-        ):
-            return float(self.default)
-        else:
-            ans = self.schedule(self.batch_count)
-            if random.random() < 0.0002:
-                logging.info(
-                    f"ScheduledFloat: name={self.name}, batch_count={self.batch_count}, ans={ans}"
-                )
-            return ans
-
-    def __add__(self, x):
-        if isinstance(x, float) or isinstance(x, int):
-            return ScheduledFloat(self.schedule + x, default=self.default)
-        else:
-            return ScheduledFloat(
-                self.schedule + x.schedule, default=self.default + x.default
-            )
-
-    def max(self, x):
-        if isinstance(x, float) or isinstance(x, int):
-            return ScheduledFloat(self.schedule.max(x), default=self.default)
-        else:
-            return ScheduledFloat(
-                self.schedule.max(x.schedule), default=max(self.default, x.default)
-            )
-
-
-FloatLike = Union[float, ScheduledFloat]
-
-
-def random_cast_to_half(x: Tensor, min_abs: float = 5.0e-06) -> Tensor:
-    """
-    A randomized way of casting a floating point value to half precision.
-    """
-    if x.dtype == torch.float16:
-        return x
-    x_abs = x.abs()
-    is_too_small = x_abs < min_abs
-    # for elements where is_too_small is true, random_val will contain +-min_abs with
-    # probability (x.abs() / min_abs), and 0.0 otherwise.  [so this preserves expectations,
-    # for those elements].
-    random_val = min_abs * x.sign() * (torch.rand_like(x) * min_abs < x_abs)
-    return torch.where(is_too_small, random_val, x).to(torch.float16)
-
-
-class CutoffEstimator:
-    """
-    Estimates cutoffs of an arbitrary numerical quantity such that a specified
-    proportion of items will be above the cutoff on average.
-
-      p is the proportion of items that should be above the cutoff.
-    """
-
-    def __init__(self, p: float):
-        self.p = p
-        # total count of items
-        self.count = 0
-        # total count of items that were above the cutoff
-        self.count_above = 0
-        # initial cutoff value
-        self.cutoff = 0
-
-    def __call__(self, x: float) -> bool:
-        """
-        Returns true if x is above the cutoff.
-        """
-        ans = x > self.cutoff
-        self.count += 1
-        if ans:
-            self.count_above += 1
-        cur_p = self.count_above / self.count
-        delta_p = cur_p - self.p
-        if (delta_p > 0) == ans:
-            q = abs(delta_p)
-            self.cutoff = x * q + self.cutoff * (1 - q)
-        return ans
-
-
-class SoftmaxFunction(torch.autograd.Function):
-    """
-    Tries to handle half-precision derivatives in a randomized way that should
-    be more accurate for training than the default behavior.
-    """
-
-    @staticmethod
-    def forward(ctx, x: Tensor, dim: int):
-        ans = x.softmax(dim=dim)
-        # if x dtype is float16, x.softmax() returns a float32 because
-        # (presumably) that op does not support float16, and autocast
-        # is enabled.
-        if torch.is_autocast_enabled():
-            ans = ans.to(torch.float16)
-        ctx.save_for_backward(ans)
-        ctx.x_dtype = x.dtype
-        ctx.dim = dim
-        return ans
-
-    @staticmethod
-    def backward(ctx, ans_grad: Tensor):
-        (ans,) = ctx.saved_tensors
-        with torch.cuda.amp.autocast(enabled=False):
-            ans_grad = ans_grad.to(torch.float32)
-            ans = ans.to(torch.float32)
-            x_grad = ans_grad * ans
-            x_grad = x_grad - ans * x_grad.sum(dim=ctx.dim, keepdim=True)
-            return x_grad, None
-
-
-def softmax(x: Tensor, dim: int):
-    if not x.requires_grad or torch.jit.is_scripting() or torch.jit.is_tracing():
-        return x.softmax(dim=dim)
-
-    return SoftmaxFunction.apply(x, dim)
-
-
-class MaxEigLimiterFunction(torch.autograd.Function):
-    @staticmethod
-    def forward(
-        ctx,
-        x: Tensor,
-        coeffs: Tensor,
-        direction: Tensor,
-        channel_dim: int,
-        grad_scale: float,
-    ) -> Tensor:
-        ctx.channel_dim = channel_dim
-        ctx.grad_scale = grad_scale
-        ctx.save_for_backward(x.detach(), coeffs.detach(), direction.detach())
-        return x
-
-    @staticmethod
-    def backward(ctx, x_grad, *args):
-        with torch.enable_grad():
-            (x_orig, coeffs, new_direction) = ctx.saved_tensors
-            x_orig.requires_grad = True
-            num_channels = x_orig.shape[ctx.channel_dim]
-            x = x_orig.transpose(ctx.channel_dim, -1).reshape(-1, num_channels)
-            new_direction.requires_grad = False
-            x = x - x.mean(dim=0)
-            x_var = (x**2).mean()
-            x_residual = x - coeffs * new_direction
-            x_residual_var = (x_residual**2).mean()
-            # `variance_proportion` is the proportion of the variance accounted for
-            # by the top eigen-direction.  This is to be minimized.
-            variance_proportion = (x_var - x_residual_var) / (x_var + 1.0e-20)
-            variance_proportion.backward()
-        x_orig_grad = x_orig.grad
-        x_extra_grad = (
-            x_orig.grad
-            * ctx.grad_scale
-            * x_grad.norm()
-            / (x_orig_grad.norm() + 1.0e-20)
-        )
-        return x_grad + x_extra_grad.detach(), None, None, None, None
-
-
-class BiasNormFunction(torch.autograd.Function):
-    # This computes:
-    #   scales = (torch.mean((x - bias) ** 2, keepdim=True)) ** -0.5 * log_scale.exp()
-    #   return x * scales
-    # (after unsqueezing the bias), but it does it in a memory-efficient way so that
-    # it can just store the returned value (chances are, this will also be needed for
-    # some other reason, related to the next operation, so we can save memory).
-    @staticmethod
-    def forward(
-        ctx,
-        x: Tensor,
-        bias: Tensor,
-        log_scale: Tensor,
-        channel_dim: int,
-        store_output_for_backprop: bool,
-    ) -> Tensor:
-        assert bias.ndim == 1
-        if channel_dim < 0:
-            channel_dim = channel_dim + x.ndim
-        ctx.store_output_for_backprop = store_output_for_backprop
-        ctx.channel_dim = channel_dim
-        for _ in range(channel_dim + 1, x.ndim):
-            bias = bias.unsqueeze(-1)
-        scales = (
-            torch.mean((x - bias) ** 2, dim=channel_dim, keepdim=True) ** -0.5
-        ) * log_scale.exp()
-        ans = x * scales
-        ctx.save_for_backward(
-            ans.detach() if store_output_for_backprop else x,
-            scales.detach(),
-            bias.detach(),
-            log_scale.detach(),
-        )
-        return ans
-
-    @staticmethod
-    def backward(ctx, ans_grad: Tensor) -> Tensor:
-        ans_or_x, scales, bias, log_scale = ctx.saved_tensors
-        if ctx.store_output_for_backprop:
-            x = ans_or_x / scales
-        else:
-            x = ans_or_x
-        x = x.detach()
-        x.requires_grad = True
-        bias.requires_grad = True
-        log_scale.requires_grad = True
-        with torch.enable_grad():
-            # recompute scales from x, bias and log_scale.
-            scales = (
-                torch.mean((x - bias) ** 2, dim=ctx.channel_dim, keepdim=True) ** -0.5
-            ) * log_scale.exp()
-            ans = x * scales
-            ans.backward(gradient=ans_grad)
-        return x.grad, bias.grad.flatten(), log_scale.grad, None, None
-
-
-class BiasNorm(torch.nn.Module):
-    """
-    This is intended to be a simpler, and hopefully cheaper, replacement for
-    LayerNorm.  The observation this is based on, is that Transformer-type
-    networks, especially with pre-norm, sometimes seem to set one of the
-    feature dimensions to a large constant value (e.g. 50), which "defeats"
-    the LayerNorm because the output magnitude is then not strongly dependent
-    on the other (useful) features.  Presumably the weight and bias of the
-    LayerNorm are required to allow it to do this.
-
-    Instead, we give the BiasNorm a trainable bias that it can use when
-    computing the scale for normalization.  We also give it a (scalar)
-    trainable scale on the output.
-
-
-    Args:
-       num_channels: the number of channels, e.g. 512.
-       channel_dim: the axis/dimension corresponding to the channel,
-         interpreted as an offset from the input's ndim if negative.
-         This is NOT the num_channels; it should typically be one of
-         {-2, -1, 0, 1, 2, 3}.
-      log_scale: the initial log-scale that we multiply the output by; this
-         is learnable.
-      log_scale_min: FloatLike, minimum allowed value of log_scale
-      log_scale_max: FloatLike, maximum allowed value of log_scale
-      store_output_for_backprop: only possibly affects memory use; recommend
-         to set to True if you think the output of this module is more likely
-         than the input of this module to be required to be stored for the
-         backprop.
-    """
-
-    def __init__(
-        self,
-        num_channels: int,
-        channel_dim: int = -1,  # CAUTION: see documentation.
-        log_scale: float = 1.0,
-        log_scale_min: float = -1.5,
-        log_scale_max: float = 1.5,
-        store_output_for_backprop: bool = False,
-    ) -> None:
-        super(BiasNorm, self).__init__()
-        self.num_channels = num_channels
-        self.channel_dim = channel_dim
-        self.log_scale = nn.Parameter(torch.tensor(log_scale))
-        self.bias = nn.Parameter(torch.zeros(num_channels))
-
-        self.log_scale_min = log_scale_min
-        self.log_scale_max = log_scale_max
-
-        self.store_output_for_backprop = store_output_for_backprop
-
-    def forward(self, x: Tensor) -> Tensor:
-        assert x.shape[self.channel_dim] == self.num_channels
-
-        if torch.jit.is_scripting() or torch.jit.is_tracing():
-            channel_dim = self.channel_dim
-            if channel_dim < 0:
-                channel_dim += x.ndim
-            bias = self.bias
-            for _ in range(channel_dim + 1, x.ndim):
-                bias = bias.unsqueeze(-1)
-            scales = (
-                torch.mean((x - bias) ** 2, dim=channel_dim, keepdim=True) ** -0.5
-            ) * self.log_scale.exp()
-            return x * scales
-
-        log_scale = limit_param_value(
-            self.log_scale,
-            min=float(self.log_scale_min),
-            max=float(self.log_scale_max),
-            training=self.training,
-        )
-
-        return BiasNormFunction.apply(
-            x, self.bias, log_scale, self.channel_dim, self.store_output_for_backprop
-        )
-
-
-def ScaledLinear(*args, initial_scale: float = 1.0, **kwargs) -> nn.Linear:
-    """
-    Behaves like a constructor of a modified version of nn.Linear
-    that gives an easy way to set the default initial parameter scale.
-
-    Args:
-        Accepts the standard args and kwargs that nn.Linear accepts
-        e.g. in_features, out_features, bias=False.
-
-        initial_scale: you can override this if you want to increase
-           or decrease the initial magnitude of the module's output
-           (affects the initialization of weight_scale and bias_scale).
-           Another option, if you want to do something like this, is
-           to re-initialize the parameters.
-    """
-    ans = nn.Linear(*args, **kwargs)
-    with torch.no_grad():
-        ans.weight[:] *= initial_scale
-        if ans.bias is not None:
-            torch.nn.init.uniform_(ans.bias, -0.1 * initial_scale, 0.1 * initial_scale)
-    return ans
-
-
-def ScaledConv1d(*args, initial_scale: float = 1.0, **kwargs) -> nn.Conv1d:
-    """
-    Construct an ordinary nn.Conv1d and rescale its freshly initialized
-    parameters, giving an easy handle on the initial output magnitude.
-
-    Args:
-        *args, **kwargs: forwarded unchanged to nn.Conv1d,
-           e.g. in_channels, out_channels, kernel_size, bias=False.
-
-        initial_scale: the default-initialized weight is multiplied by this
-           value; the bias (if there is one) is re-drawn uniformly from
-           [-0.1 * initial_scale, 0.1 * initial_scale].
-
-    Returns:
-        The nn.Conv1d module, with its parameters modified in place.
-    """
-    conv = nn.Conv1d(*args, **kwargs)
-    bias_bound = 0.1 * initial_scale
-    with torch.no_grad():
-        conv.weight.mul_(initial_scale)
-        if conv.bias is not None:
-            conv.bias.uniform_(-bias_bound, bias_bound)
-    return conv
-
-
-def ScaledConv2d(*args, initial_scale: float = 1.0, **kwargs) -> nn.Conv2d:
-    """
-    Construct an ordinary nn.Conv2d and rescale its freshly initialized
-    parameters, giving an easy handle on the initial output magnitude.
-
-    Args:
-        *args, **kwargs: forwarded unchanged to nn.Conv2d,
-           e.g. in_channels, out_channels, kernel_size, bias=False.
-           NOTE: this helper was historically documented as taking
-           NO PADDING-RELATED ARGS; nothing in this function checks that,
-           every argument is simply passed through.
-
-        initial_scale: the default-initialized weight is multiplied by this
-           value; the bias (if there is one) is re-drawn uniformly from
-           [-0.1 * initial_scale, 0.1 * initial_scale].
-
-    Returns:
-        The nn.Conv2d module, with its parameters modified in place.
-    """
-    conv = nn.Conv2d(*args, **kwargs)
-    bias_bound = 0.1 * initial_scale
-    with torch.no_grad():
-        conv.weight.mul_(initial_scale)
-        if conv.bias is not None:
-            conv.bias.uniform_(-bias_bound, bias_bound)
-    return conv
-
-
-class ChunkCausalDepthwiseConv1d(torch.nn.Module):
-    """
-    Behaves like a depthwise 1d convolution, except that it is causal in
-    a chunkwise way, as if we had a block-triangular attention mask.
-    The chunk size is provided at test time (it should probably be
-    kept in sync with the attention mask).
-
-    This has a little more than twice the parameters of a conventional
-    depthwise conv1d module: we implement it by having one
-    depthwise convolution, of half the width, that is causal (via
-    left-padding); and one depthwise convolution that is applied only
-    within chunks, that we multiply by a scaling factor which depends
-    on the position within the chunk.
-
-    Args:
-        channels: number of input channels; also the number of output
-           channels and of convolution groups (the convs are depthwise).
-        kernel_size: width of the chunkwise convolution; must be odd.  The
-           causal convolution uses (kernel_size + 1) // 2.
-        initial_scale: you can override this if you want to increase
-           or decrease the initial magnitude of the module's output
-           (the default-initialized conv weights are multiplied by it).
-        bias: whether the chunkwise convolution has a bias.  The causal
-           convolution always has one; when `bias` is true that bias is
-           re-drawn uniformly from [-0.1 * initial_scale, 0.1 * initial_scale].
-    """
-
-    def __init__(
-        self,
-        channels: int,
-        kernel_size: int,
-        initial_scale: float = 1.0,
-        bias: bool = True,
-    ):
-        super().__init__()
-        assert kernel_size % 2 == 1
-
-        half_kernel_size = (kernel_size + 1) // 2
-        # will pad manually, on one side.
-        self.causal_conv = nn.Conv1d(
-            in_channels=channels,
-            out_channels=channels,
-            groups=channels,
-            kernel_size=half_kernel_size,
-            padding=0,
-            bias=True,
-        )
-
-        self.chunkwise_conv = nn.Conv1d(
-            in_channels=channels,
-            out_channels=channels,
-            groups=channels,
-            kernel_size=kernel_size,
-            padding=kernel_size // 2,
-            bias=bias,
-        )
-
-        # first row is correction factors added to the scale near the left edge of the chunk,
-        # second row is correction factors added to the scale near the right edge of the chunk,
-        # both of these are added to a default scale of 1.0.
-        self.chunkwise_conv_scale = nn.Parameter(torch.zeros(2, channels, kernel_size))
-        self.kernel_size = kernel_size
-
-        with torch.no_grad():
-            self.causal_conv.weight[:] *= initial_scale
-            self.chunkwise_conv.weight[:] *= initial_scale
-            # NOTE: `bias` gates the re-initialization of the *causal* conv's
-            # bias (which always exists); the chunkwise conv's bias, if any,
-            # keeps its default initialization.
-            if bias:
-                torch.nn.init.uniform_(
-                    self.causal_conv.bias, -0.1 * initial_scale, 0.1 * initial_scale
-                )
-
-    def forward(self, x: Tensor, chunk_size: int = -1) -> Tensor:
-        """
-        Forward function.
-
-        Args:
-            x: a Tensor of shape (batch_size, channels, seq_len)
-            chunk_size: the chunk size, in frames; does not have to divide
-               seq_len exactly.  A negative value, or a value larger than
-               seq_len, means "treat the whole sequence as one chunk".
-
-        Returns:
-            a Tensor of shape (batch_size, channels, seq_len).
-        """
-        (batch_size, num_channels, seq_len) = x.shape
-
-        # left_pad is half_kernel_size - 1 where half_kernel_size is the size used
-        # in the causal conv.  It's the amount by which we must pad on the left,
-        # to make the convolution causal.
-        left_pad = self.kernel_size // 2
-
-        if chunk_size < 0 or chunk_size > seq_len:
-            chunk_size = seq_len
-        # pad on the right up to a whole number of chunks.
-        right_pad = -seq_len % chunk_size
-
-        x = torch.nn.functional.pad(x, (left_pad, right_pad))
-
-        # the causal conv must not see the right padding.
-        x_causal = self.causal_conv(x[..., : left_pad + seq_len])
-        assert x_causal.shape == (batch_size, num_channels, seq_len)
-
-        # fold the chunks into the batch dim so the (symmetrically padded)
-        # chunkwise conv cannot look across chunk boundaries.
-        x_chunk = x[..., left_pad:]
-        num_chunks = x_chunk.shape[2] // chunk_size
-        x_chunk = x_chunk.reshape(batch_size, num_channels, num_chunks, chunk_size)
-        x_chunk = x_chunk.permute(0, 2, 1, 3).reshape(
-            batch_size * num_chunks, num_channels, chunk_size
-        )
-        x_chunk = self.chunkwise_conv(x_chunk)  # does not change shape
-
-        chunk_scale = self._get_chunk_scale(chunk_size)
-
-        x_chunk = x_chunk * chunk_scale
-        x_chunk = x_chunk.reshape(
-            batch_size, num_chunks, num_channels, chunk_size
-        ).permute(0, 2, 1, 3)
-        # un-fold the chunks and drop the right padding again.
-        x_chunk = x_chunk.reshape(batch_size, num_channels, num_chunks * chunk_size)[
-            ..., :seq_len
-        ]
-
-        return x_chunk + x_causal
-
-    def _get_chunk_scale(self, chunk_size: int):
-        """Returns tensor of shape (num_channels, chunk_size) that will be used to
-        scale the output of self.chunkwise_conv."""
-        left_edge = self.chunkwise_conv_scale[0]
-        right_edge = self.chunkwise_conv_scale[1]
-        if chunk_size < self.kernel_size:
-            # chunk shorter than the kernel: the two edge corrections overlap.
-            left_edge = left_edge[:, :chunk_size]
-            right_edge = right_edge[:, -chunk_size:]
-        else:
-            # zero correction in the interior of the chunk.
-            t = chunk_size - self.kernel_size
-            channels = left_edge.shape[0]
-            pad = torch.zeros(
-                channels, t, device=left_edge.device, dtype=left_edge.dtype
-            )
-            left_edge = torch.cat((left_edge, pad), dim=-1)
-            right_edge = torch.cat((pad, right_edge), dim=-1)
-        return 1.0 + (left_edge + right_edge)
-
-    def streaming_forward(
-        self,
-        x: Tensor,
-        cache: Tensor,
-    ) -> Tuple[Tensor, Tensor]:
-        """Streaming Forward function.
-
-        Args:
-            x: a Tensor of shape (batch_size, channels, seq_len)
-            cache: cached left context of shape (batch_size, channels, left_pad)
-
-        Returns:
-            a tuple (output, new_cache): output has the same shape as x, and
-            new_cache holds the last left_pad frames seen so far, with the
-            same shape as `cache`.
-        """
-        (batch_size, num_channels, seq_len) = x.shape
-
-        # left_pad is half_kernel_size - 1 where half_kernel_size is the size used
-        # in the causal conv.  It's the amount by which we must pad on the left,
-        # to make the convolution causal.
-        left_pad = self.kernel_size // 2
-
-        # Pad cache
-        assert cache.shape[-1] == left_pad, (cache.shape[-1], left_pad)
-        x = torch.cat([cache, x], dim=2)
-        # Update cache.  Slice with an explicit start index: `x[..., -left_pad:]`
-        # would return the *whole* tensor when left_pad == 0 (kernel_size == 1).
-        cache = x[..., x.shape[-1] - left_pad :]
-
-        x_causal = self.causal_conv(x)
-        assert x_causal.shape == (batch_size, num_channels, seq_len)
-
-        # the whole of the new input is treated as a single chunk.
-        x_chunk = x[..., left_pad:]
-        x_chunk = self.chunkwise_conv(x_chunk)  # does not change shape
-
-        chunk_scale = self._get_chunk_scale(chunk_size=seq_len)
-        x_chunk = x_chunk * chunk_scale
-
-        return x_chunk + x_causal, cache
-
-
-class BalancerFunction(torch.autograd.Function):
-    """
-    Autograd function that is the identity in the forward pass and, in the
-    backward pass, adds to the incoming gradient a term derived from a
-    per-channel loss on (mean / stddev) and on the RMS of the saved input.
-    """
-
-    @staticmethod
-    def forward(
-        ctx,
-        x: Tensor,
-        min_mean: float,
-        max_mean: float,
-        min_rms: float,
-        max_rms: float,
-        grad_scale: float,
-        channel_dim: int,
-    ) -> Tensor:
-        # Returns x unchanged; only stashes x and the limits for backward().
-        # normalize a negative channel_dim to a non-negative axis index.
-        if channel_dim < 0:
-            channel_dim += x.ndim
-        ctx.channel_dim = channel_dim
-        ctx.save_for_backward(x)
-        ctx.config = (min_mean, max_mean, min_rms, max_rms, grad_scale, channel_dim)
-        return x
-
-    @staticmethod
-    def backward(ctx, x_grad: Tensor) -> Tuple[Tensor, None, None, None, None, None]:
-        # NOTE(review): the annotation above lists 6 elements but the return
-        # statement yields 7 (one per input of forward()).
-        (x,) = ctx.saved_tensors
-        (min_mean, max_mean, min_rms, max_rms, grad_scale, channel_dim) = ctx.config
-
-        # Any failure below is logged and the unmodified x_grad is returned.
-        try:
-            with torch.enable_grad():
-                with torch.cuda.amp.autocast(enabled=False):
-                    # work on a float32 leaf copy of x so we can differentiate
-                    # the auxiliary loss w.r.t. it.
-                    x = x.to(torch.float32)
-                    x = x.detach()
-                    x.requires_grad = True
-                    # statistics are taken over every axis except the channel axis.
-                    mean_dims = [i for i in range(x.ndim) if i != channel_dim]
-                    uncentered_var = (x**2).mean(dim=mean_dims, keepdim=True)
-                    mean = x.mean(dim=mean_dims, keepdim=True)
-                    stddev = (uncentered_var - (mean * mean)).clamp(min=1.0e-20).sqrt()
-                    rms = uncentered_var.clamp(min=1.0e-20).sqrt()
-
-                    m = mean / stddev
-                    # part of loss that relates to mean / stddev
-                    m_loss = (m - m.clamp(min=min_mean, max=max_mean)).abs()
-
-                    # part of loss that relates to the RMS: |log(clamped_rms / rms)|,
-                    # which is zero while rms is inside [min_rms, max_rms].
-                    # NOTE(review): an earlier comment here mentioned a larger scale
-                    # on the RMS loss; the code below adds the two terms unweighted.
-                    rms_clamped = rms.clamp(min=min_rms, max=max_rms)
-                    r_loss = (rms_clamped / rms).log().abs()
-
-                    loss = m_loss + r_loss
-
-                    loss.backward(gradient=torch.ones_like(loss))
-                    loss_grad = x.grad
-                    # per-channel RMS of the loss gradient, used to normalize it.
-                    loss_grad_rms = (
-                        (loss_grad**2)
-                        .mean(dim=mean_dims, keepdim=True)
-                        .sqrt()
-                        .clamp(min=1.0e-20)
-                    )
-
-                    loss_grad = loss_grad * (grad_scale / loss_grad_rms)
-
-                    x_grad_float = x_grad.to(torch.float32)
-                    # scale each element of loss_grad by the absolute value of the corresponding
-                    # element of x_grad, which we view as a noisy estimate of its magnitude for that
-                    # (frame and dimension).  later we can consider factored versions.
-                    x_grad_mod = x_grad_float + (x_grad_float.abs() * loss_grad)
-                    x_grad = x_grad_mod.to(x_grad.dtype)
-        except Exception as e:
-            logging.info(
-                f"Caught exception in Balancer backward: {e}, size={list(x_grad.shape)}, will continue."
-            )
-
-        # gradient for x, and None for each of the six non-tensor inputs.
-        return x_grad, None, None, None, None, None, None
-
-
-class Balancer(torch.nn.Module):
-    """
-    Identity in the forward pass.  With probability `prob` per call, the
-    output is routed through BalancerFunction, which modifies the backpropped
-    derivatives using limits on (mean / stddev) and on the RMS of each channel.
-    Those limits are derived here from the limits the caller specifies on the
-    proportion of positive values and on the average absolute value.
-
-    Args:
-           num_channels: the number of channels (only used for an assertion).
-           channel_dim: the dimension/axis corresponding to the channel, e.g.
-               -1, 0, 1, 2; will be interpreted as an offset from x.ndim if negative.
-           min_positive: the minimum, per channel, of the proportion of the time
-               that (x > 0), below which we start to modify the derivatives.
-           max_positive: the maximum, per channel, of the proportion of the time
-               that (x > 0), above which we start to modify the derivatives.
-           min_abs:  the minimum average-absolute-value per channel, which we
-               allow, before we start to modify the derivatives to prevent this.
-           max_abs:  the maximum average-absolute-value per channel, which we
-               allow, before we start to modify the derivatives to prevent this.
-           grad_scale: passed through to BalancerFunction as the scale of the
-               gradient modification.
-           prob: the probability with which we modify the gradients on each
-               forward(); if None, a ScheduledFloat default is used.
-    """
-
-    def __init__(
-        self,
-        num_channels: int,
-        channel_dim: int,
-        min_positive: FloatLike = 0.05,
-        max_positive: FloatLike = 0.95,
-        min_abs: FloatLike = 0.2,
-        max_abs: FloatLike = 100.0,
-        grad_scale: FloatLike = 0.04,
-        prob: Optional[FloatLike] = None,
-    ):
-        super().__init__()
-
-        self.prob = (
-            ScheduledFloat((0.0, 0.5), (8000.0, 0.125), default=0.4)
-            if prob is None
-            else prob
-        )
-        # 5% of the time we will return and do nothing because memory usage is
-        # too high.
-        self.mem_cutoff = CutoffEstimator(0.05)
-
-        # actually self.num_channels is no longer needed except for an assertion.
-        self.num_channels = num_channels
-        self.channel_dim = channel_dim
-        self.min_positive = min_positive
-        self.max_positive = max_positive
-        self.min_abs = min_abs
-        self.max_abs = max_abs
-        self.grad_scale = grad_scale
-
-    def forward(self, x: Tensor) -> Tensor:
-        if (
-            torch.jit.is_scripting()
-            or not x.requires_grad
-            or (x.is_cuda and self.mem_cutoff(torch.cuda.memory_allocated()))
-        ):
-            return _no_op(x)
-
-        apply_prob = float(self.prob)
-        if not (random.random() < apply_prob):
-            return _no_op(x)
-
-        # The helpers below convert from the way we historically specified
-        # these limitations, as limits on the absolute value and the proportion
-        # of positive values, to limits on the RMS value and the (mean / stddev).
-
-        def _abs_to_rms(mean_abs):
-            # for normally distributed data, if the expected absolute value is x,
-            # the expected rms value will be sqrt(pi/2) * x.
-            return 1.25331413732 * mean_abs
-
-        def _proportion_positive_to_mean(proportion):
-            # first convert from the range 0..1 to the range -1..1 which the
-            # error function returns
-            y = -1 + (2 * proportion)
-            # eps is to prevent crashes if y is exactly -1 or 1; we'll just
-            # end up returning a fairly large value.
-            eps = 1.0e-10
-            atanh_y = (math.log(1 + y + eps) - math.log(1 - y + eps)) / 2.0
-            # crude approximate inverse of erf: 1 / (sqrt(pi) * ln(2)) * atanh(y),
-            # see https://math.stackexchange.com/questions/321569/approximating-the-error-function-erf-by-analytical-functions
-            # it gets progressively worse for y very close to -1 or +1, but we
-            # mostly care about the "middle" region.
-            return 0.8139535143 * atanh_y
-
-        min_mean = _proportion_positive_to_mean(float(self.min_positive))
-        max_mean = _proportion_positive_to_mean(float(self.max_positive))
-        min_rms = _abs_to_rms(float(self.min_abs))
-        max_rms = _abs_to_rms(float(self.max_abs))
-        grad_scale = float(self.grad_scale)
-
-        assert x.shape[self.channel_dim] == self.num_channels
-
-        return BalancerFunction.apply(
-            x, min_mean, max_mean, min_rms, max_rms, grad_scale, self.channel_dim
-        )
-
-
-def penalize_abs_values_gt(
-    x: Tensor, limit: float, penalty: float, name: str = None
-) -> Tensor:
-    """
-    Returns x unmodified, but in backprop adds a penalty on the amount by
-    which the absolute values of elements of x exceed `limit`.  E.g. if
-    limit == 10.0, then any element of x with absolute value over 10 is
-    penalized.
-
-    Caution: the value of this penalty will be affected by grad scaling used
-    in automatic mixed precision training.  For the purposes we use this for,
-    it shouldn't really matter, or may even be helpful; we just use this
-    to disallow really implausible values of scores to be given to softmax.
-
-    Args:
-        x: the tensor to penalize.
-        limit: absolute value above which elements are penalized.
-        penalty: scale of the penalty.
-        name: passed to with_loss(); used for randomly printed debug info.
-
-    Returns:
-        x; you must use the returned value for something, or this will be
-        ineffective.
-    """
-    exceeds_limit = (x.abs() - limit) > 0
-    signs = x.sign()
-    # The following is a memory efficient way to penalize the absolute values of
-    # x that's over the limit.  (The memory efficiency comes when you think
-    # about which items torch needs to cache for the autograd, and which ones it
-    # can throw away).  The numerical value of aux_loss as computed here will
-    # actually be larger than it should be, by limit * over_limit.sum(), but it
-    # has the same derivative as the real aux_loss which is penalty * (x.abs() -
-    # limit).relu().
-    signed_mask = (signs * exceeds_limit).to(torch.int8)
-    aux_loss = penalty * (signed_mask * x)
-    # note: we don't do sum() here on aux_loss, but it's as if we had done
-    # sum() due to how with_loss() works.
-    return with_loss(x, aux_loss, name)
-
-
-def _diag(x: Tensor):
-    """Like .diag(), but also works for tensors with 3 dims, where it returns
-    the diagonal of each (dim, dim) matrix in the batch, shape (batch, dim)."""
-    if x.ndim == 2:
-        return x.diag()
-    batch, _, dim = x.shape
-    # in the flattened matrix, diagonal entries sit every (dim + 1) elements.
-    flat = x.reshape(batch, dim * dim)
-    diagonal = flat[:, :: dim + 1]
-    assert diagonal.shape == (batch, dim)
-    return diagonal
-
-
-def _whitening_metric(x: Tensor, num_groups: int):
-    """
-    Computes the "whitening metric", a value which will be 1.0 if all the
-    eigenvalues of the centered feature covariance are the same within each
-    group's covariance matrix and also between groups.
-
-    Args:
-        x: a Tensor of shape (*, num_channels); must not be float16.
-        num_groups: the number of groups of channels, a number >=1 that
-           divides num_channels.
-
-    Returns:
-        A scalar Tensor that will be 1.0 if the data is "perfectly white" and
-        greater than 1.0 otherwise.
-    """
-    assert x.dtype != torch.float16
-    num_channels = x.shape[-1]
-    frames = x.reshape(-1, num_channels)
-    num_frames = frames.shape[0]
-    assert num_channels % num_groups == 0
-    channels_per_group = num_channels // num_groups
-    # grouped: (num_groups, num_frames, channels_per_group)
-    grouped = frames.reshape(num_frames, num_groups, channels_per_group).transpose(
-        0, 1
-    )
-    # subtract the mean so we use the centered, not uncentered, covariance.
-    # My experience has been that when we "mess with the gradients" like this,
-    # it's better not do anything that tries to move the mean around, because
-    # that can easily cause instability.
-    centered = grouped - grouped.mean(dim=1, keepdim=True)
-    # covar: (num_groups, channels_per_group, channels_per_group)
-    covar = torch.matmul(centered.transpose(1, 2), centered)
-    mean_diag = _diag(covar).mean()
-    # the following expression is what we'd get if we took the matrix product
-    # of each covariance and measured the mean of its trace, i.e.
-    # the same as _diag(torch.matmul(covar, covar)).mean().
-    mean_diag_of_square = (covar**2).sum() / (num_groups * channels_per_group)
-    # this metric will be >= 1.0; the larger it is, the less 'white' the data was.
-    return mean_diag_of_square / (mean_diag**2 + 1.0e-20)
-
-
-class WhiteningPenaltyFunction(torch.autograd.Function):
-    """
-    Autograd function that is the identity in the forward pass and, in the
-    backward pass, adds the gradient of the whitening metric of the saved
-    input to the incoming gradient whenever that metric is at or above
-    `module.whitening_limit`.
-    """
-
-    @staticmethod
-    def forward(ctx, x: Tensor, module: nn.Module) -> Tensor:
-        # `module` supplies num_groups, whitening_limit, grad_scale, name and
-        # min_prob / max_prob, all of which are read in backward().
-        ctx.save_for_backward(x)
-        ctx.module = module
-        return x
-
-    @staticmethod
-    def backward(ctx, x_grad: Tensor):
-        (x_orig,) = ctx.saved_tensors
-        w = ctx.module
-
-        # Any failure below is logged and the unmodified x_grad is returned.
-        try:
-            with torch.enable_grad():
-                with torch.cuda.amp.autocast(enabled=False):
-                    # float32 leaf copy, so the metric can be differentiated
-                    # w.r.t. it.
-                    x_detached = x_orig.to(torch.float32).detach()
-                    x_detached.requires_grad = True
-
-                    metric = _whitening_metric(x_detached, w.num_groups)
-
-                    # occasional debug output (always, when run as a script).
-                    if random.random() < 0.005 or __name__ == "__main__":
-                        logging.info(
-                            f"Whitening: name={w.name}, num_groups={w.num_groups}, num_channels={x_orig.shape[-1]}, "
-                            f"metric={metric.item():.2f} vs. limit={float(w.whitening_limit)}"
-                        )
-
-                    if metric < float(w.whitening_limit):
-                        # constraint satisfied: leave the gradient alone and
-                        # switch the module to its lower probability.
-                        w.prob = w.min_prob
-                        return x_grad, None
-                    else:
-                        # constraint violated: switch the module to its higher
-                        # probability and add the penalty gradient.
-                        w.prob = w.max_prob
-                        metric.backward()
-                        penalty_grad = x_detached.grad
-                        # rescale so the penalty's norm is grad_scale times
-                        # the norm of the incoming gradient.
-                        scale = w.grad_scale * (
-                            x_grad.to(torch.float32).norm()
-                            / (penalty_grad.norm() + 1.0e-20)
-                        )
-                        penalty_grad = penalty_grad * scale
-                        return x_grad + penalty_grad.to(x_grad.dtype), None
-        except Exception as e:
-            logging.info(
-                f"Caught exception in Whiten backward: {e}, size={list(x_grad.shape)}, will continue."
-            )
-        return x_grad, None
-
-
-class Whiten(nn.Module):
-    def __init__(
-        self,
-        num_groups: int,
-        whitening_limit: FloatLike,
-        prob: Union[float, Tuple[float, float]],
-        grad_scale: FloatLike,
-    ):
-        """
-        Args:
-          num_groups: the number of groups to divide the channel dim into before
-            whitening.  We will attempt to make the feature covariance
-            within each group, after mean subtraction, as "white" as possible,
-            while having the same trace across all groups.
-          whitening_limit: a value greater than or equal to 1.0, that dictates
-            how much freedom we have to violate the constraints.  1.0 would
-            mean perfectly white, with exactly the same trace across groups;
-            larger values give more freedom.  E.g. 2.0.
-          prob: the probability with which we apply the gradient modification
-            (also affects the grad scale).  May be supplied as a float,
-            or as a pair (min_prob, max_prob).
-          grad_scale: determines the scale on the gradient term from this object,
-            relative to the rest of the gradient on the attention weights.
-            E.g. 0.02 (you may want to use smaller values than this if prob is large)
-        """
-        super().__init__()
-        assert num_groups >= 1
-        assert float(whitening_limit) >= 1
-        assert grad_scale >= 0
-        self.num_groups = num_groups
-        self.whitening_limit = whitening_limit
-        self.grad_scale = grad_scale
-
-        # a single float means min_prob == max_prob.
-        prob_pair = (prob, prob) if isinstance(prob, float) else prob
-        (self.min_prob, self.max_prob) = prob_pair
-        assert 0 < self.min_prob <= self.max_prob <= 1
-        # start at the upper probability.
-        self.prob = self.max_prob
-        self.name = None  # will be set in training loop
-
-    def forward(self, x: Tensor) -> Tensor:
-        """
-        In the forward pass, this function just returns the input unmodified.
-        In the backward pass, it will modify the gradients to ensure that the
-        distribution in each group has close to (lambda times I) as the covariance
-        after mean subtraction, with the same lambda across groups.
-        For whitening_limit > 1, there will be more freedom to violate this
-        constraint.
-
-        Args:
-           x: the input of shape (*, num_channels)
-
-        Returns:
-            x, unmodified.   You should make sure
-        you use the returned value, or the graph will be freed
-        and nothing will happen in backprop.
-        """
-        grad_scale = float(self.grad_scale)
-        skip = not x.requires_grad or random.random() > self.prob or grad_scale == 0
-        if skip:
-            return _no_op(x)
-        return WhiteningPenaltyFunction.apply(x, self)
-
-
-class WithLoss(torch.autograd.Function):
-    """Returns x in the forward pass; in the backward pass feeds a gradient of
-    all-ones to y, as if y.sum() had been added to the loss."""
-
-    @staticmethod
-    def forward(ctx, x: Tensor, y: Tensor, name: str):
-        # only y's shape is needed to build its gradient later.
-        ctx.y_shape = y.shape
-        # occasionally log the value of the auxiliary loss, if it has a name.
-        sampled = random.random() < 0.002
-        if sampled and name is not None:
-            loss_sum = y.sum().item()
-            logging.info(f"WithLoss: name={name}, loss-sum={loss_sum:.3e}")
-        return x
-
-    @staticmethod
-    def backward(ctx, ans_grad: Tensor):
-        y_grad = torch.ones(
-            ctx.y_shape, dtype=ans_grad.dtype, device=ans_grad.device
-        )
-        # (grad for x, grad for y, no grad for name)
-        return ans_grad, y_grad, None
-
-
-def with_loss(x, y, name):
-    """Thin wrapper around WithLoss.apply(x, y, name)."""
-    # returns x but adds y.sum() to the loss function.
-    return WithLoss.apply(x, y, name)
-
-
-class ScaleGradFunction(torch.autograd.Function):
-    """Identity in the forward pass; multiplies the gradient by `alpha` in the
-    backward pass."""
-
-    @staticmethod
-    def forward(ctx, x: Tensor, alpha: float) -> Tensor:
-        ctx.alpha = alpha
-        return x
-
-    @staticmethod
-    def backward(ctx, grad: Tensor):
-        scaled_grad = grad * ctx.alpha
-        # no gradient for alpha.
-        return scaled_grad, None
-
-
-def scale_grad(x: Tensor, alpha: float):
-    """Returns x via ScaleGradFunction, which multiplies the gradient flowing
-    back to x by alpha."""
-    return ScaleGradFunction.apply(x, alpha)
-
-
-class ScaleGrad(nn.Module):
-    """Module form of scale_grad(): identity in the forward pass, scales the
-    gradient by `alpha` in the backward pass while training."""
-
-    def __init__(self, alpha: float):
-        super().__init__()
-        self.alpha = alpha
-
-    def forward(self, x: Tensor) -> Tensor:
-        # plain pass-through when scripting/tracing or when not in training mode.
-        if torch.jit.is_scripting() or torch.jit.is_tracing() or not self.training:
-            return x
-        return scale_grad(x, self.alpha)
-
-
-class LimitParamValue(torch.autograd.Function):
-    """Identity in the forward pass; in the backward pass flips the sign of
-    gradient elements that would push x further outside [min, max]."""
-
-    @staticmethod
-    def forward(ctx, x: Tensor, min: float, max: float):
-        ctx.save_for_backward(x)
-        assert max >= min
-        ctx.min, ctx.max = min, max
-        return x
-
-    @staticmethod
-    def backward(ctx, x_grad: Tensor):
-        (x,) = ctx.saved_tensors
-        # where x < ctx.min, ensure all grads are negative (this will tend to make
-        # x more positive).
-        below_and_positive = torch.logical_and(x_grad > 0, x < ctx.min)
-        x_grad = x_grad * torch.where(below_and_positive, -1.0, 1.0)
-        # where x > ctx.max, ensure all grads are positive (this will tend to make
-        # x more negative).  Uses the gradient as already modified above.
-        above_and_negative = torch.logical_and(x_grad < 0, x > ctx.max)
-        x_grad *= torch.where(above_and_negative, -1.0, 1.0)
-        # no gradients for min and max.
-        return x_grad, None, None
-
-
-def limit_param_value(
-    x: Tensor, min: float, max: float, prob: float = 0.6, training: bool = True
-):
-    """
-    You apply this to (typically) an nn.Parameter during training to ensure
-    that its (elements mostly) stays within a supplied range.  This is done by
-    modifying the gradients in backprop, via LimitParamValue.
-
-    It's not necessary to do this on every batch: it is done only with
-    probability `prob`, to save a little time.  When `training` is false, x is
-    returned as-is.
-    """
-    if not training:
-        return x
-    if not (random.random() < prob):
-        return x
-    return LimitParamValue.apply(x, min, max)
-
-
-def _no_op(x: Tensor) -> Tensor:
-    """Returns a tensor equal to x.  Outside of scripting/tracing the result
-    comes from an actual tensor op rather than being x itself."""
-    if torch.jit.is_scripting() or torch.jit.is_tracing():
-        return x
-    # a no-op function that will have a node in the autograd graph,
-    # to avoid certain bugs relating to backward hooks
-    (only_chunk,) = x.chunk(1, dim=-1)
-    return only_chunk
-
-
-class Identity(torch.nn.Module):
-    """Module whose forward() simply applies _no_op() to its input."""
-
-    def __init__(self):
-        super().__init__()
-
-    def forward(self, x):
-        return _no_op(x)
-
-
-class DoubleSwishFunction(torch.autograd.Function):
-    """
-      double_swish(x) = x * torch.sigmoid(x-1)
-
-    This is a definition, originally motivated by its close numerical
-    similarity to swish(swish(x)), where swish(x) =  x * sigmoid(x).
-
-    Memory-efficient derivative computation:
-     double_swish(x) = x * s, where s(x) = torch.sigmoid(x-1)
-     double_swish'(x) = d/dx double_swish(x) =  x * s'(x) + x' * s(x) = x * s'(x) + s(x).
-     Now, s'(x) = s(x) * (1-s(x)).
-     double_swish'(x) =  x * s'(x) + s(x).
-                      =  x * s(x) * (1-s(x)) + s(x).
-                     = double_swish(x) * (1-s(x)) + s(x)
-     ... so we just need to remember s(x) but not x itself.
-
-    The derivative is stored for backward() quantized to 8 bits.
-    """
-
-    # Range used to quantize the derivative of x * sigmoid(x - 1); see
-    # https://www.wolframalpha.com/input?i=d%2Fdx+%28x+*+sigmoid%28x-1%29%29
-    # min \simeq -0.043638.  Take floor as -0.044 so it's a lower bound.
-    # max \simeq 1.1990.   Take ceil to be 1.2 so it's an upper bound.
-    # forward() and backward() must use the SAME values, otherwise the
-    # dequantized derivative is biased.
-    _DERIV_FLOOR = -0.044
-    _DERIV_CEIL = 1.2
-
-    @staticmethod
-    def forward(ctx, x: Tensor) -> Tensor:
-        requires_grad = x.requires_grad
-        # remember the input dtype before upcasting, so that the output can be
-        # cast back to float16 below.
-        input_is_half = x.dtype == torch.float16
-        if input_is_half:
-            x = x.to(torch.float32)
-
-        s = torch.sigmoid(x - 1.0)
-        y = x * s
-
-        if requires_grad:
-            deriv = y * (1 - s) + s
-
-            floor = DoubleSwishFunction._DERIV_FLOOR
-            ceil = DoubleSwishFunction._DERIV_CEIL
-            # the combination of "+ torch.rand_like(deriv)" and casting to torch.uint8 (which
-            # floors), should be expectation-preserving.
-            d_scaled = (deriv - floor) * (255.0 / (ceil - floor)) + torch.rand_like(
-                deriv
-            )
-            if __name__ == "__main__":
-                # for self-testing only.
-                assert d_scaled.min() >= 0.0
-                assert d_scaled.max() < 256.0
-            d_int = d_scaled.to(torch.uint8)
-            ctx.save_for_backward(d_int)
-        if input_is_half or torch.is_autocast_enabled():
-            y = y.to(torch.float16)
-        return y
-
-    @staticmethod
-    def backward(ctx, y_grad: Tensor) -> Tensor:
-        (d,) = ctx.saved_tensors
-        # the same constants as used in forward pass.
-        floor = DoubleSwishFunction._DERIV_FLOOR
-        ceil = DoubleSwishFunction._DERIV_CEIL
-
-        # undo the 8-bit quantization of the derivative.
-        d = d * ((ceil - floor) / 255.0) + floor
-        return y_grad * d
-
-
-class DoubleSwish(torch.nn.Module):
-    """Module wrapper around the double-swish activation x * sigmoid(x - 1)."""
-
-    def __init__(self):
-        super().__init__()
-
-    def forward(self, x: Tensor) -> Tensor:
-        """Return double-swish activation function which is an approximation to Swish(Swish(x)),
-        that we approximate closely with x * sigmoid(x-1).
-        """
-        # when scripting/tracing, use the plain expression instead of the
-        # custom autograd function.
-        if torch.jit.is_scripting() or torch.jit.is_tracing():
-            return x * torch.sigmoid(x - 1.0)
-        return DoubleSwishFunction.apply(x)
-
-
-class Dropout2(nn.Module):
-    """Just like normal dropout, except it supports schedules on the dropout
-    rate: `p` is converted with float() on every forward call."""
-
-    def __init__(self, p: FloatLike):
-        super().__init__()
-        self.p = p
-
-    def forward(self, x: Tensor) -> Tensor:
-        drop_prob = float(self.p)
-        return torch.nn.functional.dropout(x, p=drop_prob, training=self.training)
-
-
-class MulForDropout3(torch.autograd.Function):
-    """Returns (x * y * alpha) where alpha is a float and y doesn't require
-    grad and is zero-or-one.  Only the output is saved for the backward pass."""
-
-    @staticmethod
-    @custom_fwd
-    def forward(ctx, x, y, alpha):
-        assert not y.requires_grad
-        out = x * y * alpha
-        ctx.save_for_backward(out)
-        ctx.alpha = alpha
-        return out
-
-    @staticmethod
-    @custom_bwd
-    def backward(ctx, ans_grad):
-        (out,) = ctx.saved_tensors
-        # positions where the output is nonzero are the ones that receive
-        # gradient.
-        kept = out != 0
-        x_grad = ctx.alpha * ans_grad * kept
-        # no gradients for y and alpha.
-        return x_grad, None, None
-
-
-# Dropout3 is just like normal dropout, except it supports schedules on the dropout rates,
-# and it lets you choose one dimension to share the dropout mask over
-class Dropout3(nn.Module):
-    def __init__(self, p: FloatLike, shared_dim: int):
-        super().__init__()
-        self.p = p
-        self.shared_dim = shared_dim
-
-    def forward(self, x: Tensor) -> Tensor:
-        p = float(self.p)
-        if not self.training or p == 0:
-            return _no_op(x)
-        scale = 1.0 / (1 - p)
-        rand_shape = list(x.shape)
-        rand_shape[self.shared_dim] = 1
-        mask = torch.rand(*rand_shape, device=x.device) > p
-        ans = MulForDropout3.apply(x, mask, scale)
-        return ans
-
-
-class SwooshLFunction(torch.autograd.Function):
-    """
-    swoosh_l(x) =  log(1 + exp(x-4)) - 0.08*x - 0.035
-    """
-
-    @staticmethod
-    def forward(ctx, x: Tensor) -> Tensor:
-        requires_grad = x.requires_grad
-        if x.dtype == torch.float16:
-            x = x.to(torch.float32)
-
-        zero = torch.tensor(0.0, dtype=x.dtype, device=x.device)
-
-        coeff = -0.08
-
-        with torch.cuda.amp.autocast(enabled=False):
-            with torch.enable_grad():
-                x = x.detach()
-                x.requires_grad = True
-                y = torch.logaddexp(zero, x - 4.0) + coeff * x - 0.035
-
-                if not requires_grad:
-                    return y
-
-                y.backward(gradient=torch.ones_like(y))
-
-                grad = x.grad
-                floor = coeff
-                ceil = 1.0 + coeff + 0.005
-
-                d_scaled = (grad - floor) * (255.0 / (ceil - floor)) + torch.rand_like(
-                    grad
-                )
-                if __name__ == "__main__":
-                    # for self-testing only.
-                    assert d_scaled.min() >= 0.0
-                    assert d_scaled.max() < 256.0
-
-                d_int = d_scaled.to(torch.uint8)
-                ctx.save_for_backward(d_int)
-                if x.dtype == torch.float16 or torch.is_autocast_enabled():
-                    y = y.to(torch.float16)
-                return y
-
-    @staticmethod
-    def backward(ctx, y_grad: Tensor) -> Tensor:
-        (d,) = ctx.saved_tensors
-        # the same constants as used in forward pass.
-
-        coeff = -0.08
-        floor = coeff
-        ceil = 1.0 + coeff + 0.005
-        d = d * ((ceil - floor) / 255.0) + floor
-        return y_grad * d
-
-
-class SwooshL(torch.nn.Module):
-    def forward(self, x: Tensor) -> Tensor:
-        """Return Swoosh-L activation."""
-        if torch.jit.is_scripting() or torch.jit.is_tracing():
-            zero = torch.tensor(0.0, dtype=x.dtype, device=x.device)
-            return logaddexp(zero, x - 4.0) - 0.08 * x - 0.035
-        if not x.requires_grad:
-            return k2.swoosh_l_forward(x)
-        else:
-            return k2.swoosh_l(x)
-        # return SwooshLFunction.apply(x)
-
-
-class SwooshLOnnx(torch.nn.Module):
-    def forward(self, x: Tensor) -> Tensor:
-        """Return Swoosh-L activation."""
-        zero = torch.tensor(0.0, dtype=x.dtype, device=x.device)
-        return logaddexp_onnx(zero, x - 4.0) - 0.08 * x - 0.035
-
-
-class SwooshRFunction(torch.autograd.Function):
-    """
-     swoosh_r(x) =  log(1 + exp(x-1)) - 0.08*x - 0.313261687
-
-    derivatives are between -0.08 and 0.92.
-    """
-
-    @staticmethod
-    def forward(ctx, x: Tensor) -> Tensor:
-        requires_grad = x.requires_grad
-
-        if x.dtype == torch.float16:
-            x = x.to(torch.float32)
-
-        zero = torch.tensor(0.0, dtype=x.dtype, device=x.device)
-
-        with torch.cuda.amp.autocast(enabled=False):
-            with torch.enable_grad():
-                x = x.detach()
-                x.requires_grad = True
-                y = torch.logaddexp(zero, x - 1.0) - 0.08 * x - 0.313261687
-
-                if not requires_grad:
-                    return y
-                y.backward(gradient=torch.ones_like(y))
-
-                grad = x.grad
-                floor = -0.08
-                ceil = 0.925
-
-                d_scaled = (grad - floor) * (255.0 / (ceil - floor)) + torch.rand_like(
-                    grad
-                )
-                if __name__ == "__main__":
-                    # for self-testing only.
-                    assert d_scaled.min() >= 0.0
-                    assert d_scaled.max() < 256.0
-
-                d_int = d_scaled.to(torch.uint8)
-                ctx.save_for_backward(d_int)
-                if x.dtype == torch.float16 or torch.is_autocast_enabled():
-                    y = y.to(torch.float16)
-                return y
-
-    @staticmethod
-    def backward(ctx, y_grad: Tensor) -> Tensor:
-        (d,) = ctx.saved_tensors
-        # the same constants as used in forward pass.
-        floor = -0.08
-        ceil = 0.925
-        d = d * ((ceil - floor) / 255.0) + floor
-        return y_grad * d
-
-
-class SwooshR(torch.nn.Module):
-    def forward(self, x: Tensor) -> Tensor:
-        """Return Swoosh-R activation."""
-        if torch.jit.is_scripting() or torch.jit.is_tracing():
-            zero = torch.tensor(0.0, dtype=x.dtype, device=x.device)
-            return logaddexp(zero, x - 1.0) - 0.08 * x - 0.313261687
-        if not x.requires_grad:
-            return k2.swoosh_r_forward(x)
-        else:
-            return k2.swoosh_r(x)
-        # return SwooshRFunction.apply(x)
-
-
-class SwooshROnnx(torch.nn.Module):
-    def forward(self, x: Tensor) -> Tensor:
-        """Return Swoosh-R activation."""
-        zero = torch.tensor(0.0, dtype=x.dtype, device=x.device)
-        return logaddexp_onnx(zero, x - 1.0) - 0.08 * x - 0.313261687
-
-
-# simple version of SwooshL that does not redefine the backprop, used in
-# ActivationDropoutAndLinearFunction.
-def SwooshLForward(x: Tensor):
-    x_offset = x - 4.0
-    log_sum = (1.0 + x_offset.exp()).log().to(x.dtype)
-    log_sum = torch.where(log_sum == float("inf"), x_offset, log_sum)
-    return log_sum - 0.08 * x - 0.035
-
-
-# simple version of SwooshR that does not redefine the backprop, used in
-# ActivationDropoutAndLinearFunction.
-def SwooshRForward(x: Tensor):
-    x_offset = x - 1.0
-    log_sum = (1.0 + x_offset.exp()).log().to(x.dtype)
-    log_sum = torch.where(log_sum == float("inf"), x_offset, log_sum)
-    return log_sum - 0.08 * x - 0.313261687
-
-
-class ActivationDropoutAndLinearFunction(torch.autograd.Function):
-    @staticmethod
-    @custom_fwd
-    def forward(
-        ctx,
-        x: Tensor,
-        weight: Tensor,
-        bias: Optional[Tensor],
-        activation: str,
-        dropout_p: float,
-        dropout_shared_dim: Optional[int],
-    ):
-        if dropout_p != 0.0:
-            dropout_shape = list(x.shape)
-            if dropout_shared_dim is not None:
-                dropout_shape[dropout_shared_dim] = 1
-            # else it won't be very memory efficient.
-            dropout_mask = (1.0 / (1.0 - dropout_p)) * (
-                torch.rand(*dropout_shape, device=x.device, dtype=x.dtype) > dropout_p
-            )
-        else:
-            dropout_mask = None
-
-        ctx.save_for_backward(x, weight, bias, dropout_mask)
-
-        ctx.activation = activation
-
-        forward_activation_dict = {
-            "SwooshL": k2.swoosh_l_forward,
-            "SwooshR": k2.swoosh_r_forward,
-        }
-        # it will raise a KeyError if this fails.  This will be an error.  We let it
-        # propagate to the user.
-        activation_func = forward_activation_dict[activation]
-        x = activation_func(x)
-        if dropout_mask is not None:
-            x = x * dropout_mask
-        x = torch.nn.functional.linear(x, weight, bias)
-        return x
-
-    @staticmethod
-    @custom_bwd
-    def backward(ctx, ans_grad: Tensor):
-        saved = ctx.saved_tensors
-        (x, weight, bias, dropout_mask) = saved
-
-        forward_and_deriv_activation_dict = {
-            "SwooshL": k2.swoosh_l_forward_and_deriv,
-            "SwooshR": k2.swoosh_r_forward_and_deriv,
-        }
-        # the following lines a KeyError if the activation is unrecognized.
-        # This will be an error.  We let it propagate to the user.
-        func = forward_and_deriv_activation_dict[ctx.activation]
-
-        y, func_deriv = func(x)
-        if dropout_mask is not None:
-            y = y * dropout_mask
-        # now compute derivative of y w.r.t. weight and bias..
-        # y: (..., in_channels), ans_grad: (..., out_channels),
-        (out_channels, in_channels) = weight.shape
-
-        in_channels = y.shape[-1]
-        g = ans_grad.reshape(-1, out_channels)
-        weight_deriv = torch.matmul(g.t(), y.reshape(-1, in_channels))
-        y_deriv = torch.matmul(ans_grad, weight)
-        bias_deriv = None if bias is None else g.sum(dim=0)
-        x_deriv = y_deriv * func_deriv
-        if dropout_mask is not None:
-            # order versus func_deriv does not matter
-            x_deriv = x_deriv * dropout_mask
-
-        return x_deriv, weight_deriv, bias_deriv, None, None, None
-
-
-class ActivationDropoutAndLinear(torch.nn.Module):
-    """
-     This merges an activation function followed by dropout and then a nn.Linear module;
-     it does so in a memory efficient way so that it only stores the input to the whole
-     module.  If activation == SwooshL and dropout_shared_dim != None, this will be
-     equivalent to:
-       nn.Sequential(SwooshL(),
-                     Dropout3(dropout_p, shared_dim=dropout_shared_dim),
-                     ScaledLinear(in_channels, out_channels, bias=bias,
-                                  initial_scale=initial_scale))
-    If dropout_shared_dim is None, the dropout would be equivalent to
-    Dropout2(dropout_p).  Note: Dropout3 will be more memory efficient as the dropout
-    mask is smaller.
-
-     Args:
-        in_channels: number of input channels, e.g. 256
-        out_channels: number of output channels, e.g. 256
-        bias: if true, have a bias
-        activation: the activation function, for now just support SwooshL.
-        dropout_p: the dropout probability or schedule (happens after nonlinearity).
-        dropout_shared_dim: the dimension, if any, across which the dropout mask is
-             shared (e.g. the time dimension).  If None, this may be less memory
-             efficient if there are modules before this one that cache the input
-             for their backprop (e.g. Balancer or Whiten).
-    """
-
-    def __init__(
-        self,
-        in_channels: int,
-        out_channels: int,
-        bias: bool = True,
-        activation: str = "SwooshL",
-        dropout_p: FloatLike = 0.0,
-        dropout_shared_dim: Optional[int] = -1,
-        initial_scale: float = 1.0,
-    ):
-        super().__init__()
-        # create a temporary module of nn.Linear that we'll steal the
-        # weights and bias from
-        l = ScaledLinear(
-            in_channels, out_channels, bias=bias, initial_scale=initial_scale
-        )
-
-        self.weight = l.weight
-        # register_parameter properly handles making it a parameter when l.bias
-        # is None. I think there is some reason for doing it this way rather
-        # than just setting it to None but I don't know what it is, maybe
-        # something to do with exporting the module..
-        self.register_parameter("bias", l.bias)
-
-        self.activation = activation
-        self.dropout_p = dropout_p
-        self.dropout_shared_dim = dropout_shared_dim
-
-    def forward(self, x: Tensor):
-        if torch.jit.is_scripting() or torch.jit.is_tracing():
-            if self.activation == "SwooshL":
-                x = SwooshLForward(x)
-            elif self.activation == "SwooshR":
-                x = SwooshRForward(x)
-            else:
-                assert False, self.activation
-            return torch.nn.functional.linear(x, self.weight, self.bias)
-
-        return ActivationDropoutAndLinearFunction.apply(
-            x,
-            self.weight,
-            self.bias,
-            self.activation,
-            float(self.dropout_p),
-            self.dropout_shared_dim,
-        )
-
-
-def convert_num_channels(x: Tensor, num_channels: int) -> Tensor:
-    if num_channels <= x.shape[-1]:
-        return x[..., :num_channels]
-    else:
-        shape = list(x.shape)
-        shape[-1] = num_channels - shape[-1]
-        zeros = torch.zeros(shape, dtype=x.dtype, device=x.device)
-        return torch.cat((x, zeros), dim=-1)
-
-
-def _test_whiten():
-    for proportion in [0.1, 0.5, 10.0]:
-        logging.info(f"_test_whiten(): proportion = {proportion}")
-        x = torch.randn(100, 128)
-        direction = torch.randn(128)
-        coeffs = torch.randn(100, 1)
-        x += proportion * direction * coeffs
-
-        x.requires_grad = True
-
-        m = Whiten(
-            1, 5.0, prob=1.0, grad_scale=0.1  # num_groups  # whitening_limit,
-        )  # grad_scale
-
-        for _ in range(4):
-            y = m(x)
-
-        y_grad = torch.randn_like(x)
-        y.backward(gradient=y_grad)
-
-        if proportion < 0.2:
-            assert torch.allclose(x.grad, y_grad)
-        elif proportion > 1.0:
-            assert not torch.allclose(x.grad, y_grad)
-
-
-def _test_balancer_sign():
-    probs = torch.arange(0, 1, 0.01)
-    N = 1000
-    x = 1.0 * ((2.0 * (torch.rand(probs.numel(), N) < probs.unsqueeze(-1))) - 1.0)
-    x = x.detach()
-    x.requires_grad = True
-    m = Balancer(
-        probs.numel(),
-        channel_dim=0,
-        min_positive=0.05,
-        max_positive=0.95,
-        min_abs=0.0,
-        prob=1.0,
-    )
-
-    y_grad = torch.sign(torch.randn(probs.numel(), N))
-
-    y = m(x)
-    y.backward(gradient=y_grad)
-    print("_test_balancer_sign: x = ", x)
-    print("_test_balancer_sign: y grad = ", y_grad)
-    print("_test_balancer_sign: x grad = ", x.grad)
-
-
-def _test_balancer_magnitude():
-    magnitudes = torch.arange(0, 1, 0.01)
-    N = 1000
-    x = torch.sign(torch.randn(magnitudes.numel(), N)) * magnitudes.unsqueeze(-1)
-    x = x.detach()
-    x.requires_grad = True
-    m = Balancer(
-        magnitudes.numel(),
-        channel_dim=0,
-        min_positive=0.0,
-        max_positive=1.0,
-        min_abs=0.2,
-        max_abs=0.7,
-        prob=1.0,
-    )
-
-    y_grad = torch.sign(torch.randn(magnitudes.numel(), N))
-
-    y = m(x)
-    y.backward(gradient=y_grad)
-    print("_test_balancer_magnitude: x = ", x)
-    print("_test_balancer_magnitude: y grad = ", y_grad)
-    print("_test_balancer_magnitude: x grad = ", x.grad)
-
-
-def _test_double_swish_deriv():
-    x = torch.randn(10, 12, dtype=torch.double) * 3.0
-    x.requires_grad = True
-    m = DoubleSwish()
-
-    tol = (1.2 - (-0.043637)) / 255.0
-    torch.autograd.gradcheck(m, x, atol=tol)
-
-    # for self-test.
-    x = torch.randn(1000, 1000, dtype=torch.double) * 3.0
-    x.requires_grad = True
-    y = m(x)
-
-
-def _test_swooshl_deriv():
-    x = torch.randn(10, 12, dtype=torch.double) * 3.0
-    x.requires_grad = True
-    m = SwooshL()
-
-    tol = 1.0 / 255.0
-    torch.autograd.gradcheck(m, x, atol=tol, eps=0.01)
-
-    # for self-test.
-    x = torch.randn(1000, 1000, dtype=torch.double) * 3.0
-    x.requires_grad = True
-    y = m(x)
-
-
-def _test_swooshr_deriv():
-    x = torch.randn(10, 12, dtype=torch.double) * 3.0
-    x.requires_grad = True
-    m = SwooshR()
-
-    tol = 1.0 / 255.0
-    torch.autograd.gradcheck(m, x, atol=tol, eps=0.01)
-
-    # for self-test.
-    x = torch.randn(1000, 1000, dtype=torch.double) * 3.0
-    x.requires_grad = True
-    y = m(x)
-
-
-def _test_softmax():
-    a = torch.randn(2, 10, dtype=torch.float64)
-    b = a.clone()
-    a.requires_grad = True
-    b.requires_grad = True
-    a.softmax(dim=1)[:, 0].sum().backward()
-    print("a grad = ", a.grad)
-    softmax(b, dim=1)[:, 0].sum().backward()
-    print("b grad = ", b.grad)
-    assert torch.allclose(a.grad, b.grad)
-
-
-def _test_piecewise_linear():
-    p = PiecewiseLinear((0, 10.0))
-    for x in [-100, 0, 100]:
-        assert p(x) == 10.0
-    p = PiecewiseLinear((0, 10.0), (1, 0.0))
-    for x, y in [(-100, 10.0), (0, 10.0), (0.5, 5.0), (1, 0.0), (2, 0.0)]:
-        print("x, y = ", x, y)
-        assert p(x) == y, (x, p(x), y)
-
-    q = PiecewiseLinear((0.5, 15.0), (0.6, 1.0))
-    x_vals = [-1.0, 0.0, 0.1, 0.2, 0.5, 0.6, 0.7, 0.9, 1.0, 2.0]
-    pq = p.max(q)
-    for x in x_vals:
-        y1 = max(p(x), q(x))
-        y2 = pq(x)
-        assert abs(y1 - y2) < 0.001
-    pq = p.min(q)
-    for x in x_vals:
-        y1 = min(p(x), q(x))
-        y2 = pq(x)
-        assert abs(y1 - y2) < 0.001
-    pq = p + q
-    for x in x_vals:
-        y1 = p(x) + q(x)
-        y2 = pq(x)
-        assert abs(y1 - y2) < 0.001
-
-
-def _test_activation_dropout_and_linear():
-    in_channels = 20
-    out_channels = 30
-
-    for bias in [True, False]:
-        # actually we don't test for dropout_p != 0.0 because forward functions will give
-        # different answers.  This is because we are using the k2 implementation of
-        # swoosh_l an swoosh_r inside SwooshL() and SwooshR(), and they call randn()
-        # internally, messing up the random state.
-        for dropout_p in [0.0]:
-            for activation in ["SwooshL", "SwooshR"]:
-                m1 = nn.Sequential(
-                    SwooshL() if activation == "SwooshL" else SwooshR(),
-                    Dropout3(p=dropout_p, shared_dim=-1),
-                    ScaledLinear(
-                        in_channels, out_channels, bias=bias, initial_scale=0.5
-                    ),
-                )
-                m2 = ActivationDropoutAndLinear(
-                    in_channels,
-                    out_channels,
-                    bias=bias,
-                    initial_scale=0.5,
-                    activation=activation,
-                    dropout_p=dropout_p,
-                )
-                with torch.no_grad():
-                    m2.weight[:] = m1[2].weight
-                    if bias:
-                        m2.bias[:] = m1[2].bias
-                # make sure forward gives same result.
-                x1 = torch.randn(10, in_channels)
-                x1.requires_grad = True
-
-                # TEMP.
-                assert torch.allclose(
-                    SwooshRFunction.apply(x1), SwooshRForward(x1), atol=1.0e-03
-                )
-
-                x2 = x1.clone().detach()
-                x2.requires_grad = True
-                seed = 10
-                torch.manual_seed(seed)
-                y1 = m1(x1)
-                y_grad = torch.randn_like(y1)
-                y1.backward(gradient=y_grad)
-                torch.manual_seed(seed)
-                y2 = m2(x2)
-                y2.backward(gradient=y_grad)
-
-                print(
-                    f"bias = {bias}, dropout_p = {dropout_p}, activation = {activation}"
-                )
-                print("y1 = ", y1)
-                print("y2 = ", y2)
-                assert torch.allclose(y1, y2, atol=0.02)
-                assert torch.allclose(m1[2].weight.grad, m2.weight.grad, atol=1.0e-05)
-                if bias:
-                    assert torch.allclose(m1[2].bias.grad, m2.bias.grad, atol=1.0e-05)
-                print("x1.grad = ", x1.grad)
-                print("x2.grad = ", x2.grad)
-
-                def isclose(a, b):
-                    # return true if cosine similarity is > 0.9.
-                    return (a * b).sum() > 0.9 * (
-                        (a**2).sum() * (b**2).sum()
-                    ).sqrt()
-
-                # the SwooshL() implementation has a noisy gradient due to 1-byte
-                # storage of it.
-                assert isclose(x1.grad, x2.grad)
-
-
-if __name__ == "__main__":
-    logging.getLogger().setLevel(logging.INFO)
-    torch.set_num_threads(1)
-    torch.set_num_interop_threads(1)
-    _test_piecewise_linear()
-    _test_softmax()
-    _test_whiten()
-    _test_balancer_sign()
-    _test_balancer_magnitude()
-    _test_double_swish_deriv()
-    _test_swooshr_deriv()
-    _test_swooshl_deriv()
-    _test_activation_dropout_and_linear()
diff --git a/egs/librispeech/SSL/hubert/scaling.py b/egs/librispeech/SSL/hubert/scaling.py
new file mode 120000
index 000000000..e30bd99de
--- /dev/null
+++ b/egs/librispeech/SSL/hubert/scaling.py
@@ -0,0 +1 @@
+../../ASR/zipformer/scaling.py
\ No newline at end of file
diff --git a/egs/librispeech/SSL/hubert/subsampling.py b/egs/librispeech/SSL/hubert/subsampling.py
deleted file mode 100644
index b2f769d3f..000000000
--- a/egs/librispeech/SSL/hubert/subsampling.py
+++ /dev/null
@@ -1,406 +0,0 @@
-#!/usr/bin/env python3
-# Copyright    2023  Xiaomi Corp.        (authors: Daniel Povey,
-#                                                  Zengwei Yao)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import warnings
-from typing import Tuple
-
-import torch
-from scaling import (
-    Balancer,
-    BiasNorm,
-    Dropout3,
-    FloatLike,
-    Optional,
-    ScaledConv2d,
-    ScaleGrad,
-    ScheduledFloat,
-    SwooshL,
-    SwooshR,
-    Whiten,
-)
-from torch import Tensor, nn
-
-
-class ConvNeXt(nn.Module):
-    """
-    Our interpretation of the ConvNeXt module as used in https://arxiv.org/pdf/2206.14747.pdf
-    """
-
-    def __init__(
-        self,
-        channels: int,
-        hidden_ratio: int = 3,
-        kernel_size: Tuple[int, int] = (7, 7),
-        layerdrop_rate: FloatLike = None,
-    ):
-        super().__init__()
-        self.padding = ((kernel_size[0] - 1) // 2, (kernel_size[1] - 1) // 2)
-        hidden_channels = channels * hidden_ratio
-        if layerdrop_rate is None:
-            layerdrop_rate = ScheduledFloat((0.0, 0.2), (20000.0, 0.015))
-        self.layerdrop_rate = layerdrop_rate
-
-        self.depthwise_conv = nn.Conv2d(
-            in_channels=channels,
-            out_channels=channels,
-            groups=channels,
-            kernel_size=kernel_size,
-            padding=self.padding,
-        )
-
-        self.pointwise_conv1 = nn.Conv2d(
-            in_channels=channels, out_channels=hidden_channels, kernel_size=1
-        )
-
-        self.hidden_balancer = Balancer(
-            hidden_channels,
-            channel_dim=1,
-            min_positive=0.3,
-            max_positive=1.0,
-            min_abs=0.75,
-            max_abs=5.0,
-        )
-
-        self.activation = SwooshL()
-        self.pointwise_conv2 = ScaledConv2d(
-            in_channels=hidden_channels,
-            out_channels=channels,
-            kernel_size=1,
-            initial_scale=0.01,
-        )
-
-        self.out_balancer = Balancer(
-            channels,
-            channel_dim=1,
-            min_positive=0.4,
-            max_positive=0.6,
-            min_abs=1.0,
-            max_abs=6.0,
-        )
-        self.out_whiten = Whiten(
-            num_groups=1,
-            whitening_limit=5.0,
-            prob=(0.025, 0.25),
-            grad_scale=0.01,
-        )
-
-    def forward(self, x: Tensor) -> Tensor:
-        if torch.jit.is_scripting() or torch.jit.is_tracing() or not self.training:
-            return self.forward_internal(x)
-        layerdrop_rate = float(self.layerdrop_rate)
-
-        if layerdrop_rate != 0.0:
-            batch_size = x.shape[0]
-            mask = (
-                torch.rand((batch_size, 1, 1, 1), dtype=x.dtype, device=x.device)
-                > layerdrop_rate
-            )
-        else:
-            mask = None
-        # turns out this caching idea does not work with --world-size > 1
-        # return caching_eval(self.forward_internal, x, mask)
-        return self.forward_internal(x, mask)
-
-    def forward_internal(
-        self, x: Tensor, layer_skip_mask: Optional[Tensor] = None
-    ) -> Tensor:
-        """
-        x layout: (N, C, H, W), i.e. (batch_size, num_channels, num_frames, num_freqs)
-
-        The returned value has the same shape as x.
-        """
-        bypass = x
-        x = self.depthwise_conv(x)
-        x = self.pointwise_conv1(x)
-        x = self.hidden_balancer(x)
-        x = self.activation(x)
-        x = self.pointwise_conv2(x)
-
-        if layer_skip_mask is not None:
-            x = x * layer_skip_mask
-
-        x = bypass + x
-        x = self.out_balancer(x)
-
-        if x.requires_grad:
-            x = x.transpose(1, 3)  # (N, W, H, C); need channel dim to be last
-            x = self.out_whiten(x)
-            x = x.transpose(1, 3)  # (N, C, H, W)
-
-        return x
-
-    def streaming_forward(
-        self,
-        x: Tensor,
-        cached_left_pad: Tensor,
-    ) -> Tuple[Tensor, Tensor]:
-        """
-        Args:
-            x layout: (N, C, H, W), i.e. (batch_size, num_channels, num_frames, num_freqs)
-            cached_left_pad: (batch_size, num_channels, left_pad, num_freqs)
-
-        Returns:
-            - The returned value has the same shape as x.
-            - Updated cached_left_pad.
-        """
-        padding = self.padding
-
-        # The length without right padding for depth-wise conv
-        T = x.size(2) - padding[0]
-
-        bypass = x[:, :, :T, :]
-
-        # Pad left side
-        assert cached_left_pad.size(2) == padding[0], (
-            cached_left_pad.size(2),
-            padding[0],
-        )
-        x = torch.cat([cached_left_pad, x], dim=2)
-        # Update cached left padding
-        cached_left_pad = x[:, :, T : padding[0] + T, :]
-
-        # depthwise_conv
-        x = torch.nn.functional.conv2d(
-            x,
-            weight=self.depthwise_conv.weight,
-            bias=self.depthwise_conv.bias,
-            padding=(0, padding[1]),
-            groups=self.depthwise_conv.groups,
-        )
-        x = self.pointwise_conv1(x)
-        x = self.hidden_balancer(x)
-        x = self.activation(x)
-        x = self.pointwise_conv2(x)
-
-        x = bypass + x
-        return x, cached_left_pad
-
-
-class Conv2dSubsampling(nn.Module):
-    """Convolutional 2D subsampling (to 1/2 length).
-
-    Convert an input of shape (N, T, idim) to an output
-    with shape (N, T', odim), where
-    T' = (T-3)//2 - 2 == (T-7)//2
-
-    It is based on
-    https://github.com/espnet/espnet/blob/master/espnet/nets/pytorch_backend/transformer/subsampling.py  # noqa
-    """
-
-    def __init__(
-        self,
-        in_channels: int,
-        out_channels: int,
-        layer1_channels: int = 8,
-        layer2_channels: int = 32,
-        layer3_channels: int = 128,
-        dropout: FloatLike = 0.1,
-    ) -> None:
-        """
-        Args:
-          in_channels:
-            Number of channels in. The input shape is (N, T, in_channels).
-            Caution: It requires: T >=7, in_channels >=7
-          out_channels
-            Output dim. The output shape is (N, (T-3)//2, out_channels)
-          layer1_channels:
-            Number of channels in layer1
-          layer1_channels:
-            Number of channels in layer2
-          bottleneck:
-            bottleneck dimension for 1d squeeze-excite
-        """
-        assert in_channels >= 7
-        super().__init__()
-
-        # The ScaleGrad module is there to prevent the gradients
-        # w.r.t. the weight or bias of the first Conv2d module in self.conv from
-        # exceeding the range of fp16 when using automatic mixed precision (amp)
-        # training.  (The second one is necessary to stop its bias from getting
-        # a too-large gradient).
-
-        self.conv = nn.Sequential(
-            nn.Conv2d(
-                in_channels=1,
-                out_channels=layer1_channels,
-                kernel_size=3,
-                padding=(0, 1),  # (time, freq)
-            ),
-            ScaleGrad(0.2),
-            Balancer(layer1_channels, channel_dim=1, max_abs=1.0),
-            SwooshR(),
-            nn.Conv2d(
-                in_channels=layer1_channels,
-                out_channels=layer2_channels,
-                kernel_size=3,
-                stride=2,
-                padding=0,
-            ),
-            Balancer(layer2_channels, channel_dim=1, max_abs=4.0),
-            SwooshR(),
-            nn.Conv2d(
-                in_channels=layer2_channels,
-                out_channels=layer3_channels,
-                kernel_size=3,
-                stride=(1, 2),  # (time, freq)
-            ),
-            Balancer(layer3_channels, channel_dim=1, max_abs=4.0),
-            SwooshR(),
-        )
-
-        # just one convnext layer
-        self.convnext = ConvNeXt(layer3_channels, kernel_size=(7, 7))
-
-        # (in_channels-3)//4
-        self.out_width = (((in_channels - 1) // 2) - 1) // 2
-        self.layer3_channels = layer3_channels
-
-        self.out = nn.Linear(self.out_width * layer3_channels, out_channels)
-        # use a larger than normal grad_scale on this whitening module; there is
-        # only one such module, so there is not a concern about adding together
-        # many copies of this extra gradient term.
-        self.out_whiten = Whiten(
-            num_groups=1,
-            whitening_limit=ScheduledFloat((0.0, 4.0), (20000.0, 8.0), default=4.0),
-            prob=(0.025, 0.25),
-            grad_scale=0.02,
-        )
-
-        # max_log_eps=0.0 is to prevent both eps and the output of self.out from
-        # getting large, there is an unnecessary degree of freedom.
-        self.out_norm = BiasNorm(out_channels)
-        self.dropout = Dropout3(dropout, shared_dim=1)
-
-    def forward(
-        self, x: torch.Tensor, x_lens: torch.Tensor
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        """Subsample x.
-
-        Args:
-          x:
-            Its shape is (N, T, idim).
-          x_lens:
-            A tensor of shape (batch_size,) containing the number of frames in
-
-        Returns:
-          - a tensor of shape (N, (T-7)//2, odim)
-          - output lengths, of shape (batch_size,)
-        """
-        # On entry, x is (N, T, idim)
-        x = x.unsqueeze(1)  # (N, T, idim) -> (N, 1, T, idim) i.e., (N, C, H, W)
-        # scaling x by 0.1 allows us to use a larger grad-scale in fp16 "amp" (automatic mixed precision)
-        # training, since the weights in the first convolution are otherwise the limiting factor for getting infinite
-        # gradients.
-        x = self.conv(x)
-        x = self.convnext(x)
-
-        # Now x is of shape (N, odim, (T-7)//2, (idim-3)//4)
-        b, c, t, f = x.size()
-
-        x = x.transpose(1, 2).reshape(b, t, c * f)
-        # now x: (N, (T-7)//2, out_width * layer3_channels))
-
-        x = self.out(x)
-        # Now x is of shape (N, (T-7)//2, odim)
-        x = self.out_whiten(x)
-        x = self.out_norm(x)
-        x = self.dropout(x)
-
-        if torch.jit.is_scripting() or torch.jit.is_tracing():
-            x_lens = (x_lens - 7) // 2
-        else:
-            with warnings.catch_warnings():
-                warnings.simplefilter("ignore")
-                x_lens = (x_lens - 7) // 2
-        assert x.size(1) == x_lens.max().item(), (x.size(1), x_lens.max())
-
-        return x, x_lens
-
-    def streaming_forward(
-        self,
-        x: torch.Tensor,
-        x_lens: torch.Tensor,
-        cached_left_pad: Tensor,
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        """Subsample one chunk of input features in streaming mode.
-
-        Unlike ``forward``, ``out_whiten`` and ``dropout`` are not applied here.
-
-        Args:
-          x:
-            Input features. Its shape is (N, T, idim).
-          x_lens:
-            A tensor of shape (batch_size,) containing the number of frames
-            of each sequence in ``x``.
-          cached_left_pad:
-            Cached left padding passed to ``self.convnext.streaming_forward``.
-
-        Returns:
-          - the subsampled features, a tensor of shape (N, (T-7)//2 - 3, odim)
-          - the output lengths, of shape (batch_size,), computed as
-            ``(x_lens - 7) // 2 - 3``
-          - the updated cache returned by ``self.convnext.streaming_forward``
-        """
-        # The length arithmetic below assumes that the ConvNeXt module needs
-        # 3 frames of right padding after subsampling.
-        assert self.convnext.padding[0] == 3
-
-        # Add a channel axis: (N, T, idim) -> (N, 1, T, idim), i.e., (N, C, H, W).
-        # After self.conv the time axis has length T' = (T-7)//2.
-        feats = self.conv(x.unsqueeze(1))
-
-        # After the streaming ConvNeXt the time axis has length T' = (T-7)//2 - 3.
-        feats, cached_left_pad = self.convnext.streaming_forward(
-            feats, cached_left_pad=cached_left_pad
-        )
-
-        # feats is now 4-D: (N, channels, T', freqs)
-        batch, channels, frames, freqs = feats.size()
-
-        # Fold the channel and frequency axes into a single feature axis:
-        # (N, channels, T', freqs) -> (N, T', channels * freqs)
-        feats = feats.transpose(1, 2).reshape(batch, frames, channels * freqs)
-
-        # Output projection to (N, T', odim), followed by out_norm.
-        feats = self.out_norm(self.out(feats))
-
-        if torch.jit.is_scripting() or torch.jit.is_tracing():
-            out_lens = (x_lens - 7) // 2 - 3
-        else:
-            # In eager mode, any warning emitted while computing the output
-            # lengths is suppressed.
-            with warnings.catch_warnings():
-                warnings.simplefilter("ignore")
-                out_lens = (x_lens - 7) // 2 - 3
-
-        # The longest sequence must span the whole subsampled time axis.
-        assert feats.size(1) == out_lens.max().item(), (feats.shape, out_lens.max())
-
-        return feats, out_lens, cached_left_pad
-
-    @torch.jit.export
-    def get_init_states(
-        self,
-        batch_size: int = 1,
-        device: torch.device = torch.device("cpu"),
-    ) -> Tensor:
-        """Get initial states for Conv2dSubsampling module.
-
-        The state is the cached left padding for the ConvNeXt module: an
-        all-zero tensor of shape (batch_size, num_channels, left_pad, num_freqs).
-
-        Args:
-          batch_size:
-            Size of the leading (batch) dimension of the returned cache.
-          device:
-            The device on which the cache is allocated.
-
-        Returns:
-          A zero tensor of shape
-          (batch_size, self.layer3_channels, self.convnext.padding[0],
-          self.out_width) located on ``device``.
-        """
-        left_pad = self.convnext.padding[0]
-        freq = self.out_width
-        channels = self.layer3_channels
-        # Allocate directly on the target device rather than creating the
-        # tensor on the default device and copying it over with `.to(device)`.
-        cached_embed_left_pad = torch.zeros(
-            batch_size, channels, left_pad, freq, device=device
-        )
-
-        return cached_embed_left_pad