Update decoding script for gigaspeech and remove duplicate files.

2022-05-13 12:55:06 +08:00 · 2022-05-13 12:55:06 +08:00 · ba99671fba
commit ba99671fba
parent 48a6a9a549
11 changed files with 37 additions and 3251 deletions
--- a/README.md
+++ b/README.md
@@ -12,13 +12,14 @@ for installation.
 Please refer to <https://icefall.readthedocs.io/en/latest/recipes/index.html>
 for more information.

-We provide four recipes at present:
+We provide 6 recipes at present:

  - [yesno][yesno]
  - [LibriSpeech][librispeech]
  - [Aishell][aishell]
  - [TIMIT][timit]
  - [TED-LIUM3][tedlium3]
+  - [GigaSpeech][gigaspeech]

 ### yesno

@@ -197,6 +198,23 @@ The best WER using modified beam search with beam size 4 is:

 We provide a Colab notebook to run a pre-trained Pruned Transducer Stateless model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1je_1zGrOkGVVd4WLzgkXRHxl-I27yWtz?usp=sharing)

+### GigaSpeech
+
+#### Conformer CTC
+
+|     |  Dev  | Test  |
+|-----|-------|-------|
+| WER | 10.47 | 10.58 |
+
+#### Pruned stateless RNN-T
+
+|                      |  Dev  | Test  |
+|----------------------|-------|-------|
+|    greedy search     | 10.59 | 10.87 |
+|   fast beam search   | 10.56 | 10.80 |
+| modified beam search | 10.52 | 10.62 |
+
+
 ## Deployment with C++

 Once you have trained a model in icefall, you may want to deploy it with C++,
@@ -225,4 +243,5 @@ Please see: [![Open In Colab](https://colab.research.google.com/assets/colab-bad
 [aishell]: egs/aishell/ASR
 [timit]: egs/timit/ASR
 [tedlium3]: egs/tedlium3/ASR
+[gigaspeech]: egs/gigaspeech/ASR
 [k2]: https://github.com/k2-fsa/k2
--- a/egs/gigaspeech/ASR/RESULTS.md
+++ b/egs/gigaspeech/ASR/RESULTS.md
@@ -5,10 +5,10 @@

 #### Conformer encoder + embedding decoder

-Conformer encoder + non-recurrent decoder. The encoder is a 
-reworked version of the conformer encoder, with many changes. The 
-decoder contains only an embedding layer, a Conv1d (with kernel 
-size 2) and a linear layer (to transform tensor dim). k2 pruned 
+Conformer encoder + non-recurrent decoder. The encoder is a
+reworked version of the conformer encoder, with many changes. The
+decoder contains only an embedding layer, a Conv1d (with kernel
+size 2) and a linear layer (to transform tensor dim). k2 pruned
 RNN-T loss is used.

 Results are:
--- a/egs/gigaspeech/ASR/pruned_transducer_stateless2/beam_search.py
+++ b/egs/gigaspeech/ASR/pruned_transducer_stateless2/beam_search.py
@@ -1,766 +0,0 @@
-# Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from dataclasses import dataclass
-from typing import Dict, List, Optional
-
-import k2
-import torch
-from model import Transducer
-
-from icefall.decode import one_best_decoding
-from icefall.utils import get_texts
-
-
-def fast_beam_search(
-    model: Transducer,
-    decoding_graph: k2.Fsa,
-    encoder_out: torch.Tensor,
-    encoder_out_lens: torch.Tensor,
-    beam: float,
-    max_states: int,
-    max_contexts: int,
-) -> List[List[int]]:
-    """It limits the maximum number of symbols per frame to 1.
-
-    Args:
-      model:
-        An instance of `Transducer`.
-      decoding_graph:
-        Decoding graph used for decoding, may be a TrivialGraph or a HLG.
-      encoder_out:
-        A tensor of shape (N, T, C) from the encoder.
-      encoder_out_lens:
-        A tensor of shape (N,) containing the number of frames in `encoder_out`
-        before padding.
-      beam:
-        Beam value, similar to the beam used in Kaldi..
-      max_states:
-        Max states per stream per frame.
-      max_contexts:
-        Max contexts pre stream per frame.
-    Returns:
-      Return the decoded result.
-    """
-    assert encoder_out.ndim == 3
-
-    context_size = model.decoder.context_size
-    vocab_size = model.decoder.vocab_size
-
-    B, T, C = encoder_out.shape
-
-    config = k2.RnntDecodingConfig(
-        vocab_size=vocab_size,
-        decoder_history_len=context_size,
-        beam=beam,
-        max_contexts=max_contexts,
-        max_states=max_states,
-    )
-    individual_streams = []
-    for i in range(B):
-        individual_streams.append(k2.RnntDecodingStream(decoding_graph))
-    decoding_streams = k2.RnntDecodingStreams(individual_streams, config)
-
-    encoder_out = model.joiner.encoder_proj(encoder_out)
-
-    for t in range(T):
-        # shape is a RaggedShape of shape (B, context)
-        # contexts is a Tensor of shape (shape.NumElements(), context_size)
-        shape, contexts = decoding_streams.get_contexts()
-        # `nn.Embedding()` in torch below v1.7.1 supports only torch.int64
-        contexts = contexts.to(torch.int64)
-        # decoder_out is of shape (shape.NumElements(), 1, decoder_out_dim)
-        decoder_out = model.decoder(contexts, need_pad=False)
-        decoder_out = model.joiner.decoder_proj(decoder_out)
-        # current_encoder_out is of shape
-        # (shape.NumElements(), 1, joiner_dim)
-        # fmt: off
-        current_encoder_out = torch.index_select(
-            encoder_out[:, t:t + 1, :], 0, shape.row_ids(1).to(torch.int64)
-        )
-        # fmt: on
-        logits = model.joiner(
-            current_encoder_out.unsqueeze(2),
-            decoder_out.unsqueeze(1),
-            project_input=False,
-        )
-        logits = logits.squeeze(1).squeeze(1)
-        log_probs = logits.log_softmax(dim=-1)
-        decoding_streams.advance(log_probs)
-    decoding_streams.terminate_and_flush_to_streams()
-    lattice = decoding_streams.format_output(encoder_out_lens.tolist())
-
-    best_path = one_best_decoding(lattice)
-    hyps = get_texts(best_path)
-    return hyps
-
-
-def greedy_search(
-    model: Transducer, encoder_out: torch.Tensor, max_sym_per_frame: int
-) -> List[int]:
-    """Greedy search for a single utterance.
-    Args:
-      model:
-        An instance of `Transducer`.
-      encoder_out:
-        A tensor of shape (N, T, C) from the encoder. Support only N==1 for now.
-      max_sym_per_frame:
-        Maximum number of symbols per frame. If it is set to 0, the WER
-        would be 100%.
-    Returns:
-      Return the decoded result.
-    """
-    assert encoder_out.ndim == 3
-
-    # support only batch_size == 1 for now
-    assert encoder_out.size(0) == 1, encoder_out.size(0)
-
-    blank_id = model.decoder.blank_id
-    context_size = model.decoder.context_size
-
-    device = model.device
-
-    decoder_input = torch.tensor(
-        [blank_id] * context_size, device=device, dtype=torch.int64
-    ).reshape(1, context_size)
-
-    decoder_out = model.decoder(decoder_input, need_pad=False)
-    decoder_out = model.joiner.decoder_proj(decoder_out)
-
-    encoder_out = model.joiner.encoder_proj(encoder_out)
-
-    T = encoder_out.size(1)
-    t = 0
-    hyp = [blank_id] * context_size
-
-    # Maximum symbols per utterance.
-    max_sym_per_utt = 1000
-
-    # symbols per frame
-    sym_per_frame = 0
-
-    # symbols per utterance decoded so far
-    sym_per_utt = 0
-
-    while t < T and sym_per_utt < max_sym_per_utt:
-        if sym_per_frame >= max_sym_per_frame:
-            sym_per_frame = 0
-            t += 1
-            continue
-
-        # fmt: off
-        current_encoder_out = encoder_out[:, t:t+1, :].unsqueeze(2)
-        # fmt: on
-        logits = model.joiner(
-            current_encoder_out, decoder_out.unsqueeze(1), project_input=False
-        )
-        # logits is (1, 1, 1, vocab_size)
-
-        y = logits.argmax().item()
-        if y != blank_id:
-            hyp.append(y)
-            decoder_input = torch.tensor(
-                [hyp[-context_size:]], device=device
-            ).reshape(1, context_size)
-
-            decoder_out = model.decoder(decoder_input, need_pad=False)
-            decoder_out = model.joiner.decoder_proj(decoder_out)
-
-            sym_per_utt += 1
-            sym_per_frame += 1
-        else:
-            sym_per_frame = 0
-            t += 1
-    hyp = hyp[context_size:]  # remove blanks
-
-    return hyp
-
-
-def greedy_search_batch(
-    model: Transducer, encoder_out: torch.Tensor
-) -> List[List[int]]:
-    """Greedy search in batch mode. It hardcodes --max-sym-per-frame=1.
-    Args:
-      model:
-        The transducer model.
-      encoder_out:
-        Output from the encoder. Its shape is (N, T, C), where N >= 1.
-    Returns:
-      Return a list-of-list of token IDs containing the decoded results.
-      len(ans) equals to encoder_out.size(0).
-    """
-    assert encoder_out.ndim == 3
-    assert encoder_out.size(0) >= 1, encoder_out.size(0)
-
-    device = model.device
-
-    batch_size = encoder_out.size(0)
-    T = encoder_out.size(1)
-
-    blank_id = model.decoder.blank_id
-    context_size = model.decoder.context_size
-
-    hyps = [[blank_id] * context_size for _ in range(batch_size)]
-
-    decoder_input = torch.tensor(
-        hyps,
-        device=device,
-        dtype=torch.int64,
-    )  # (batch_size, context_size)
-
-    decoder_out = model.decoder(decoder_input, need_pad=False)
-    decoder_out = model.joiner.decoder_proj(decoder_out)
-    encoder_out = model.joiner.encoder_proj(encoder_out)
-
-    # decoder_out: (batch_size, 1, decoder_out_dim)
-    for t in range(T):
-        current_encoder_out = encoder_out[:, t : t + 1, :].unsqueeze(2)  # noqa
-        # current_encoder_out's shape: (batch_size, 1, 1, encoder_out_dim)
-        logits = model.joiner(
-            current_encoder_out, decoder_out.unsqueeze(1), project_input=False
-        )
-        # logits'shape (batch_size, 1, 1, vocab_size)
-
-        logits = logits.squeeze(1).squeeze(1)  # (batch_size, vocab_size)
-        assert logits.ndim == 2, logits.shape
-        y = logits.argmax(dim=1).tolist()
-        emitted = False
-        for i, v in enumerate(y):
-            if v != blank_id:
-                hyps[i].append(v)
-                emitted = True
-        if emitted:
-            # update decoder output
-            decoder_input = [h[-context_size:] for h in hyps]
-            decoder_input = torch.tensor(
-                decoder_input,
-                device=device,
-                dtype=torch.int64,
-            )
-            decoder_out = model.decoder(decoder_input, need_pad=False)
-            decoder_out = model.joiner.decoder_proj(decoder_out)
-
-    ans = [h[context_size:] for h in hyps]
-    return ans
-
-
-@dataclass
-class Hypothesis:
-    # The predicted tokens so far.
-    # Newly predicted tokens are appended to `ys`.
-    ys: List[int]
-
-    # The log prob of ys.
-    # It contains only one entry.
-    log_prob: torch.Tensor
-
-    @property
-    def key(self) -> str:
-        """Return a string representation of self.ys"""
-        return "_".join(map(str, self.ys))
-
-
-class HypothesisList(object):
-    def __init__(self, data: Optional[Dict[str, Hypothesis]] = None) -> None:
-        """
-        Args:
-          data:
-            A dict of Hypotheses. Its key is its `value.key`.
-        """
-        if data is None:
-            self._data = {}
-        else:
-            self._data = data
-
-    @property
-    def data(self) -> Dict[str, Hypothesis]:
-        return self._data
-
-    def add(self, hyp: Hypothesis) -> None:
-        """Add a Hypothesis to `self`.
-
-        If `hyp` already exists in `self`, its probability is updated using
-        `log-sum-exp` with the existed one.
-
-        Args:
-          hyp:
-            The hypothesis to be added.
-        """
-        key = hyp.key
-        if key in self:
-            old_hyp = self._data[key]  # shallow copy
-            torch.logaddexp(
-                old_hyp.log_prob, hyp.log_prob, out=old_hyp.log_prob
-            )
-        else:
-            self._data[key] = hyp
-
-    def get_most_probable(self, length_norm: bool = False) -> Hypothesis:
-        """Get the most probable hypothesis, i.e., the one with
-        the largest `log_prob`.
-
-        Args:
-          length_norm:
-            If True, the `log_prob` of a hypothesis is normalized by the
-            number of tokens in it.
-        Returns:
-          Return the hypothesis that has the largest `log_prob`.
-        """
-        if length_norm:
-            return max(
-                self._data.values(), key=lambda hyp: hyp.log_prob / len(hyp.ys)
-            )
-        else:
-            return max(self._data.values(), key=lambda hyp: hyp.log_prob)
-
-    def remove(self, hyp: Hypothesis) -> None:
-        """Remove a given hypothesis.
-
-        Caution:
-          `self` is modified **in-place**.
-
-        Args:
-          hyp:
-            The hypothesis to be removed from `self`.
-            Note: It must be contained in `self`. Otherwise,
-            an exception is raised.
-        """
-        key = hyp.key
-        assert key in self, f"{key} does not exist"
-        del self._data[key]
-
-    def filter(self, threshold: torch.Tensor) -> "HypothesisList":
-        """Remove all Hypotheses whose log_prob is less than threshold.
-
-        Caution:
-          `self` is not modified. Instead, a new HypothesisList is returned.
-
-        Returns:
-          Return a new HypothesisList containing all hypotheses from `self`
-          with `log_prob` being greater than the given `threshold`.
-        """
-        ans = HypothesisList()
-        for _, hyp in self._data.items():
-            if hyp.log_prob > threshold:
-                ans.add(hyp)  # shallow copy
-        return ans
-
-    def topk(self, k: int) -> "HypothesisList":
-        """Return the top-k hypothesis."""
-        hyps = list(self._data.items())
-
-        hyps = sorted(hyps, key=lambda h: h[1].log_prob, reverse=True)[:k]
-
-        ans = HypothesisList(dict(hyps))
-        return ans
-
-    def __contains__(self, key: str):
-        return key in self._data
-
-    def __iter__(self):
-        return iter(self._data.values())
-
-    def __len__(self) -> int:
-        return len(self._data)
-
-    def __str__(self) -> str:
-        s = []
-        for key in self:
-            s.append(key)
-        return ", ".join(s)
-
-
-def _get_hyps_shape(hyps: List[HypothesisList]) -> k2.RaggedShape:
-    """Return a ragged shape with axes [utt][num_hyps].
-
-    Args:
-      hyps:
-        len(hyps) == batch_size. It contains the current hypothesis for
-        each utterance in the batch.
-    Returns:
-      Return a ragged shape with 2 axes [utt][num_hyps]. Note that
-      the shape is on CPU.
-    """
-    num_hyps = [len(h) for h in hyps]
-
-    # torch.cumsum() is inclusive sum, so we put a 0 at the beginning
-    # to get exclusive sum later.
-    num_hyps.insert(0, 0)
-
-    num_hyps = torch.tensor(num_hyps)
-    row_splits = torch.cumsum(num_hyps, dim=0, dtype=torch.int32)
-    ans = k2.ragged.create_ragged_shape2(
-        row_splits=row_splits, cached_tot_size=row_splits[-1].item()
-    )
-    return ans
-
-
-def modified_beam_search(
-    model: Transducer,
-    encoder_out: torch.Tensor,
-    beam: int = 4,
-) -> List[List[int]]:
-    """Beam search in batch mode with --max-sym-per-frame=1 being hardcoded.
-
-    Args:
-      model:
-        The transducer model.
-      encoder_out:
-        Output from the encoder. Its shape is (N, T, C).
-      beam:
-        Number of active paths during the beam search.
-    Returns:
-      Return a list-of-list of token IDs. ans[i] is the decoding results
-      for the i-th utterance.
-    """
-    assert encoder_out.ndim == 3, encoder_out.shape
-
-    batch_size = encoder_out.size(0)
-    T = encoder_out.size(1)
-
-    blank_id = model.decoder.blank_id
-    context_size = model.decoder.context_size
-    device = model.device
-    B = [HypothesisList() for _ in range(batch_size)]
-    for i in range(batch_size):
-        B[i].add(
-            Hypothesis(
-                ys=[blank_id] * context_size,
-                log_prob=torch.zeros(1, dtype=torch.float32, device=device),
-            )
-        )
-
-    encoder_out = model.joiner.encoder_proj(encoder_out)
-
-    for t in range(T):
-        current_encoder_out = encoder_out[:, t : t + 1, :].unsqueeze(2)  # noqa
-        # current_encoder_out's shape is (batch_size, 1, 1, encoder_out_dim)
-
-        hyps_shape = _get_hyps_shape(B).to(device)
-
-        A = [list(b) for b in B]
-        B = [HypothesisList() for _ in range(batch_size)]
-
-        ys_log_probs = torch.cat(
-            [hyp.log_prob.reshape(1, 1) for hyps in A for hyp in hyps]
-        )  # (num_hyps, 1)
-
-        decoder_input = torch.tensor(
-            [hyp.ys[-context_size:] for hyps in A for hyp in hyps],
-            device=device,
-            dtype=torch.int64,
-        )  # (num_hyps, context_size)
-
-        decoder_out = model.decoder(decoder_input, need_pad=False).unsqueeze(1)
-        decoder_out = model.joiner.decoder_proj(decoder_out)
-        # decoder_out is of shape (num_hyps, 1, 1, joiner_dim)
-
-        # Note: For torch 1.7.1 and below, it requires a torch.int64 tensor
-        # as index, so we use `to(torch.int64)` below.
-        current_encoder_out = torch.index_select(
-            current_encoder_out,
-            dim=0,
-            index=hyps_shape.row_ids(1).to(torch.int64),
-        )  # (num_hyps, 1, 1, encoder_out_dim)
-
-        logits = model.joiner(
-            current_encoder_out,
-            decoder_out,
-            project_input=False,
-        )  # (num_hyps, 1, 1, vocab_size)
-
-        logits = logits.squeeze(1).squeeze(1)  # (num_hyps, vocab_size)
-
-        log_probs = logits.log_softmax(dim=-1)  # (num_hyps, vocab_size)
-
-        log_probs.add_(ys_log_probs)
-
-        vocab_size = log_probs.size(-1)
-
-        log_probs = log_probs.reshape(-1)
-
-        row_splits = hyps_shape.row_splits(1) * vocab_size
-        log_probs_shape = k2.ragged.create_ragged_shape2(
-            row_splits=row_splits, cached_tot_size=log_probs.numel()
-        )
-        ragged_log_probs = k2.RaggedTensor(
-            shape=log_probs_shape, value=log_probs
-        )
-
-        for i in range(batch_size):
-            topk_log_probs, topk_indexes = ragged_log_probs[i].topk(beam)
-
-            topk_hyp_indexes = (topk_indexes // vocab_size).tolist()
-            topk_token_indexes = (topk_indexes % vocab_size).tolist()
-
-            for k in range(len(topk_hyp_indexes)):
-                hyp_idx = topk_hyp_indexes[k]
-                hyp = A[i][hyp_idx]
-
-                new_ys = hyp.ys[:]
-                new_token = topk_token_indexes[k]
-                if new_token != blank_id:
-                    new_ys.append(new_token)
-
-                new_log_prob = topk_log_probs[k]
-                new_hyp = Hypothesis(ys=new_ys, log_prob=new_log_prob)
-                B[i].add(new_hyp)
-
-    best_hyps = [b.get_most_probable(length_norm=True) for b in B]
-    ans = [h.ys[context_size:] for h in best_hyps]
-
-    return ans
-
-
-def _deprecated_modified_beam_search(
-    model: Transducer,
-    encoder_out: torch.Tensor,
-    beam: int = 4,
-) -> List[int]:
-    """It limits the maximum number of symbols per frame to 1.
-
-    It decodes only one utterance at a time. We keep it only for reference.
-    The function :func:`modified_beam_search` should be preferred as it
-    supports batch decoding.
-
-
-    Args:
-      model:
-        An instance of `Transducer`.
-      encoder_out:
-        A tensor of shape (N, T, C) from the encoder. Support only N==1 for now.
-      beam:
-        Beam size.
-    Returns:
-      Return the decoded result.
-    """
-
-    assert encoder_out.ndim == 3
-
-    # support only batch_size == 1 for now
-    assert encoder_out.size(0) == 1, encoder_out.size(0)
-    blank_id = model.decoder.blank_id
-    context_size = model.decoder.context_size
-
-    device = model.device
-
-    T = encoder_out.size(1)
-
-    B = HypothesisList()
-    B.add(
-        Hypothesis(
-            ys=[blank_id] * context_size,
-            log_prob=torch.zeros(1, dtype=torch.float32, device=device),
-        )
-    )
-    encoder_out = model.joiner.encoder_proj(encoder_out)
-
-    for t in range(T):
-        # fmt: off
-        current_encoder_out = encoder_out[:, t:t+1, :].unsqueeze(2)
-        # current_encoder_out is of shape (1, 1, 1, encoder_out_dim)
-        # fmt: on
-        A = list(B)
-        B = HypothesisList()
-
-        ys_log_probs = torch.cat([hyp.log_prob.reshape(1, 1) for hyp in A])
-        # ys_log_probs is of shape (num_hyps, 1)
-
-        decoder_input = torch.tensor(
-            [hyp.ys[-context_size:] for hyp in A],
-            device=device,
-            dtype=torch.int64,
-        )
-        # decoder_input is of shape (num_hyps, context_size)
-
-        decoder_out = model.decoder(decoder_input, need_pad=False).unsqueeze(1)
-        decoder_out = model.joiner.decoder_proj(decoder_out)
-        # decoder_output is of shape (num_hyps, 1, 1, joiner_dim)
-
-        current_encoder_out = current_encoder_out.expand(
-            decoder_out.size(0), 1, 1, -1
-        )  # (num_hyps, 1, 1, encoder_out_dim)
-
-        logits = model.joiner(
-            current_encoder_out,
-            decoder_out,
-            project_input=False,
-        )
-        # logits is of shape (num_hyps, 1, 1, vocab_size)
-        logits = logits.squeeze(1).squeeze(1)
-
-        # now logits is of shape (num_hyps, vocab_size)
-        log_probs = logits.log_softmax(dim=-1)
-
-        log_probs.add_(ys_log_probs)
-
-        log_probs = log_probs.reshape(-1)
-        topk_log_probs, topk_indexes = log_probs.topk(beam)
-
-        # topk_hyp_indexes are indexes into `A`
-        topk_hyp_indexes = topk_indexes // logits.size(-1)
-        topk_token_indexes = topk_indexes % logits.size(-1)
-
-        topk_hyp_indexes = topk_hyp_indexes.tolist()
-        topk_token_indexes = topk_token_indexes.tolist()
-
-        for i in range(len(topk_hyp_indexes)):
-            hyp = A[topk_hyp_indexes[i]]
-            new_ys = hyp.ys[:]
-            new_token = topk_token_indexes[i]
-            if new_token != blank_id:
-                new_ys.append(new_token)
-            new_log_prob = topk_log_probs[i]
-            new_hyp = Hypothesis(ys=new_ys, log_prob=new_log_prob)
-            B.add(new_hyp)
-
-    best_hyp = B.get_most_probable(length_norm=True)
-    ys = best_hyp.ys[context_size:]  # [context_size:] to remove blanks
-
-    return ys
-
-
-def beam_search(
-    model: Transducer,
-    encoder_out: torch.Tensor,
-    beam: int = 4,
-) -> List[int]:
-    """
-    It implements Algorithm 1 in https://arxiv.org/pdf/1211.3711.pdf
-
-    espnet/nets/beam_search_transducer.py#L247 is used as a reference.
-
-    Args:
-      model:
-        An instance of `Transducer`.
-      encoder_out:
-        A tensor of shape (N, T, C) from the encoder. Support only N==1 for now.
-      beam:
-        Beam size.
-    Returns:
-      Return the decoded result.
-    """
-    assert encoder_out.ndim == 3
-
-    # support only batch_size == 1 for now
-    assert encoder_out.size(0) == 1, encoder_out.size(0)
-    blank_id = model.decoder.blank_id
-    context_size = model.decoder.context_size
-
-    device = model.device
-
-    decoder_input = torch.tensor(
-        [blank_id] * context_size,
-        device=device,
-        dtype=torch.int64,
-    ).reshape(1, context_size)
-
-    decoder_out = model.decoder(decoder_input, need_pad=False)
-    decoder_out = model.joiner.decoder_proj(decoder_out)
-
-    encoder_out = model.joiner.encoder_proj(encoder_out)
-
-    T = encoder_out.size(1)
-    t = 0
-
-    B = HypothesisList()
-    B.add(Hypothesis(ys=[blank_id] * context_size, log_prob=0.0))
-
-    max_sym_per_utt = 20000
-
-    sym_per_utt = 0
-
-    decoder_cache: Dict[str, torch.Tensor] = {}
-
-    while t < T and sym_per_utt < max_sym_per_utt:
-        # fmt: off
-        current_encoder_out = encoder_out[:, t:t+1, :].unsqueeze(2)
-        # fmt: on
-        A = B
-        B = HypothesisList()
-
-        joint_cache: Dict[str, torch.Tensor] = {}
-
-        # TODO(fangjun): Implement prefix search to update the `log_prob`
-        # of hypotheses in A
-
-        while True:
-            y_star = A.get_most_probable()
-            A.remove(y_star)
-
-            cached_key = y_star.key
-
-            if cached_key not in decoder_cache:
-                decoder_input = torch.tensor(
-                    [y_star.ys[-context_size:]],
-                    device=device,
-                    dtype=torch.int64,
-                ).reshape(1, context_size)
-
-                decoder_out = model.decoder(decoder_input, need_pad=False)
-                decoder_out = model.joiner.decoder_proj(decoder_out)
-                decoder_cache[cached_key] = decoder_out
-            else:
-                decoder_out = decoder_cache[cached_key]
-
-            cached_key += f"-t-{t}"
-            if cached_key not in joint_cache:
-                logits = model.joiner(
-                    current_encoder_out,
-                    decoder_out.unsqueeze(1),
-                    project_input=False,
-                )
-
-                # TODO(fangjun): Scale the blank posterior
-                log_prob = logits.log_softmax(dim=-1)
-                # log_prob is (1, 1, 1, vocab_size)
-                log_prob = log_prob.squeeze()
-                # Now log_prob is (vocab_size,)
-                joint_cache[cached_key] = log_prob
-            else:
-                log_prob = joint_cache[cached_key]
-
-            # First, process the blank symbol
-            skip_log_prob = log_prob[blank_id]
-            new_y_star_log_prob = y_star.log_prob + skip_log_prob
-
-            # ys[:] returns a copy of ys
-            B.add(Hypothesis(ys=y_star.ys[:], log_prob=new_y_star_log_prob))
-
-            # Second, process other non-blank labels
-            values, indices = log_prob.topk(beam + 1)
-            for i, v in zip(indices.tolist(), values.tolist()):
-                if i == blank_id:
-                    continue
-                new_ys = y_star.ys + [i]
-                new_log_prob = y_star.log_prob + v
-                A.add(Hypothesis(ys=new_ys, log_prob=new_log_prob))
-
-            # Check whether B contains more than "beam" elements more probable
-            # than the most probable in A
-            A_most_probable = A.get_most_probable()
-
-            kept_B = B.filter(A_most_probable.log_prob)
-
-            if len(kept_B) >= beam:
-                B = kept_B.topk(beam)
-                break
-
-        t += 1
-
-    best_hyp = B.get_most_probable(length_norm=True)
-    ys = best_hyp.ys[context_size:]  # [context_size:] to remove blanks
-    return ys
--- a/egs/gigaspeech/ASR/pruned_transducer_stateless2/beam_search.py
+++ b/egs/gigaspeech/ASR/pruned_transducer_stateless2/beam_search.py
@@ -0,0 +1 @@
+../../../librispeech/ASR/pruned_transducer_stateless2/beam_search.py
--- a/egs/gigaspeech/ASR/pruned_transducer_stateless2/conformer.py
+++ b/egs/gigaspeech/ASR/pruned_transducer_stateless2/conformer.py
--- a/egs/gigaspeech/ASR/pruned_transducer_stateless2/conformer.py
+++ b/egs/gigaspeech/ASR/pruned_transducer_stateless2/conformer.py
@@ -0,0 +1 @@
+../../../librispeech/ASR/pruned_transducer_stateless2/conformer.py
--- a/egs/gigaspeech/ASR/pruned_transducer_stateless2/decode.py
+++ b/egs/gigaspeech/ASR/pruned_transducer_stateless2/decode.py
@@ -69,7 +69,7 @@ import torch.nn as nn
 from asr_datamodule import GigaSpeechAsrDataModule
 from beam_search import (
    beam_search,
-    fast_beam_search,
+    fast_beam_search_one_best,
    greedy_search,
    greedy_search_batch,
    modified_beam_search,
@@ -263,7 +263,7 @@ def decode_one_batch(
    hyps = []

    if params.decoding_method == "fast_beam_search":
-        hyp_tokens = fast_beam_search(
+        hyp_tokens = fast_beam_search_one_best(
            model=model,
            decoding_graph=decoding_graph,
            encoder_out=encoder_out,
@@ -281,6 +281,7 @@ def decode_one_batch(
        hyp_tokens = greedy_search_batch(
            model=model,
            encoder_out=encoder_out,
+            encoder_out_lens=encoder_out_lens,
        )
        for hyp in sp.decode(hyp_tokens):
            hyps.append(hyp.split())
@@ -288,6 +289,7 @@ def decode_one_batch(
        hyp_tokens = modified_beam_search(
            model=model,
            encoder_out=encoder_out,
+            encoder_out_lens=encoder_out_lens,
            beam=params.beam_size,
        )
        for hyp in sp.decode(hyp_tokens):
@@ -366,7 +368,7 @@ def decode_dataset(
    except TypeError:
        num_batches = "?"

-    log_interval = 100
+    log_interval = 20

    results = defaultdict(list)
    for batch_idx, batch in enumerate(dl):
--- a/egs/gigaspeech/ASR/pruned_transducer_stateless2/decoder.py
+++ b/egs/gigaspeech/ASR/pruned_transducer_stateless2/decoder.py
@@ -1,103 +0,0 @@
-# Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from scaling import ScaledConv1d, ScaledEmbedding
-
-
-class Decoder(nn.Module):
-    """This class modifies the stateless decoder from the following paper:
-
-        RNN-transducer with stateless prediction network
-        https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=9054419
-
-    It removes the recurrent connection from the decoder, i.e., the prediction
-    network. Different from the above paper, it adds an extra Conv1d
-    right after the embedding layer.
-
-    TODO: Implement https://arxiv.org/pdf/2109.07513.pdf
-    """
-
-    def __init__(
-        self,
-        vocab_size: int,
-        decoder_dim: int,
-        blank_id: int,
-        context_size: int,
-    ):
-        """
-        Args:
-          vocab_size:
-            Number of tokens of the modeling unit including blank.
-          decoder_dim:
-            Dimension of the input embedding, and of the decoder output.
-          blank_id:
-            The ID of the blank symbol.
-          context_size:
-            Number of previous words to use to predict the next word.
-            1 means bigram; 2 means trigram. n means (n+1)-gram.
-        """
-        super().__init__()
-
-        self.embedding = ScaledEmbedding(
-            num_embeddings=vocab_size,
-            embedding_dim=decoder_dim,
-            padding_idx=blank_id,
-        )
-        self.blank_id = blank_id
-
-        assert context_size >= 1, context_size
-        self.context_size = context_size
-        self.vocab_size = vocab_size
-        if context_size > 1:
-            self.conv = ScaledConv1d(
-                in_channels=decoder_dim,
-                out_channels=decoder_dim,
-                kernel_size=context_size,
-                padding=0,
-                groups=decoder_dim,
-                bias=False,
-            )
-
-    def forward(self, y: torch.Tensor, need_pad: bool = True) -> torch.Tensor:
-        """
-        Args:
-          y:
-            A 2-D tensor of shape (N, U).
-          need_pad:
-            True to left pad the input. Should be True during training.
-            False to not pad the input. Should be False during inference.
-        Returns:
-          Return a tensor of shape (N, U, decoder_dim).
-        """
-        y = y.to(torch.int64)
-        embedding_out = self.embedding(y)
-        if self.context_size > 1:
-            embedding_out = embedding_out.permute(0, 2, 1)
-            if need_pad is True:
-                embedding_out = F.pad(
-                    embedding_out, pad=(self.context_size - 1, 0)
-                )
-            else:
-                # During inference time, there is no need to do extra padding
-                # as we only need one output
-                assert embedding_out.size(-1) == self.context_size
-            embedding_out = self.conv(embedding_out)
-            embedding_out = embedding_out.permute(0, 2, 1)
-        embedding_out = F.relu(embedding_out)
-        return embedding_out
--- a/egs/gigaspeech/ASR/pruned_transducer_stateless2/decoder.py
+++ b/egs/gigaspeech/ASR/pruned_transducer_stateless2/decoder.py
@ -0,0 +1 @@
+../../../librispeech/ASR/pruned_transducer_stateless2/decoder.py
--- a/egs/gigaspeech/ASR/pruned_transducer_stateless2/encoder_interface.py
+++ b/egs/gigaspeech/ASR/pruned_transducer_stateless2/encoder_interface.py
@ -1,43 +0,0 @@
-# Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from typing import Tuple
-
-import torch
-import torch.nn as nn
-
-
-class EncoderInterface(nn.Module):
-    def forward(
-        self, x: torch.Tensor, x_lens: torch.Tensor
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        """
-        Args:
-          x:
-            A tensor of shape (batch_size, input_seq_len, num_features)
-            containing the input features.
-          x_lens:
-            A tensor of shape (batch_size,) containing the number of frames
-            in `x` before padding.
-        Returns:
-          Return a tuple containing two tensors:
-            - encoder_out, a tensor of (batch_size, out_seq_len, output_dim)
-              containing unnormalized probabilities, i.e., the output of a
-              linear layer.
-            - encoder_out_lens, a tensor of shape (batch_size,) containing
-              the number of frames in `encoder_out` before padding.
-        """
-        raise NotImplementedError("Please implement it in a subclass")
--- a/egs/gigaspeech/ASR/pruned_transducer_stateless2/encoder_interface.py
+++ b/egs/gigaspeech/ASR/pruned_transducer_stateless2/encoder_interface.py
@ -0,0 +1 @@
+../../../librispeech/ASR/pruned_transducer_stateless2/encoder_interface.py
--- a/egs/gigaspeech/ASR/pruned_transducer_stateless2/joiner.py
+++ b/egs/gigaspeech/ASR/pruned_transducer_stateless2/joiner.py
@ -1,67 +0,0 @@
-# Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import torch.nn as nn
-from scaling import ScaledLinear
-
-
-class Joiner(nn.Module):
-    def __init__(
-        self,
-        encoder_dim: int,
-        decoder_dim: int,
-        joiner_dim: int,
-        vocab_size: int,
-    ):
-        super().__init__()
-
-        self.encoder_proj = ScaledLinear(encoder_dim, joiner_dim)
-        self.decoder_proj = ScaledLinear(decoder_dim, joiner_dim)
-        self.output_linear = ScaledLinear(joiner_dim, vocab_size)
-
-    def forward(
-        self,
-        encoder_out: torch.Tensor,
-        decoder_out: torch.Tensor,
-        project_input: bool = True,
-    ) -> torch.Tensor:
-        """
-        Args:
-          encoder_out:
-            Output from the encoder. Its shape is (N, T, s_range, C).
-          decoder_out:
-            Output from the decoder. Its shape is (N, T, s_range, C).
-           project_input:
-            If true, apply input projections encoder_proj and decoder_proj.
-            If this is false, it is the user's responsibility to do this
-            manually.
-        Returns:
-          Return a tensor of shape (N, T, s_range, C).
-        """
-        assert encoder_out.ndim == decoder_out.ndim == 4
-        assert encoder_out.shape[:-1] == decoder_out.shape[:-1]
-
-        if project_input:
-            logit = self.encoder_proj(encoder_out) + self.decoder_proj(
-                decoder_out
-            )
-        else:
-            logit = encoder_out + decoder_out
-
-        logit = self.output_linear(torch.tanh(logit))
-
-        return logit
--- a/egs/gigaspeech/ASR/pruned_transducer_stateless2/joiner.py
+++ b/egs/gigaspeech/ASR/pruned_transducer_stateless2/joiner.py
@ -0,0 +1 @@
+../../../librispeech/ASR/pruned_transducer_stateless2/joiner.py
--- a/egs/gigaspeech/ASR/pruned_transducer_stateless2/model.py
+++ b/egs/gigaspeech/ASR/pruned_transducer_stateless2/model.py
@ -1,193 +0,0 @@
-# Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang, Wei Kang)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import k2
-import torch
-import torch.nn as nn
-from encoder_interface import EncoderInterface
-from scaling import ScaledLinear
-
-from icefall.utils import add_sos
-
-
-class Transducer(nn.Module):
-    """It implements https://arxiv.org/pdf/1211.3711.pdf
-    "Sequence Transduction with Recurrent Neural Networks"
-    """
-
-    def __init__(
-        self,
-        encoder: EncoderInterface,
-        decoder: nn.Module,
-        joiner: nn.Module,
-        encoder_dim: int,
-        decoder_dim: int,
-        joiner_dim: int,
-        vocab_size: int,
-    ):
-        """
-        Args:
-          encoder:
-            It is the transcription network in the paper. Its accepts
-            two inputs: `x` of (N, T, encoder_dim) and `x_lens` of shape (N,).
-            It returns two tensors: `logits` of shape (N, T, encoder_dm) and
-            `logit_lens` of shape (N,).
-          decoder:
-            It is the prediction network in the paper. Its input shape
-            is (N, U) and its output shape is (N, U, decoder_dim).
-            It should contain one attribute: `blank_id`.
-          joiner:
-            It has two inputs with shapes: (N, T, encoder_dim) and (N, U, decoder_dim).
-            Its output shape is (N, T, U, vocab_size). Note that its output contains
-            unnormalized probs, i.e., not processed by log-softmax.
-        """
-        super().__init__()
-        assert isinstance(encoder, EncoderInterface), type(encoder)
-        assert hasattr(decoder, "blank_id")
-
-        self.encoder = encoder
-        self.decoder = decoder
-        self.joiner = joiner
-
-        self.simple_am_proj = ScaledLinear(
-            encoder_dim, vocab_size, initial_speed=0.5
-        )
-        self.simple_lm_proj = ScaledLinear(decoder_dim, vocab_size)
-
-    def forward(
-        self,
-        x: torch.Tensor,
-        x_lens: torch.Tensor,
-        y: k2.RaggedTensor,
-        prune_range: int = 5,
-        am_scale: float = 0.0,
-        lm_scale: float = 0.0,
-        warmup: float = 1.0,
-    ) -> torch.Tensor:
-        """
-        Args:
-          x:
-            A 3-D tensor of shape (N, T, C).
-          x_lens:
-            A 1-D tensor of shape (N,). It contains the number of frames in `x`
-            before padding.
-          y:
-            A ragged tensor with 2 axes [utt][label]. It contains labels of each
-            utterance.
-          prune_range:
-            The prune range for rnnt loss, it means how many symbols(context)
-            we are considering for each frame to compute the loss.
-          am_scale:
-            The scale to smooth the loss with am (output of encoder network)
-            part
-          lm_scale:
-            The scale to smooth the loss with lm (output of predictor network)
-            part
-          warmup:
-            A value warmup >= 0 that determines which modules are active, values
-            warmup > 1 "are fully warmed up" and all modules will be active.
-        Returns:
-          Return the transducer loss.
-
-        Note:
-           Regarding am_scale & lm_scale, it will make the loss-function one of
-           the form:
-              lm_scale * lm_probs + am_scale * am_probs +
-              (1-lm_scale-am_scale) * combined_probs
-        """
-        assert x.ndim == 3, x.shape
-        assert x_lens.ndim == 1, x_lens.shape
-        assert y.num_axes == 2, y.num_axes
-
-        assert x.size(0) == x_lens.size(0) == y.dim0
-
-        encoder_out, x_lens = self.encoder(x, x_lens, warmup=warmup)
-        assert torch.all(x_lens > 0)
-
-        # Now for the decoder, i.e., the prediction network
-        row_splits = y.shape.row_splits(1)
-        y_lens = row_splits[1:] - row_splits[:-1]
-
-        blank_id = self.decoder.blank_id
-        sos_y = add_sos(y, sos_id=blank_id)
-
-        # sos_y_padded: [B, S + 1], start with SOS.
-        sos_y_padded = sos_y.pad(mode="constant", padding_value=blank_id)
-
-        # decoder_out: [B, S + 1, decoder_dim]
-        decoder_out = self.decoder(sos_y_padded)
-
-        # Note: y does not start with SOS
-        # y_padded : [B, S]
-        y_padded = y.pad(mode="constant", padding_value=0)
-
-        y_padded = y_padded.to(torch.int64)
-        boundary = torch.zeros(
-            (x.size(0), 4), dtype=torch.int64, device=x.device
-        )
-        boundary[:, 2] = y_lens
-        boundary[:, 3] = x_lens
-
-        lm = self.simple_lm_proj(decoder_out)
-        am = self.simple_am_proj(encoder_out)
-
-        with torch.cuda.amp.autocast(enabled=False):
-            simple_loss, (px_grad, py_grad) = k2.rnnt_loss_smoothed(
-                lm=lm.float(),
-                am=am.float(),
-                symbols=y_padded,
-                termination_symbol=blank_id,
-                lm_only_scale=lm_scale,
-                am_only_scale=am_scale,
-                boundary=boundary,
-                reduction="sum",
-                return_grad=True,
-            )
-
-        # ranges : [B, T, prune_range]
-        ranges = k2.get_rnnt_prune_ranges(
-            px_grad=px_grad,
-            py_grad=py_grad,
-            boundary=boundary,
-            s_range=prune_range,
-        )
-
-        # am_pruned : [B, T, prune_range, encoder_dim]
-        # lm_pruned : [B, T, prune_range, decoder_dim]
-        am_pruned, lm_pruned = k2.do_rnnt_pruning(
-            am=self.joiner.encoder_proj(encoder_out),
-            lm=self.joiner.decoder_proj(decoder_out),
-            ranges=ranges,
-        )
-
-        # logits : [B, T, prune_range, vocab_size]
-
-        # project_input=False since we applied the decoder's input projections
-        # prior to do_rnnt_pruning (this is an optimization for speed).
-        logits = self.joiner(am_pruned, lm_pruned, project_input=False)
-
-        with torch.cuda.amp.autocast(enabled=False):
-            pruned_loss = k2.rnnt_loss_pruned(
-                logits=logits.float(),
-                symbols=y_padded,
-                ranges=ranges,
-                termination_symbol=blank_id,
-                boundary=boundary,
-                reduction="sum",
-            )
-
-        return (simple_loss, pruned_loss)
--- a/egs/gigaspeech/ASR/pruned_transducer_stateless2/model.py
+++ b/egs/gigaspeech/ASR/pruned_transducer_stateless2/model.py
@ -0,0 +1 @@
+../../../librispeech/ASR/pruned_transducer_stateless2/model.py
--- a/egs/gigaspeech/ASR/pruned_transducer_stateless2/optim.py
+++ b/egs/gigaspeech/ASR/pruned_transducer_stateless2/optim.py
@ -1,331 +0,0 @@
-# Copyright      2022  Xiaomi Corp.        (authors: Daniel Povey)
-#
-# See ../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from typing import List, Optional, Union
-
-import torch
-from torch.optim import Optimizer
-
-
-class Eve(Optimizer):
-    r"""
-    Implements Eve algorithm.  This is a modified version of AdamW with a special
-    way of setting the weight-decay / shrinkage-factor, which is designed to make the
-    rms of the parameters approach a particular target_rms (default: 0.1).  This is
-    for use with networks with 'scaled' versions of modules (see scaling.py), which
-    will be close to invariant to the absolute scale on the parameter matrix.
-
-    The original Adam algorithm was proposed in `Adam: A Method for Stochastic Optimization`_.
-    The AdamW variant was proposed in `Decoupled Weight Decay Regularization`_.
-    Eve is unpublished so far.
-
-    Arguments:
-        params (iterable): iterable of parameters to optimize or dicts defining
-            parameter groups
-        lr (float, optional): learning rate (default: 1e-3)
-        betas (Tuple[float, float], optional): coefficients used for computing
-            running averages of gradient and its square (default: (0.9, 0.999))
-        eps (float, optional): term added to the denominator to improve
-            numerical stability (default: 1e-8)
-        weight_decay (float, optional): weight decay coefficient (default: 3e-4;
-            this value means that the weight would decay significantly after
-            about 3k minibatches.  Is not multiplied by learning rate, but
-            is conditional on RMS-value of parameter being > target_rms.
-        target_rms (float, optional): target root-mean-square value of
-           parameters, if they fall below this we will stop applying weight decay.
-
-
-    .. _Adam\: A Method for Stochastic Optimization:
-        https://arxiv.org/abs/1412.6980
-    .. _Decoupled Weight Decay Regularization:
-        https://arxiv.org/abs/1711.05101
-    .. _On the Convergence of Adam and Beyond:
-        https://openreview.net/forum?id=ryQu7f-RZ
-    """
-
-    def __init__(
-        self,
-        params,
-        lr=1e-3,
-        betas=(0.9, 0.98),
-        eps=1e-8,
-        weight_decay=1e-3,
-        target_rms=0.1,
-    ):
-
-        if not 0.0 <= lr:
-            raise ValueError("Invalid learning rate: {}".format(lr))
-        if not 0.0 <= eps:
-            raise ValueError("Invalid epsilon value: {}".format(eps))
-        if not 0.0 <= betas[0] < 1.0:
-            raise ValueError(
-                "Invalid beta parameter at index 0: {}".format(betas[0])
-            )
-        if not 0.0 <= betas[1] < 1.0:
-            raise ValueError(
-                "Invalid beta parameter at index 1: {}".format(betas[1])
-            )
-        if not 0 <= weight_decay <= 0.1:
-            raise ValueError(
-                "Invalid weight_decay value: {}".format(weight_decay)
-            )
-        if not 0 < target_rms <= 10.0:
-            raise ValueError("Invalid target_rms value: {}".format(target_rms))
-        defaults = dict(
-            lr=lr,
-            betas=betas,
-            eps=eps,
-            weight_decay=weight_decay,
-            target_rms=target_rms,
-        )
-        super(Eve, self).__init__(params, defaults)
-
-    def __setstate__(self, state):
-        super(Eve, self).__setstate__(state)
-
-    @torch.no_grad()
-    def step(self, closure=None):
-        """Performs a single optimization step.
-
-        Arguments:
-            closure (callable, optional): A closure that reevaluates the model
-                and returns the loss.
-        """
-        loss = None
-        if closure is not None:
-            with torch.enable_grad():
-                loss = closure()
-
-        for group in self.param_groups:
-            for p in group["params"]:
-                if p.grad is None:
-                    continue
-
-                # Perform optimization step
-                grad = p.grad
-                if grad.is_sparse:
-                    raise RuntimeError(
-                        "AdamW does not support sparse gradients"
-                    )
-
-                state = self.state[p]
-
-                # State initialization
-                if len(state) == 0:
-                    state["step"] = 0
-                    # Exponential moving average of gradient values
-                    state["exp_avg"] = torch.zeros_like(
-                        p, memory_format=torch.preserve_format
-                    )
-                    # Exponential moving average of squared gradient values
-                    state["exp_avg_sq"] = torch.zeros_like(
-                        p, memory_format=torch.preserve_format
-                    )
-
-                exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"]
-
-                beta1, beta2 = group["betas"]
-
-                state["step"] += 1
-                bias_correction1 = 1 - beta1 ** state["step"]
-                bias_correction2 = 1 - beta2 ** state["step"]
-
-                # Decay the first and second moment running average coefficient
-                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
-                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
-                denom = (exp_avg_sq.sqrt() * (bias_correction2 ** -0.5)).add_(
-                    group["eps"]
-                )
-
-                step_size = group["lr"] / bias_correction1
-                target_rms = group["target_rms"]
-                weight_decay = group["weight_decay"]
-
-                if p.numel() > 1:
-                    # avoid applying this weight-decay on "scaling factors"
-                    # (which are scalar).
-                    is_above_target_rms = p.norm() > (
-                        target_rms * (p.numel() ** 0.5)
-                    )
-                    p.mul_(1 - (weight_decay * is_above_target_rms))
-                p.addcdiv_(exp_avg, denom, value=-step_size)
-
-        return loss
-
-
-class LRScheduler(object):
-    """
-    Base-class for learning rate schedulers where the learning-rate depends on both the
-    batch and the epoch.
-    """
-
-    def __init__(self, optimizer: Optimizer, verbose: bool = False):
-        # Attach optimizer
-        if not isinstance(optimizer, Optimizer):
-            raise TypeError(
-                "{} is not an Optimizer".format(type(optimizer).__name__)
-            )
-        self.optimizer = optimizer
-        self.verbose = verbose
-
-        for group in optimizer.param_groups:
-            group.setdefault("initial_lr", group["lr"])
-
-        self.base_lrs = [
-            group["initial_lr"] for group in optimizer.param_groups
-        ]
-
-        self.epoch = 0
-        self.batch = 0
-
-    def state_dict(self):
-        """Returns the state of the scheduler as a :class:`dict`.
-
-        It contains an entry for every variable in self.__dict__ which
-        is not the optimizer.
-        """
-        return {
-            "base_lrs": self.base_lrs,
-            "epoch": self.epoch,
-            "batch": self.batch,
-        }
-
-    def load_state_dict(self, state_dict):
-        """Loads the schedulers state.
-
-        Args:
-            state_dict (dict): scheduler state. Should be an object returned
-                from a call to :meth:`state_dict`.
-        """
-        self.__dict__.update(state_dict)
-
-    def get_last_lr(self) -> List[float]:
-        """Return last computed learning rate by current scheduler.  Will be a list of float."""
-        return self._last_lr
-
-    def get_lr(self):
-        # Compute list of learning rates from self.epoch and self.batch and
-        # self.base_lrs; this must be overloaded by the user.
-        # e.g. return [some_formula(self.batch, self.epoch, base_lr) for base_lr in self.base_lrs ]
-        raise NotImplementedError
-
-    def step_batch(self, batch: Optional[int] = None) -> None:
-        # Step the batch index, or just set it.  If `batch` is specified, it
-        # must be the batch index from the start of training, i.e. summed over
-        # all epochs.
-        # You can call this in any order; if you don't provide 'batch', it should
-        # of course be called once per batch.
-        if batch is not None:
-            self.batch = batch
-        else:
-            self.batch = self.batch + 1
-        self._set_lrs()
-
-    def step_epoch(self, epoch: Optional[int] = None):
-        # Step the epoch index, or just set it.  If you provide the 'epoch' arg,
-        # you should call this at the start of the epoch; if you don't provide the 'epoch'
-        # arg, you should call it at the end of the epoch.
-        if epoch is not None:
-            self.epoch = epoch
-        else:
-            self.epoch = self.epoch + 1
-        self._set_lrs()
-
-    def _set_lrs(self):
-        values = self.get_lr()
-        assert len(values) == len(self.optimizer.param_groups)
-
-        for i, data in enumerate(zip(self.optimizer.param_groups, values)):
-            param_group, lr = data
-            param_group["lr"] = lr
-            self.print_lr(self.verbose, i, lr)
-        self._last_lr = [group["lr"] for group in self.optimizer.param_groups]
-
-    def print_lr(self, is_verbose, group, lr):
-        """Display the current learning rate."""
-        if is_verbose:
-            print(
-                f"Epoch={self.epoch}, batch={self.batch}: adjusting learning rate"
-                f" of group {group} to {lr:.4e}."
-            )
-
-
-class Eden(LRScheduler):
-    """
-    Eden scheduler.
-     lr = initial_lr * (((batch**2 + lr_batches**2) / lr_batches**2) ** -0.25 *
-                       (((epoch**2 + lr_epochs**2) / lr_epochs**2) ** -0.25))
-
-     E.g. suggest initial-lr = 0.003 (passed to optimizer).
-
-    Args:
-        optimizer: the optimizer to change the learning rates on
-        lr_batches: the number of batches after which we start significantly
-              decreasing the learning rate, suggest 5000.
-        lr_epochs: the number of epochs after which we start significantly
-              decreasing the learning rate, suggest 6 if you plan to do e.g.
-              20 to 40 epochs, but may need smaller number if dataset is huge
-              and you will do few epochs.
-    """
-
-    def __init__(
-        self,
-        optimizer: Optimizer,
-        lr_batches: Union[int, float],
-        lr_epochs: Union[int, float],
-        verbose: bool = False,
-    ):
-        super(Eden, self).__init__(optimizer, verbose)
-        self.lr_batches = lr_batches
-        self.lr_epochs = lr_epochs
-
-    def get_lr(self):
-        factor = (
-            (self.batch ** 2 + self.lr_batches ** 2) / self.lr_batches ** 2
-        ) ** -0.25 * (
-            ((self.epoch ** 2 + self.lr_epochs ** 2) / self.lr_epochs ** 2)
-            ** -0.25
-        )
-        return [x * factor for x in self.base_lrs]
-
-
-def _test_eden():
-    m = torch.nn.Linear(100, 100)
-    optim = Eve(m.parameters(), lr=0.003)
-
-    scheduler = Eden(optim, lr_batches=30, lr_epochs=2, verbose=True)
-
-    for epoch in range(10):
-        scheduler.step_epoch(epoch)  # sets epoch to `epoch`
-
-        for step in range(20):
-            x = torch.randn(200, 100).detach()
-            x.requires_grad = True
-            y = m(x)
-            dy = torch.randn(200, 100).detach()
-            f = (y * dy).sum()
-            f.backward()
-
-            optim.step()
-            scheduler.step_batch()
-            optim.zero_grad()
-    print("last lr = ", scheduler.get_last_lr())
-    print("state dict = ", scheduler.state_dict())
-
-
-if __name__ == "__main__":
-    _test_eden()
--- a/egs/gigaspeech/ASR/pruned_transducer_stateless2/optim.py
+++ b/egs/gigaspeech/ASR/pruned_transducer_stateless2/optim.py
@ -0,0 +1 @@
+../../../librispeech/ASR/pruned_transducer_stateless2/optim.py
--- a/egs/gigaspeech/ASR/pruned_transducer_stateless2/scaling.py
+++ b/egs/gigaspeech/ASR/pruned_transducer_stateless2/scaling.py
@ -1,702 +0,0 @@
-# Copyright    2022  Xiaomi Corp.        (authors: Daniel Povey)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import collections
-from itertools import repeat
-from typing import Optional, Tuple
-
-import torch
-import torch.nn as nn
-from torch import Tensor
-
-
-def _ntuple(n):
-    def parse(x):
-        if isinstance(x, collections.Iterable):
-            return x
-        return tuple(repeat(x, n))
-
-    return parse
-
-
-_single = _ntuple(1)
-_pair = _ntuple(2)
-
-
-class ActivationBalancerFunction(torch.autograd.Function):
-    @staticmethod
-    def forward(
-        ctx,
-        x: Tensor,
-        channel_dim: int,
-        min_positive: float,  # e.g. 0.05
-        max_positive: float,  # e.g. 0.95
-        max_factor: float,  # e.g. 0.01
-        min_abs: float,  # e.g. 0.2
-        max_abs: float,  # e.g. 100.0
-    ) -> Tensor:
-        if x.requires_grad:
-            if channel_dim < 0:
-                channel_dim += x.ndim
-            sum_dims = [d for d in range(x.ndim) if d != channel_dim]
-            xgt0 = x > 0
-            proportion_positive = torch.mean(
-                xgt0.to(x.dtype), dim=sum_dims, keepdim=True
-            )
-            factor1 = (
-                (min_positive - proportion_positive).relu()
-                * (max_factor / min_positive)
-                if min_positive != 0.0
-                else 0.0
-            )
-            factor2 = (
-                (proportion_positive - max_positive).relu()
-                * (max_factor / (max_positive - 1.0))
-                if max_positive != 1.0
-                else 0.0
-            )
-            factor = factor1 + factor2
-            if isinstance(factor, float):
-                factor = torch.zeros_like(proportion_positive)
-
-            mean_abs = torch.mean(x.abs(), dim=sum_dims, keepdim=True)
-            below_threshold = mean_abs < min_abs
-            above_threshold = mean_abs > max_abs
-
-            ctx.save_for_backward(
-                factor, xgt0, below_threshold, above_threshold
-            )
-            ctx.max_factor = max_factor
-            ctx.sum_dims = sum_dims
-        return x
-
-    @staticmethod
-    def backward(
-        ctx, x_grad: Tensor
-    ) -> Tuple[Tensor, None, None, None, None, None, None]:
-        factor, xgt0, below_threshold, above_threshold = ctx.saved_tensors
-        dtype = x_grad.dtype
-        scale_factor = (
-            (below_threshold.to(dtype) - above_threshold.to(dtype))
-            * (xgt0.to(dtype) - 0.5)
-            * (ctx.max_factor * 2.0)
-        )
-
-        neg_delta_grad = x_grad.abs() * (factor + scale_factor)
-        return x_grad - neg_delta_grad, None, None, None, None, None, None
-
-
-class BasicNorm(torch.nn.Module):
-    """
-    This is intended to be a simpler, and hopefully cheaper, replacement for
-    LayerNorm.  The observation this is based on, is that Transformer-type
-    networks, especially with pre-norm, sometimes seem to set one of the
-    feature dimensions to a large constant value (e.g. 50), which "defeats"
-    the LayerNorm because the output magnitude is then not strongly dependent
-    on the other (useful) features.  Presumably the weight and bias of the
-    LayerNorm are required to allow it to do this.
-
-    So the idea is to introduce this large constant value as an explicit
-    parameter, that takes the role of the "eps" in LayerNorm, so the network
-    doesn't have to do this trick.  We make the "eps" learnable.
-
-    Args:
-       num_channels: the number of channels, e.g. 512.
-      channel_dim: the axis/dimension corresponding to the channel,
-        interprted as an offset from the input's ndim if negative.
-        shis is NOT the num_channels; it should typically be one of
-        {-2, -1, 0, 1, 2, 3}.
-       eps: the initial "epsilon" that we add as ballast in:
-             scale = ((input_vec**2).mean() + epsilon)**-0.5
-          Note: our epsilon is actually large, but we keep the name
-          to indicate the connection with conventional LayerNorm.
-       learn_eps: if true, we learn epsilon; if false, we keep it
-         at the initial value.
-    """
-
-    def __init__(
-        self,
-        num_channels: int,
-        channel_dim: int = -1,  # CAUTION: see documentation.
-        eps: float = 0.25,
-        learn_eps: bool = True,
-    ) -> None:
-        super(BasicNorm, self).__init__()
-        self.num_channels = num_channels
-        self.channel_dim = channel_dim
-        if learn_eps:
-            self.eps = nn.Parameter(torch.tensor(eps).log().detach())
-        else:
-            self.register_buffer("eps", torch.tensor(eps).log().detach())
-
-    def forward(self, x: Tensor) -> Tensor:
-        assert x.shape[self.channel_dim] == self.num_channels
-        scales = (
-            torch.mean(x ** 2, dim=self.channel_dim, keepdim=True)
-            + self.eps.exp()
-        ) ** -0.5
-        return x * scales
-
-
-class ScaledLinear(nn.Linear):
-    """
-    A variant of nn.Linear whose parameters are rescaled by learnable
-    log-domain scalars before they are used:
-         weight = self.weight * self.weight_scale.exp()
-         bias = self.bias * self.bias_scale.exp()
-
-    Args:
-        Accepts the standard args and kwargs that nn.Linear accepts,
-        e.g. in_features, out_features, bias=False.
-
-        initial_scale: multiplies the initial magnitude of the module's
-           output (its log is the starting point of weight_scale and
-           bias_scale).  Re-initializing the parameters yourself is
-           another way to get a similar effect.
-        initial_speed: controls how fast the parameters learn near the
-           start of training.  Use a value below 1 if you suspect this
-           module contributes to instability early in training, or a
-           value above 1 to make it train faster initially.  Must be
-           greater than 0.
-           Note: regardless of this option, it's best to use schedulers
-           like Noam that have a warm-up period.
-    """
-
-    def __init__(
-        self,
-        *args,
-        initial_scale: float = 1.0,
-        initial_speed: float = 1.0,
-        **kwargs
-    ):
-        super().__init__(*args, **kwargs)
-        log_scale = torch.tensor(initial_scale).log()
-        self.weight_scale = nn.Parameter(log_scale.clone().detach())
-        if self.bias is None:
-            self.register_parameter("bias_scale", None)
-        else:
-            self.bias_scale = nn.Parameter(log_scale.clone().detach())
-
-        # Replaces the values drawn by nn.Linear's own reset_parameters().
-        self._reset_parameters(initial_speed)
-
-    def _reset_parameters(self, initial_speed: float):
-        """Re-draw the weight, zero the bias and adjust weight_scale.
-
-        The weight is uniform with standard deviation 0.1 / initial_speed;
-        weight_scale is shifted so that the effective weight has standard
-        deviation initial_scale / sqrt(fan_in).
-        """
-        std = 0.1 / initial_speed
-        bound = (3 ** 0.5) * std  # uniform(-bound, bound) has stddev `std`
-        nn.init.uniform_(self.weight, -bound, bound)
-        if self.bias is not None:
-            nn.init.constant_(self.bias, 0.0)
-        fan_in = self.weight.shape[1] * self.weight[0][0].numel()
-        target_std = fan_in ** -0.5  # 1/sqrt(fan_in)
-        with torch.no_grad():
-            self.weight_scale += torch.tensor(target_std / std).log()
-
-    def get_weight(self):
-        """Return the weight multiplied by its learned scale."""
-        return self.weight * self.weight_scale.exp()
-
-    def get_bias(self):
-        """Return the scaled bias, or None if the layer has no bias."""
-        if self.bias is None:
-            return None
-        return self.bias * self.bias_scale.exp()
-
-    def forward(self, input: Tensor) -> Tensor:
-        weight, bias = self.get_weight(), self.get_bias()
-        return torch.nn.functional.linear(input, weight, bias)
-
-
-class ScaledConv1d(nn.Conv1d):
-    """nn.Conv1d with learnable log-domain scales on its weight and bias.
-
-    The convolution uses weight * weight_scale.exp() and, when a bias is
-    present, bias * bias_scale.exp().  `initial_scale` and `initial_speed`
-    have the same meaning as in ScaledLinear (see its docs).
-    """
-
-    def __init__(
-        self,
-        *args,
-        initial_scale: float = 1.0,
-        initial_speed: float = 1.0,
-        **kwargs
-    ):
-        super().__init__(*args, **kwargs)
-        log_scale = torch.tensor(initial_scale).log()
-        self.weight_scale = nn.Parameter(log_scale.clone().detach())
-        if self.bias is None:
-            self.register_parameter("bias_scale", None)
-        else:
-            self.bias_scale = nn.Parameter(log_scale.clone().detach())
-        # Replaces the values drawn by the base class's reset_parameters().
-        self._reset_parameters(initial_speed)
-
-    def _reset_parameters(self, initial_speed: float):
-        """Re-draw the weight, zero the bias and adjust weight_scale.
-
-        The weight is uniform with standard deviation 0.1 / initial_speed;
-        weight_scale is shifted so that the effective weight has standard
-        deviation initial_scale / sqrt(fan_in).
-        """
-        std = 0.1 / initial_speed
-        bound = (3 ** 0.5) * std  # uniform(-bound, bound) has stddev `std`
-        nn.init.uniform_(self.weight, -bound, bound)
-        if self.bias is not None:
-            nn.init.constant_(self.bias, 0.0)
-        # Inputs feeding one output value: weight.shape[1] * kernel size.
-        fan_in = self.weight.shape[1] * self.weight[0][0].numel()
-        target_std = fan_in ** -0.5  # 1/sqrt(fan_in)
-        with torch.no_grad():
-            self.weight_scale += torch.tensor(target_std / std).log()
-
-    def get_weight(self):
-        """Return the kernel multiplied by its learned scale."""
-        return self.weight * self.weight_scale.exp()
-
-    def get_bias(self):
-        """Return the scaled bias, or None if the layer has no bias."""
-        if self.bias is None:
-            return None
-        return self.bias * self.bias_scale.exp()
-
-    def forward(self, input: Tensor) -> Tensor:
-        F = torch.nn.functional
-        weight, bias = self.get_weight(), self.get_bias()
-        if self.padding_mode == "zeros":
-            return F.conv1d(
-                input,
-                weight,
-                bias,
-                self.stride,
-                self.padding,
-                self.dilation,
-                self.groups,
-            )
-        # Non-zero padding modes are applied explicitly, so the convolution
-        # itself then runs with no padding.
-        padded = F.pad(
-            input,
-            self._reversed_padding_repeated_twice,
-            mode=self.padding_mode,
-        )
-        return F.conv1d(
-            padded,
-            weight,
-            bias,
-            self.stride,
-            _single(0),
-            self.dilation,
-            self.groups,
-        )
-
-
-class ScaledConv2d(nn.Conv2d):
-    """nn.Conv2d with learnable log-domain scales on its weight and bias.
-
-    The convolution uses weight * weight_scale.exp() and, when a bias is
-    present, bias * bias_scale.exp().  `initial_scale` and `initial_speed`
-    have the same meaning as in ScaledLinear (see its docs).
-    """
-
-    def __init__(
-        self,
-        *args,
-        initial_scale: float = 1.0,
-        initial_speed: float = 1.0,
-        **kwargs
-    ):
-        super().__init__(*args, **kwargs)
-        log_scale = torch.tensor(initial_scale).log()
-        self.weight_scale = nn.Parameter(log_scale.clone().detach())
-        if self.bias is None:
-            self.register_parameter("bias_scale", None)
-        else:
-            self.bias_scale = nn.Parameter(log_scale.clone().detach())
-        # Replaces the values drawn by the base class's reset_parameters().
-        self._reset_parameters(initial_speed)
-
-    def _reset_parameters(self, initial_speed: float):
-        """Re-draw the weight, zero the bias and adjust weight_scale.
-
-        The weight is uniform with standard deviation 0.1 / initial_speed;
-        weight_scale is shifted so that the effective weight has standard
-        deviation initial_scale / sqrt(fan_in).
-        """
-        std = 0.1 / initial_speed
-        bound = (3 ** 0.5) * std  # uniform(-bound, bound) has stddev `std`
-        nn.init.uniform_(self.weight, -bound, bound)
-        if self.bias is not None:
-            nn.init.constant_(self.bias, 0.0)
-        # Inputs feeding one output value: weight.shape[1] * kernel area.
-        fan_in = self.weight.shape[1] * self.weight[0][0].numel()
-        target_std = fan_in ** -0.5  # 1/sqrt(fan_in)
-        with torch.no_grad():
-            self.weight_scale += torch.tensor(target_std / std).log()
-
-    def get_weight(self):
-        """Return the kernel multiplied by its learned scale."""
-        return self.weight * self.weight_scale.exp()
-
-    def get_bias(self):
-        """Return the scaled bias, or None if the layer has no bias."""
-        if self.bias is None:
-            return None
-        return self.bias * self.bias_scale.exp()
-
-    def _conv_forward(self, input, weight):
-        """Convolve `input` with the given (already scaled) `weight`,
-        using the scaled bias from get_bias()."""
-        F = torch.nn.functional
-        bias = self.get_bias()
-        if self.padding_mode == "zeros":
-            return F.conv2d(
-                input,
-                weight,
-                bias,
-                self.stride,
-                self.padding,
-                self.dilation,
-                self.groups,
-            )
-        # Non-zero padding modes are applied explicitly, so the convolution
-        # itself then runs with no padding.
-        padded = F.pad(
-            input,
-            self._reversed_padding_repeated_twice,
-            mode=self.padding_mode,
-        )
-        return F.conv2d(
-            padded,
-            weight,
-            bias,
-            self.stride,
-            _pair(0),
-            self.dilation,
-            self.groups,
-        )
-
-    def forward(self, input: Tensor) -> Tensor:
-        scaled_weight = self.get_weight()
-        return self._conv_forward(input, scaled_weight)
-
-
-class ActivationBalancer(torch.nn.Module):
-    """
-    Modifies the backpropped derivatives of a function to try to encourage,
-    for each channel, that the proportion of positive values and the average
-    absolute value stay within configured limits.  The module only stores
-    the configuration; the computation is delegated to
-    ActivationBalancerFunction.
-
-
-    Args:
-           channel_dim: the dimension/axis corresponding to the channel, e.g.
-               -1, 0, 1, 2; will be interpreted as an offset from x.ndim if
-               negative.
-           min_positive: the minimum, per channel, of the proportion of the
-               time that (x > 0), below which we start to modify the
-               derivatives.
-           max_positive: the maximum, per channel, of the proportion of the
-               time that (x > 0), above which we start to modify the
-               derivatives.
-           max_factor: the maximum factor by which we modify the derivatives
-               for either the sign constraint or the magnitude constraint;
-               e.g. with max_factor=0.02, the derivatives would be
-               multiplied by values in the range [0.98..1.02].
-           min_abs:  the minimum average-absolute-value per channel, which
-               we allow, before we start to modify the derivatives to
-               prevent this.
-           max_abs:  the maximum average-absolute-value per channel, which
-               we allow, before we start to modify the derivatives to
-               prevent this.
-    """
-
-    def __init__(
-        self,
-        channel_dim: int,
-        min_positive: float = 0.05,
-        max_positive: float = 0.95,
-        max_factor: float = 0.01,
-        min_abs: float = 0.2,
-        max_abs: float = 100.0,
-    ):
-        super().__init__()
-        self.channel_dim = channel_dim
-        # Limits on the per-channel proportion of positive values.
-        self.min_positive = min_positive
-        self.max_positive = max_positive
-        self.max_factor = max_factor
-        # Limits on the per-channel average absolute value.
-        self.min_abs = min_abs
-        self.max_abs = max_abs
-
-    def forward(self, x: Tensor) -> Tensor:
-        """Pass `x` and the stored limits to ActivationBalancerFunction."""
-        balancer_args = (
-            self.channel_dim,
-            self.min_positive,
-            self.max_positive,
-            self.max_factor,
-            self.min_abs,
-            self.max_abs,
-        )
-        return ActivationBalancerFunction.apply(x, *balancer_args)
-
-
-class DoubleSwishFunction(torch.autograd.Function):
-    """
-      double_swish(x) = x * torch.sigmoid(x-1)
-    This is a definition, originally motivated by its close numerical
-    similarity to swish(swish(x)), where swish(x) =  x * sigmoid(x).
-
-    Memory-efficient derivative computation:
-     Write double_swish(x) = x * s(x), with s(x) = torch.sigmoid(x-1).
-     By the product rule,
-        double_swish'(x) = x * s'(x) + s(x),
-     and since s'(x) = s(x) * (1-s(x)),
-        double_swish'(x) = x * s(x) * (1-s(x)) + s(x)
-                         = double_swish(x) * (1-s(x)) + s(x)
-     ... so backward only needs s(x) and the output, not x itself.
-    """
-
-    @staticmethod
-    def forward(ctx, x: Tensor) -> Tensor:
-        inp = x.detach()
-        gate = torch.sigmoid(inp - 1.0)
-        out = inp * gate
-        # Everything backward() needs can be rebuilt from these two tensors.
-        ctx.save_for_backward(gate, out)
-        return out
-
-    @staticmethod
-    def backward(ctx, y_grad: Tensor) -> Tensor:
-        gate, out = ctx.saved_tensors
-        deriv = out * (1 - gate) + gate
-        return deriv * y_grad
-
-
-class DoubleSwish(torch.nn.Module):
-    """Module wrapper that applies DoubleSwishFunction to its input."""
-
-    def forward(self, x: Tensor) -> Tensor:
-        """Return the double-swish activation of `x`, i.e. x * sigmoid(x-1),
-        which closely approximates Swish(Swish(x)).
-        """
-        activation = DoubleSwishFunction.apply
-        return activation(x)
-
-
-class ScaledEmbedding(nn.Module):
-    r"""This is a modified version of nn.Embedding that introduces a learnable scale
-    on the parameters.  Note: due to how we initialize it, it's best used with
-    schedulers like Noam that have a warmup period.
-
-    It is a simple lookup table that stores embeddings of a fixed dictionary and size.
-    The returned embeddings are ``weight * scale.exp()``, where ``scale`` is a
-    learnable scalar kept in log space.
-
-    This module is often used to store word embeddings and retrieve them using indices.
-    The input to the module is a list of indices, and the output is the corresponding
-    word embeddings.
-
-    Args:
-        num_embeddings (int): size of the dictionary of embeddings
-        embedding_dim (int): the size of each embedding vector
-        padding_idx (int, optional): If given, pads the output with the embedding vector at :attr:`padding_idx`
-                                         (initialized to zeros) whenever it encounters the index.
-                                         A negative value counts from the end of the table.
-        scale_grad_by_freq (boolean, optional): If given, this will scale gradients by the inverse of frequency of
-                                                the words in the mini-batch. Default ``False``.
-        sparse (bool, optional): If ``True``, gradient w.r.t. :attr:`weight` matrix will be a sparse tensor.
-                                 See Notes for more details regarding sparse gradients.
-
-        initial_speed (float, optional):  This affects how fast the parameter will
-           learn near the start of training; you can set it to a value less than
-           one if you suspect that a module is contributing to instability near
-           the start of training.  Note: regardless of the use of this option,
-           it's best to use schedulers like Noam that have a warm-up period.
-           Alternatively you can set it to more than 1 if you want it to
-           initially train faster.  Must be greater than 0.
-
-        Unlike nn.Embedding, the ``max_norm`` and ``norm_type`` options are not
-        accepted: the lookup is always done without renormalization.
-
-    Attributes:
-        weight (Tensor): the learnable weights of the module of shape (num_embeddings, embedding_dim)
-                         initialized from :math:`\mathcal{N}(0, \text{std}^2)` with
-                         ``std = 0.1 / initial_speed``
-        scale (Tensor): learnable scalar log-scale, initialized to ``log(1 / std)`` so that
-                        ``weight * scale.exp()`` initially has unit standard deviation
-
-    Shape:
-        - Input: :math:`(*)`, LongTensor of arbitrary shape containing the indices to extract
-        - Output: :math:`(*, H)`, where `*` is the input shape and :math:`H=\text{embedding\_dim}`
-
-    .. note::
-        Keep in mind that only a limited number of optimizers support
-        sparse gradients: currently it's :class:`optim.SGD` (`CUDA` and `CPU`),
-        :class:`optim.SparseAdam` (`CUDA` and `CPU`) and :class:`optim.Adagrad` (`CPU`)
-
-    .. note::
-        With :attr:`padding_idx` set, the embedding vector at
-        :attr:`padding_idx` is initialized to all zeros. However, note that this
-        vector can be modified afterwards, e.g., using a customized
-        initialization method, and thus changing the vector used to pad the
-        output. The gradient for this vector from the embedding lookup
-        is always zero.
-
-    Examples::
-
-        >>> # a ScaledEmbedding module containing 10 tensors of size 3
-        >>> embedding = ScaledEmbedding(10, 3)
-        >>> # a batch of 2 samples of 4 indices each
-        >>> input = torch.LongTensor([[1,2,4,5],[4,3,2,9]])
-        >>> embedding(input).shape
-        torch.Size([2, 4, 3])
-
-        >>> # example with padding_idx: the padding row is all zeros
-        >>> embedding = ScaledEmbedding(10, 3, padding_idx=0)
-        >>> input = torch.LongTensor([[0,2,0,5]])
-        >>> bool((embedding(input)[0, 0] == 0).all())
-        True
-
-    """
-    __constants__ = [
-        "num_embeddings",
-        "embedding_dim",
-        "padding_idx",
-        "scale_grad_by_freq",
-        "sparse",
-    ]
-
-    num_embeddings: int
-    embedding_dim: int
-    padding_idx: Optional[int]
-    scale_grad_by_freq: bool
-    weight: Tensor
-    sparse: bool
-
-    def __init__(
-        self,
-        num_embeddings: int,
-        embedding_dim: int,
-        padding_idx: Optional[int] = None,
-        scale_grad_by_freq: bool = False,
-        sparse: bool = False,
-        initial_speed: float = 1.0,
-    ) -> None:
-        super(ScaledEmbedding, self).__init__()
-        self.num_embeddings = num_embeddings
-        self.embedding_dim = embedding_dim
-        if padding_idx is not None:
-            if padding_idx > 0:
-                assert (
-                    padding_idx < self.num_embeddings
-                ), "Padding_idx must be within num_embeddings"
-            elif padding_idx < 0:
-                assert (
-                    padding_idx >= -self.num_embeddings
-                ), "Padding_idx must be within num_embeddings"
-                # Normalize a negative index to its non-negative equivalent.
-                padding_idx = self.num_embeddings + padding_idx
-        self.padding_idx = padding_idx
-        self.scale_grad_by_freq = scale_grad_by_freq
-
-        self.scale = nn.Parameter(torch.zeros(()))  # see reset_parameters()
-        self.sparse = sparse
-
-        # Allocated uninitialized; reset_parameters() fills in the values.
-        self.weight = nn.Parameter(torch.Tensor(num_embeddings, embedding_dim))
-        self.reset_parameters(initial_speed)
-
-    def reset_parameters(self, initial_speed: float = 1.0) -> None:
-        """Re-initialize `weight` and `scale`.
-
-        `weight` is drawn from a normal distribution with standard deviation
-        0.1 / initial_speed and `scale` is set to log(1 / std), so their
-        product starts with unit standard deviation.  The row at
-        `padding_idx`, if any, is zeroed.
-        """
-        std = 0.1 / initial_speed
-        nn.init.normal_(self.weight, std=std)
-        nn.init.constant_(self.scale, torch.tensor(1.0 / std).log())
-
-        if self.padding_idx is not None:
-            with torch.no_grad():
-                self.weight[self.padding_idx].fill_(0)
-
-    def forward(self, input: Tensor) -> Tensor:
-        """Look up `input` indices and return the scaled embeddings."""
-        F = torch.nn.functional
-        scale = self.scale.exp()
-        # Both branches compute the same values; the multiplication by
-        # `scale` is applied to whichever tensor has fewer rows (the looked-up
-        # output, or the whole table).
-        if input.numel() < self.num_embeddings:
-            return (
-                F.embedding(
-                    input,
-                    self.weight,
-                    self.padding_idx,
-                    None,
-                    2.0,  # None, 2.0 relate to normalization
-                    self.scale_grad_by_freq,
-                    self.sparse,
-                )
-                * scale
-            )
-        else:
-            return F.embedding(
-                input,
-                self.weight * scale,
-                self.padding_idx,
-                None,
-                2.0,  # None, 2.0 relates to normalization
-                self.scale_grad_by_freq,
-                self.sparse,
-            )
-
-    def extra_repr(self) -> str:
-        """Return the summary shown inside repr() of this module.
-
-        `scale` is reported as the raw (log-domain) parameter value.
-        """
-        # `scale` is an nn.Parameter, so nn.Module keeps it in
-        # self._parameters rather than self.__dict__; formatting from
-        # __dict__ alone raised KeyError('scale') whenever the module was
-        # printed.  Pass every field explicitly instead.
-        s = "{num_embeddings}, {embedding_dim}, scale={scale}"
-        if self.padding_idx is not None:
-            s += ", padding_idx={padding_idx}"
-        if self.scale_grad_by_freq is not False:
-            s += ", scale_grad_by_freq={scale_grad_by_freq}"
-        if self.sparse is not False:
-            s += ", sparse=True"
-        return s.format(
-            num_embeddings=self.num_embeddings,
-            embedding_dim=self.embedding_dim,
-            scale=self.scale.item(),
-            padding_idx=self.padding_idx,
-            scale_grad_by_freq=self.scale_grad_by_freq,
-        )
-
-
-def _test_activation_balancer_sign():
-    """Smoke-test the sign constraint of ActivationBalancer.
-
-    Row i of the input is 1 with probability probs[i] and 0 otherwise.  A
-    random +/-1 gradient is backpropagated through the balancer and the
-    input, the output gradient and the resulting input gradient are printed
-    for manual inspection.
-    """
-    num_samples = 1000
-    probs = torch.arange(0, 1, 0.01)
-    is_positive = torch.rand(probs.numel(), num_samples) < probs.unsqueeze(-1)
-    x = (1.0 * is_positive).detach()
-    x.requires_grad = True
-    balancer = ActivationBalancer(
-        channel_dim=0,
-        min_positive=0.05,
-        max_positive=0.95,
-        max_factor=0.2,
-        min_abs=0.0,
-    )
-
-    y_grad = torch.sign(torch.randn(probs.numel(), num_samples))
-
-    balancer(x).backward(gradient=y_grad)
-    print("_test_activation_balancer_sign: x = ", x)
-    print("_test_activation_balancer_sign: y grad = ", y_grad)
-    print("_test_activation_balancer_sign: x grad = ", x.grad)
-
-
-def _test_activation_balancer_magnitude():
-    """Smoke-test the magnitude constraint of ActivationBalancer.
-
-    Row i of the input holds random-sign values of absolute value
-    magnitudes[i].  A random +/-1 gradient is backpropagated through the
-    balancer and the input, the output gradient and the resulting input
-    gradient are printed for manual inspection.
-    """
-    num_samples = 1000
-    magnitudes = torch.arange(0, 1, 0.01)
-    signs = torch.sign(torch.randn(magnitudes.numel(), num_samples))
-    x = (signs * magnitudes.unsqueeze(-1)).detach()
-    x.requires_grad = True
-    balancer = ActivationBalancer(
-        channel_dim=0,
-        min_positive=0.0,
-        max_positive=1.0,
-        max_factor=0.2,
-        min_abs=0.2,
-        max_abs=0.8,
-    )
-
-    y_grad = torch.sign(torch.randn(magnitudes.numel(), num_samples))
-
-    balancer(x).backward(gradient=y_grad)
-    print("_test_activation_balancer_magnitude: x = ", x)
-    print("_test_activation_balancer_magnitude: y grad = ", y_grad)
-    print("_test_activation_balancer_magnitude: x grad = ", x.grad)
-
-
-def _test_basic_norm():
-    """Check that BasicNorm keeps the shape and shrinks the RMS of random
-    input, but by less than a factor of two."""
-    num_channels = 128
-    norm = BasicNorm(num_channels=num_channels, channel_dim=1)
-
-    x = torch.randn(500, num_channels)
-    y = norm(x)
-    assert y.shape == x.shape
-
-    def _rms(t):
-        return (t ** 2).mean().sqrt()
-
-    x_rms, y_rms = _rms(x), _rms(y)
-    print("x rms = ", x_rms)
-    print("y rms = ", y_rms)
-    assert y_rms < x_rms
-    assert y_rms > 0.5 * x_rms
-
-
-def _test_double_swish_deriv():
-    """Compare DoubleSwish's hand-written backward against numerical
-    gradients on a small double-precision input."""
-    inputs = torch.randn(10, 12, dtype=torch.double) * 0.5
-    inputs.requires_grad_(True)
-    torch.autograd.gradcheck(DoubleSwish(), inputs)
-
-
-if __name__ == "__main__":
-    # Run every self-test in this file, in definition order.
-    for _self_test in (
-        _test_activation_balancer_sign,
-        _test_activation_balancer_magnitude,
-        _test_basic_norm,
-        _test_double_swish_deriv,
-    ):
-        _self_test()
--- a/egs/gigaspeech/ASR/pruned_transducer_stateless2/scaling.py
+++ b/egs/gigaspeech/ASR/pruned_transducer_stateless2/scaling.py
@ -0,0 +1 @@
+../../../librispeech/ASR/pruned_transducer_stateless2/scaling.py
				`@ -0,0 +1 @@`
				`../../../librispeech/ASR/pruned_transducer_stateless2/beam_search.py`