diff --git a/egs/aishell/ASR/decode.sh b/egs/aishell/ASR/decode.sh
deleted file mode 100644
index 31fe95ecb..000000000
--- a/egs/aishell/ASR/decode.sh
+++ /dev/null
@@ -1,9 +0,0 @@
-
-#export CUDA_VISIBLE_DEVICES="2,3"
-#pip install -r seamlessm4t/requirements.txt
-#pip install k2==1.24.3.dev20230524+cuda11.8.torch2.0.1 -f https://k2-fsa.github.io/k2/cuda.html
-export PYTHONPATH=$PYTHONPATH:/lustre/fsw/sa/yuekaiz/asr/icefall
-export PYTHONPATH=$PYTHONPATH:/lustre/fsw/sa/yuekaiz/asr/seamless_communication/src
-export TORCH_HOME=/lustre/fsw/sa/yuekaiz/asr/hub
-python3 seamlessm4t/decode.py --epoch 5 --exp-dir seamlessm4t/exp
-python3 seamlessm4t/decode.py --epoch 5 --avg 2 --exp-dir seamlessm4t/exp
diff --git a/egs/aishell/ASR/decode_whisper.sh b/egs/aishell/ASR/decode_whisper.sh
deleted file mode 100644
index 852359b69..000000000
--- a/egs/aishell/ASR/decode_whisper.sh
+++ /dev/null
@@ -1,8 +0,0 @@
-
-#export CUDA_VISIBLE_DEVICES="1"
-#pip install -r whisper/requirements.txt
-#pip install k2==1.24.3.dev20230524+cuda11.8.torch2.0.1 -f https://k2-fsa.github.io/k2/cuda.html
-export PYTHONPATH=$PYTHONPATH:/lustre/fsw/sa/yuekaiz/asr/icefall
-#export PYTHONPATH=$PYTHONPATH:/mnt/samsung-t7/yuekai/asr/icefall/
-
-python3 whisper/decode.py --exp-dir whisper/exp --max-duration 100
diff --git a/egs/aishell/ASR/run.sh b/egs/aishell/ASR/run.sh
deleted file mode 100644
index 7ab85dc84..000000000
--- a/egs/aishell/ASR/run.sh
+++ /dev/null
@@ -1,8 +0,0 @@
-
-#export CUDA_VISIBLE_DEVICES="2,3"
-pip install -r seamlessm4t/requirements.txt
-pip install k2==1.24.3.dev20230524+cuda11.8.torch2.0.1 -f https://k2-fsa.github.io/k2/cuda.html
-export PYTHONPATH=$PYTHONPATH:/lustre/fsw/sa/yuekaiz/asr/icefall
-export PYTHONPATH=$PYTHONPATH:/lustre/fsw/sa/yuekaiz/asr/seamless_communication/src
-export TORCH_HOME=/lustre/fsw/sa/yuekaiz/asr/hub
-torchrun --nproc-per-node 8 seamlessm4t/train.py --use-fp16 1 --max-duration 300 --base-lr 1e-5 --exp-dir seamlessm4t/exp_new_vocab --start-epoch 1
diff --git a/egs/aishell/ASR/run_whisper.sh b/egs/aishell/ASR/run_whisper.sh
deleted file mode 100644
index f97e44af2..000000000
--- a/egs/aishell/ASR/run_whisper.sh
+++ /dev/null
@@ -1,9 +0,0 @@
-
-
-pip install k2==1.24.3.dev20230524+cuda11.8.torch2.0.1 -f https://k2-fsa.github.io/k2/cuda.html
-pip install -r whisper/requirements.txt
-export PYTHONPATH=$PYTHONPATH:/workspace/icefall
-#export PYTHONPATH=$PYTHONPATH:/lustre/fsw/sa/yuekaiz/asr/icefall
-#export PYTHONPATH=$PYTHONPATH:/mnt/samsung-t7/yuekai/asr/icefall
-
-torchrun --nproc-per-node 8 whisper/train.py --use-fp16 1 --max-duration 20 --base-lr 1e-5 --exp-dir whisper/exp_medimum --start-epoch 1
diff --git a/egs/aishell/ASR/seamlessm4t/asr_datamodule.py b/egs/aishell/ASR/seamlessm4t/asr_datamodule.py
deleted file mode 120000
index fa1b8cca3..000000000
--- a/egs/aishell/ASR/seamlessm4t/asr_datamodule.py
+++ /dev/null
@@ -1 +0,0 @@
-../tdnn_lstm_ctc/asr_datamodule.py
\ No newline at end of file
diff --git a/egs/aishell/ASR/seamlessm4t/decode.py b/egs/aishell/ASR/seamlessm4t/decode.py
deleted file mode 100755
index 43e5b9b7b..000000000
--- a/egs/aishell/ASR/seamlessm4t/decode.py
+++ /dev/null
@@ -1,415 +0,0 @@
-#!/usr/bin/env python3
-# Copyright 2021 Xiaomi Corporation (Author: Liyong Guo,
-# Fangjun Kuang,
-# Wei Kang)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import argparse
-import logging
-from collections import defaultdict
-from pathlib import Path
-from typing import Dict, List, Optional, Tuple
-
-import k2
-import torch
-import torch.nn as nn
-from asr_datamodule import AishellAsrDataModule
-#from conformer import Conformer
-
-from icefall.char_graph_compiler import CharCtcTrainingGraphCompiler
-from icefall.checkpoint import average_checkpoints, load_checkpoint, average_checkpoints_with_averaged_model
-from icefall.decode import (
- get_lattice,
- nbest_decoding,
- nbest_oracle,
- one_best_decoding,
- rescore_with_attention_decoder,
-)
-from icefall.env import get_env_info
-from icefall.lexicon import Lexicon
-from icefall.utils import (
- AttributeDict,
- get_texts,
- setup_logger,
- store_transcripts,
- write_error_stats,
-)
-
-from seamless_communication.models.unity import (
- UnitYModel,
- load_unity_model,
- load_unity_text_tokenizer,
-)
-from fairseq2.generation import (
- SequenceGeneratorOptions,
- SequenceToTextGenerator,
-)
-from seamless_communication.models.unity.model import UnitYX2TModel
-
-def get_parser():
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter
- )
-
- parser.add_argument(
- "--epoch",
- type=int,
- default=-1,
- help="It specifies the checkpoint to use for decoding."
- "Note: Epoch counts from 0.",
- )
- parser.add_argument(
- "--avg",
- type=int,
- default=1,
- help="Number of checkpoints to average. Automatically select "
- "consecutive checkpoints before the checkpoint specified by "
- "'--epoch'. ",
- )
-
- parser.add_argument(
- "--method",
- type=str,
- default="beam-search",
- help="""Decoding method.
- Supported values are:
-                     - (0) ctc-decoding. Use CTC decoding. It maps the token IDs to
-                       tokens using the token symbol table directly.
- - (1) 1best. Extract the best path from the decoding lattice as the
- decoding result.
- - (2) nbest. Extract n paths from the decoding lattice; the path
- with the highest score is the decoding result.
- - (3) attention-decoder. Extract n paths from the lattice,
- the path with the highest score is the decoding result.
- - (4) nbest-oracle. Its WER is the lower bound of any n-best
- rescoring method can achieve. Useful for debugging n-best
- rescoring method.
- """,
- )
-
- parser.add_argument(
- "--exp-dir",
- type=str,
- default="seamlessm4t/exp",
- help="The experiment dir",
- )
-
- return parser
-
-
-def get_params() -> AttributeDict:
- params = AttributeDict(
- {
- # parameters for conformer
- "subsampling_factor": 4,
- "feature_dim": 80,
- "nhead": 4,
- "attention_dim": 512,
- "num_encoder_layers": 12,
- "num_decoder_layers": 6,
- "vgg_frontend": False,
- "use_feat_batchnorm": True,
- # parameters for decoder
- "search_beam": 20,
- "output_beam": 7,
- "min_active_states": 30,
- "max_active_states": 10000,
- "use_double_scores": True,
- "env_info": get_env_info(),
- }
- )
- return params
-
-
-def decode_one_batch(
- params: AttributeDict,
- s2t_generator: SequenceToTextGenerator,
- batch: dict,
-) -> Dict[str, List[List[int]]]:
- """Decode one batch and return the result in a dict. The dict has the
- following format:
-
- - key: It indicates the setting used for decoding. For example,
- if decoding method is 1best, the key is the string `no_rescore`.
- If attention rescoring is used, the key is the string
- `ngram_lm_scale_xxx_attention_scale_xxx`, where `xxx` is the
- value of `lm_scale` and `attention_scale`. An example key is
- `ngram_lm_scale_0.7_attention_scale_0.5`
-      - value: It contains the decoding result. `len(value)` equals the
- batch size. `value[i]` is the decoding result for the i-th
- utterance in the given batch.
- Args:
- params:
- It's the return value of :func:`get_params`.
-
- - params.method is "1best", it uses 1best decoding without LM rescoring.
- - params.method is "nbest", it uses nbest decoding without LM rescoring.
- - params.method is "attention-decoder", it uses attention rescoring.
-
-      s2t_generator:
-        The SequenceToTextGenerator used to run beam-search decoding with the
-        speech-to-text model.
-      batch:
-        It is the return value from iterating
-        `lhotse.dataset.K2SpeechRecognitionDataset`. See its documentation
-        for the format of the `batch`.
- Returns:
- Return the decoding result. See above description for the format of
- the returned dict.
- """
- dtype = torch.float16
- device = torch.device("cuda", 3)
-
- feature = batch["inputs"]
- assert feature.ndim == 3
- feature = feature.to(device, dtype=dtype)
- # at entry, feature is (N, T, C)
-
- supervisions = batch["supervisions"]
- feature_len = supervisions["num_frames"]
- feature_len = feature_len.to(device, dtype=dtype)
-
- text_output = s2t_generator.generate_ex(feature, feature_len)
- sentences = text_output.sentences
- hyps = [sentence.bytes().decode("utf-8").split() for sentence in sentences]
- key = "beam-search"
-
- return {key: hyps}
-
-
-def decode_dataset(
- dl: torch.utils.data.DataLoader,
- params: AttributeDict,
- s2t_generator: SequenceToTextGenerator,
-) -> Dict[str, List[Tuple[str, List[str], List[str]]]]:
- """Decode dataset.
-
- Args:
- dl:
- PyTorch's dataloader containing the dataset to decode.
- params:
- It is returned by :func:`get_params`.
-      s2t_generator:
-        The SequenceToTextGenerator used to run beam-search decoding with the
-        speech-to-text model.
- Returns:
- Return a dict, whose key may be "no-rescore" if the decoding method is
- 1best or it may be "ngram_lm_scale_0.7_attention_scale_0.5" if attention
-      rescoring is used. Its value is a list of tuples. Each tuple contains three
-      elements: the cut ID, the reference transcript, and the predicted result.
- """
- results = []
-
- num_cuts = 0
-
- try:
- num_batches = len(dl)
- except TypeError:
- num_batches = "?"
-
- results = defaultdict(list)
- for batch_idx, batch in enumerate(dl):
- texts = batch["supervisions"]["text"]
- cut_ids = [cut.id for cut in batch["supervisions"]["cut"]]
-
- hyps_dict = decode_one_batch(
- params=params,
- s2t_generator=s2t_generator,
- batch=batch,
- )
-
- for lm_scale, hyps in hyps_dict.items():
- this_batch = []
- assert len(hyps) == len(texts)
- for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts):
- ref_words = ref_text.split()
- this_batch.append((cut_id, ref_words, hyp_words))
-
- results[lm_scale].extend(this_batch)
-
- num_cuts += len(batch["supervisions"]["text"])
-
- if batch_idx % 100 == 0:
- batch_str = f"{batch_idx}/{num_batches}"
-
- logging.info(f"batch {batch_str}, cuts processed until now is {num_cuts}")
- return results
-
-
-def save_results(
- params: AttributeDict,
- test_set_name: str,
- results_dict: Dict[str, List[Tuple[str, List[str], List[str]]]],
-):
-
- enable_log = True
- test_set_wers = dict()
- for key, results in results_dict.items():
- recog_path = params.exp_dir / f"recogs-{test_set_name}-{key}-{params.suffix}.txt"
- results = sorted(results)
- store_transcripts(filename=recog_path, texts=results)
- if enable_log:
- logging.info(f"The transcripts are stored in {recog_path}")
-
- # The following prints out WERs, per-word error statistics and aligned
- # ref/hyp pairs.
- errs_filename = params.exp_dir / f"errs-{test_set_name}-{key}-{params.suffix}.txt"
- # we compute CER for aishell dataset.
- results_char = []
- for res in results:
- results_char.append((res[0], list("".join(res[1])), list("".join(res[2]))))
- with open(errs_filename, "w") as f:
- wer = write_error_stats(
- f, f"{test_set_name}-{key}", results_char, enable_log=enable_log
- )
- test_set_wers[key] = wer
-
- if enable_log:
- logging.info("Wrote detailed error stats to {}".format(errs_filename))
-
- test_set_wers = sorted(test_set_wers.items(), key=lambda x: x[1])
- errs_info = params.exp_dir / f"cer-summary-{test_set_name}-{params.suffix}.txt"
- with open(errs_info, "w") as f:
- print("settings\tCER", file=f)
- for key, val in test_set_wers:
- print("{}\t{}".format(key, val), file=f)
-
- s = "\nFor {}, CER of different settings are:\n".format(test_set_name)
- note = "\tbest for {}".format(test_set_name)
- for key, val in test_set_wers:
- s += "{}\t{}{}\n".format(key, val, note)
- note = ""
- logging.info(s)
-
-
-@torch.no_grad()
-def main():
- parser = get_parser()
- AishellAsrDataModule.add_arguments(parser)
- args = parser.parse_args()
- args.exp_dir = Path(args.exp_dir)
-
- params = get_params()
- params.update(vars(args))
- params.suffix = f"epoch-{params.epoch}-avg-{params.avg}"
- setup_logger(f"{params.exp_dir}/log-{params.method}/log-decode-{params.suffix}")
- logging.info("Decoding started")
- logging.info(params)
-
- device = torch.device("cpu")
- if torch.cuda.is_available():
- device = torch.device("cuda", 3)
-
- logging.info(f"device: {device}")
- dtype = torch.float16
-
- model_name_or_card = "seamlessM4T_medium"
- #model_name_or_card = "seamlessM4T_large"
- model = load_unity_model(model_name_or_card, device=device, dtype=dtype)
- del model.t2u_model
- del model.text_encoder
- del model.text_encoder_frontend
- if params.epoch > 0:
- if params.avg > 1:
- start = params.epoch - params.avg
- assert start >= 1, start
- filename_start = f"{params.exp_dir}/epoch-{start}.pt"
- filename_end = f"{params.exp_dir}/epoch-{params.epoch}.pt"
- logging.info(
- f"Calculating the averaged model over epoch range from "
- f"{start} (excluded) to {params.epoch}"
- )
- model.to(device)
- model.load_state_dict(
- average_checkpoints_with_averaged_model(
- filename_start=filename_start,
- filename_end=filename_end,
- device=device,
- )
- )
- else:
- load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
- model.to(device)
- model.eval()
- num_param = sum([p.numel() for p in model.parameters()])
- logging.info(f"Number of model parameters: {num_param}")
-
- text_tokenizer = load_unity_text_tokenizer(model_name_or_card)
-
- text_max_len_a = 1
- text_max_len_b = 200
- target_lang = "cmn"
-
- text_opts = SequenceGeneratorOptions(
- beam_size=5, soft_max_seq_len=(text_max_len_a, text_max_len_b)
- )
-
- s2t_model = UnitYX2TModel(
- encoder_frontend=model.speech_encoder_frontend,
- encoder=model.speech_encoder,
- decoder_frontend=model.text_decoder_frontend,
- decoder=model.text_decoder,
- final_proj=model.final_proj,
- pad_idx=model.pad_idx,
- )
- s2t_generator = SequenceToTextGenerator(
- s2t_model, text_tokenizer, target_lang, text_opts
- )
- # we need cut ids to display recognition results.
- args.return_cuts = True
- aishell = AishellAsrDataModule(args)
- test_cuts = aishell.test_cuts()
- test_dl = aishell.test_dataloaders(test_cuts)
-
- test_sets = ["test"]
- test_dls = [test_dl]
-
- for test_set, test_dl in zip(test_sets, test_dls):
- results_dict = decode_dataset(
- dl=test_dl,
- params=params,
- s2t_generator=s2t_generator,
- )
-
- save_results(params=params, test_set_name=test_set, results_dict=results_dict)
-
- logging.info("Done!")
-
-
-torch.set_num_threads(1)
-torch.set_num_interop_threads(1)
-
-if __name__ == "__main__":
- main()
diff --git a/egs/aishell/ASR/seamlessm4t/decode2.py b/egs/aishell/ASR/seamlessm4t/decode2.py
deleted file mode 100644
index 4607bae07..000000000
--- a/egs/aishell/ASR/seamlessm4t/decode2.py
+++ /dev/null
@@ -1,432 +0,0 @@
-#!/usr/bin/env python3
-# Copyright 2021 Xiaomi Corporation (Author: Liyong Guo,
-# Fangjun Kuang,
-# Wei Kang)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import argparse
-import logging
-from collections import defaultdict
-from pathlib import Path
-from typing import Dict, List, Optional, Tuple
-
-import k2
-import torch
-import torch.nn as nn
-from asr_datamodule import AishellAsrDataModule
-#from conformer import Conformer
-from tokenizer import CharTokenizer
-from icefall.char_graph_compiler import CharCtcTrainingGraphCompiler
-from icefall.checkpoint import average_checkpoints, load_checkpoint, average_checkpoints_with_averaged_model
-from icefall.decode import (
- get_lattice,
- nbest_decoding,
- nbest_oracle,
- one_best_decoding,
- rescore_with_attention_decoder,
-)
-from icefall.env import get_env_info
-from icefall.lexicon import Lexicon
-from icefall.utils import (
- AttributeDict,
- get_texts,
- setup_logger,
- store_transcripts,
- write_error_stats,
-)
-
-from seamless_communication.models.unity import (
- UnitYModel,
- load_unity_model,
- load_unity_text_tokenizer,
-)
-from fairseq2.generation import (
- SequenceGeneratorOptions,
- SequenceToTextGenerator,
-)
-from seamless_communication.models.unity.model import UnitYX2TModel
-from fairseq2.nn.embedding import Embedding
-def get_parser():
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter
- )
-
- parser.add_argument(
- "--epoch",
- type=int,
- default=-1,
- help="It specifies the checkpoint to use for decoding."
- "Note: Epoch counts from 0.",
- )
- parser.add_argument(
- "--avg",
- type=int,
- default=1,
- help="Number of checkpoints to average. Automatically select "
- "consecutive checkpoints before the checkpoint specified by "
- "'--epoch'. ",
- )
-
- parser.add_argument(
- "--method",
- type=str,
- default="beam-search",
- help="""Decoding method.
- Supported values are:
-                     - (0) ctc-decoding. Use CTC decoding. It maps the token IDs to
-                       tokens using the token symbol table directly.
- - (1) 1best. Extract the best path from the decoding lattice as the
- decoding result.
- - (2) nbest. Extract n paths from the decoding lattice; the path
- with the highest score is the decoding result.
- - (3) attention-decoder. Extract n paths from the lattice,
- the path with the highest score is the decoding result.
- - (4) nbest-oracle. Its WER is the lower bound of any n-best
- rescoring method can achieve. Useful for debugging n-best
- rescoring method.
- """,
- )
-
- parser.add_argument(
- "--exp-dir",
- type=str,
- default="seamlessm4t/exp",
- help="The experiment dir",
- )
-
- return parser
-
-
-def get_params() -> AttributeDict:
- params = AttributeDict(
- {
- # parameters for conformer
- "subsampling_factor": 4,
- "feature_dim": 80,
- "nhead": 4,
- "attention_dim": 512,
- "num_encoder_layers": 12,
- "num_decoder_layers": 6,
- "vgg_frontend": False,
- "use_feat_batchnorm": True,
- # parameters for decoder
- "search_beam": 20,
- "output_beam": 7,
- "min_active_states": 30,
- "max_active_states": 10000,
- "use_double_scores": True,
- "env_info": get_env_info(),
- }
- )
- return params
-
-
-def decode_one_batch(
- params: AttributeDict,
- s2t_generator: SequenceToTextGenerator,
- batch: dict,
-) -> Dict[str, List[List[int]]]:
- """Decode one batch and return the result in a dict. The dict has the
- following format:
-
- - key: It indicates the setting used for decoding. For example,
- if decoding method is 1best, the key is the string `no_rescore`.
- If attention rescoring is used, the key is the string
- `ngram_lm_scale_xxx_attention_scale_xxx`, where `xxx` is the
- value of `lm_scale` and `attention_scale`. An example key is
- `ngram_lm_scale_0.7_attention_scale_0.5`
-      - value: It contains the decoding result. `len(value)` equals the
- batch size. `value[i]` is the decoding result for the i-th
- utterance in the given batch.
- Args:
- params:
- It's the return value of :func:`get_params`.
-
- - params.method is "1best", it uses 1best decoding without LM rescoring.
- - params.method is "nbest", it uses nbest decoding without LM rescoring.
- - params.method is "attention-decoder", it uses attention rescoring.
-
-      s2t_generator:
-        The SequenceToTextGenerator used to run beam-search decoding with the
-        speech-to-text model.
-      batch:
-        It is the return value from iterating
-        `lhotse.dataset.K2SpeechRecognitionDataset`. See its documentation
-        for the format of the `batch`.
- Returns:
- Return the decoding result. See above description for the format of
- the returned dict.
- """
- dtype = torch.float16
- device = torch.device("cuda", 3)
-
- feature = batch["inputs"]
- assert feature.ndim == 3
- feature = feature.to(device, dtype=dtype)
- # at entry, feature is (N, T, C)
-
- supervisions = batch["supervisions"]
- feature_len = supervisions["num_frames"]
- feature_len = feature_len.to(device, dtype=dtype)
-
- text_output = s2t_generator.generate_ex(feature, feature_len)
- #sentences = text_output.sentences
- #hyps = [sentence.bytes().decode("utf-8").split() for sentence in sentences]
-
- token_ids = text_output.generator_output.results
- hyps_ids = [sentence[0].seq.cpu().tolist() for sentence in token_ids]
- hyps = [params.tokenizer.decode(hyps_id).split() for hyps_id in hyps_ids]
-
- key = "beam-search"
-
- return {key: hyps}
-
-
-def decode_dataset(
- dl: torch.utils.data.DataLoader,
- params: AttributeDict,
- s2t_generator: SequenceToTextGenerator,
-) -> Dict[str, List[Tuple[str, List[str], List[str]]]]:
- """Decode dataset.
-
- Args:
- dl:
- PyTorch's dataloader containing the dataset to decode.
- params:
- It is returned by :func:`get_params`.
-      s2t_generator:
-        The SequenceToTextGenerator used to run beam-search decoding with the
-        speech-to-text model.
- Returns:
- Return a dict, whose key may be "no-rescore" if the decoding method is
- 1best or it may be "ngram_lm_scale_0.7_attention_scale_0.5" if attention
-      rescoring is used. Its value is a list of tuples. Each tuple contains three
-      elements: the cut ID, the reference transcript, and the predicted result.
- """
- results = []
-
- num_cuts = 0
-
- try:
- num_batches = len(dl)
- except TypeError:
- num_batches = "?"
-
- results = defaultdict(list)
- for batch_idx, batch in enumerate(dl):
- texts = batch["supervisions"]["text"]
- cut_ids = [cut.id for cut in batch["supervisions"]["cut"]]
-
- hyps_dict = decode_one_batch(
- params=params,
- s2t_generator=s2t_generator,
- batch=batch,
- )
-
- for lm_scale, hyps in hyps_dict.items():
- this_batch = []
- assert len(hyps) == len(texts)
- for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts):
- ref_words = ref_text.split()
- this_batch.append((cut_id, ref_words, hyp_words))
-
- results[lm_scale].extend(this_batch)
-
- num_cuts += len(batch["supervisions"]["text"])
-
- if batch_idx % 100 == 0:
- batch_str = f"{batch_idx}/{num_batches}"
-
- logging.info(f"batch {batch_str}, cuts processed until now is {num_cuts}")
- return results
-
-
-def save_results(
- params: AttributeDict,
- test_set_name: str,
- results_dict: Dict[str, List[Tuple[str, List[str], List[str]]]],
-):
-
- enable_log = True
- test_set_wers = dict()
- for key, results in results_dict.items():
- recog_path = params.exp_dir / f"recogs-{test_set_name}-{key}-{params.suffix}.txt"
- results = sorted(results)
- store_transcripts(filename=recog_path, texts=results)
- if enable_log:
- logging.info(f"The transcripts are stored in {recog_path}")
-
- # The following prints out WERs, per-word error statistics and aligned
- # ref/hyp pairs.
- errs_filename = params.exp_dir / f"errs-{test_set_name}-{key}-{params.suffix}.txt"
- # we compute CER for aishell dataset.
- results_char = []
- for res in results:
- results_char.append((res[0], list("".join(res[1])), list("".join(res[2]))))
- with open(errs_filename, "w") as f:
- wer = write_error_stats(
- f, f"{test_set_name}-{key}", results_char, enable_log=enable_log
- )
- test_set_wers[key] = wer
-
- if enable_log:
- logging.info("Wrote detailed error stats to {}".format(errs_filename))
-
- test_set_wers = sorted(test_set_wers.items(), key=lambda x: x[1])
- errs_info = params.exp_dir / f"cer-summary-{test_set_name}-{params.suffix}.txt"
- with open(errs_info, "w") as f:
- print("settings\tCER", file=f)
- for key, val in test_set_wers:
- print("{}\t{}".format(key, val), file=f)
-
- s = "\nFor {}, CER of different settings are:\n".format(test_set_name)
- note = "\tbest for {}".format(test_set_name)
- for key, val in test_set_wers:
- s += "{}\t{}{}\n".format(key, val, note)
- note = ""
- logging.info(s)
-
-
-@torch.no_grad()
-def main():
- parser = get_parser()
- AishellAsrDataModule.add_arguments(parser)
- args = parser.parse_args()
- args.exp_dir = Path(args.exp_dir)
-
- params = get_params()
- params.tokenizer = CharTokenizer('./seamlessm4t/tokens.txt')
- params.update(vars(args))
- params.suffix = f"epoch-{params.epoch}-avg-{params.avg}"
- setup_logger(f"{params.exp_dir}/log-{params.method}/log-decode-{params.suffix}")
- logging.info("Decoding started")
- logging.info(params)
-
- device = torch.device("cpu")
- if torch.cuda.is_available():
- device = torch.device("cuda", 3)
-
- logging.info(f"device: {device}")
- dtype = torch.float16
-
- model_name_or_card = "seamlessM4T_medium"
- #model_name_or_card = "seamlessM4T_large"
- model = load_unity_model(model_name_or_card, device=device, dtype=dtype)
- del model.t2u_model
- del model.text_encoder
- del model.text_encoder_frontend
- model.text_decoder_frontend.embed = nn.Embedding(num_embeddings=params.tokenizer.vocab_size, embedding_dim=1024 ,padding_idx=0)
- #model.text_decoder_frontend.embed = Embedding(num_embeddings=params.tokenizer.vocab_size, embedding_dim=1024 ,pad_idx=0, scaled=True)
- model.final_proj = nn.Linear(1024, params.tokenizer.vocab_size, bias=False)
- #model.final_proj = nn.Linear(1024, params.tokenizer.vocab_size)
- if params.epoch > 0:
- if params.avg > 1:
- start = params.epoch - params.avg
- assert start >= 1, start
- filename_start = f"{params.exp_dir}/epoch-{start}.pt"
- filename_end = f"{params.exp_dir}/epoch-{params.epoch}.pt"
- logging.info(
- f"Calculating the averaged model over epoch range from "
- f"{start} (excluded) to {params.epoch}"
- )
- model.to(device)
- model.load_state_dict(
- average_checkpoints_with_averaged_model(
- filename_start=filename_start,
- filename_end=filename_end,
- device=device,
- )
- )
- else:
- load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
- model.to(device)
- model.eval()
- model.half()
- #for param in model.parameters():
- # if param.dtype == torch.float16:
- # pass
- # else:
- # param.data = param.data.to(torch.float16)
- #print(param)
- num_param = sum([p.numel() for p in model.parameters()])
- logging.info(f"Number of model parameters: {num_param}")
-
- text_tokenizer = load_unity_text_tokenizer(model_name_or_card)
-
- text_max_len_a = 1
- text_max_len_b = 200
- target_lang = "cmn"
-
- text_opts = SequenceGeneratorOptions(
- beam_size=5, soft_max_seq_len=(text_max_len_a, text_max_len_b)
- )
-
- s2t_model = UnitYX2TModel(
- encoder_frontend=model.speech_encoder_frontend,
- encoder=model.speech_encoder,
- decoder_frontend=model.text_decoder_frontend,
- decoder=model.text_decoder,
- final_proj=model.final_proj,
- pad_idx=model.pad_idx,
- )
- s2t_generator = SequenceToTextGenerator(
- s2t_model, text_tokenizer, target_lang, text_opts
- )
- # we need cut ids to display recognition results.
- args.return_cuts = True
- aishell = AishellAsrDataModule(args)
- test_cuts = aishell.test_cuts()
- test_dl = aishell.test_dataloaders(test_cuts)
-
- test_sets = ["test"]
- test_dls = [test_dl]
-
- for test_set, test_dl in zip(test_sets, test_dls):
- results_dict = decode_dataset(
- dl=test_dl,
- params=params,
- s2t_generator=s2t_generator,
- )
-
- save_results(params=params, test_set_name=test_set, results_dict=results_dict)
-
- logging.info("Done!")
-
-
-torch.set_num_threads(1)
-torch.set_num_interop_threads(1)
-
-if __name__ == "__main__":
- main()
diff --git a/egs/aishell/ASR/seamlessm4t/label_smoothing.py b/egs/aishell/ASR/seamlessm4t/label_smoothing.py
deleted file mode 120000
index e9d239fff..000000000
--- a/egs/aishell/ASR/seamlessm4t/label_smoothing.py
+++ /dev/null
@@ -1 +0,0 @@
-../../../librispeech/ASR/conformer_ctc/label_smoothing.py
\ No newline at end of file
diff --git a/egs/aishell/ASR/seamlessm4t/model.py b/egs/aishell/ASR/seamlessm4t/model.py
deleted file mode 100644
index efe18d5ff..000000000
--- a/egs/aishell/ASR/seamlessm4t/model.py
+++ /dev/null
@@ -1,133 +0,0 @@
-import torch
-import torch.nn as nn
-from fairseq2.nn.embedding import Embedding
-from seamless_communication.models.inference import Translator
-from seamless_communication.models.unity import (
- UnitTokenizer,
- UnitYModel,
- load_unity_model,
- load_unity_text_tokenizer,
- load_unity_unit_tokenizer,
-)
-from fairseq2.generation import (
- Seq2SeqGenerator,
- SequenceGeneratorOptions,
- SequenceGeneratorOutput,
- SequenceToTextGenerator,
- SequenceToTextOutput,
-)
-from seamless_communication.models.unity.model import UnitYModel, UnitYX2TModel
-
-import torchaudio
-import torchaudio.compliance.kaldi as ta_kaldi
-audio_file="/mnt/samsung-t7/yuekai/asr/Triton-ASR-Client/datasets/mini_en/wav/1089-134686-0001.wav"
-src_lang="cmn"
-
-audio_file="/mnt/samsung-t7/yuekai/asr/Triton-ASR-Client/datasets/mini_zh/wav/long.wav"
-src_lang="eng"
-target_lang = "cmn"
-
-audio_input = torchaudio.load(audio_file)[0]
-feature = ta_kaldi.fbank(audio_input, num_mel_bins=80)
-# feature shape is (T, F), convert it to (B, T, F), source_seq_lens tracks T
-source_seqs = feature.unsqueeze(0)
-source_seq_lens = torch.tensor([feature.shape[0]])
-
-# Initialize a Translator object with a multitask model, vocoder on the GPU.
-
-
-# translator = Translator("seamlessM4T_medium", vocoder_name_or_card="vocoder_36langs", device=torch.device("cuda:2"), dtype=torch.float16)
-
-# transcribed_text, _, _ = translator.predict(audio_file, "asr", src_lang)
-
-# print(transcribed_text)
-
-
-model_name_or_card = "seamlessM4T_medium"
-device = torch.device("cuda:3")
-
-# cast source_seq_lens, source_seqs to device, dtype to torch.float16
-source_seq_lens = source_seq_lens.to(device=device, dtype=torch.float16)
-source_seqs = source_seqs.to(device=device, dtype=torch.float16)
-
-
-
-dtype = torch.float16
-model = load_unity_model(model_name_or_card, device=device, dtype=dtype)
-model.eval()
-model.text_decoder_frontend.embed = Embedding(num_embeddings=6257, embedding_dim=1024 ,pad_idx=0, scaled=True)
-model.final_proj = nn.Linear(1024, 6257)
-model.half()
-print(model.text_decoder_frontend.embed, model.text_encoder_frontend.embed.weight.dtype, type(model.text_encoder_frontend.embed), type(model.text_encoder_frontend.embed.weight))
-print(model.final_proj, model.final_proj.weight.dtype, type(model.final_proj), type(model.final_proj.weight))
-#input()
-exit(0)
-text_tokenizer = load_unity_text_tokenizer(model_name_or_card)
-#print(text_tokenizer.model.eos_idx, text_tokenizer.model.pad_idx)
-#text_tokenizer_encoder = text_tokenizer.create_encoder(lang=target_lang, mode="target")
-#text_tokenizer_decoder = text_tokenizer.create_decoder()
-# print attributes of text_tokenizer_encoder
-#print(text_tokenizer.vocab_info)
-#print(text_tokenizer_encoder("其中广州深圳甚至出现了多个日光盘"))
-#print(text_tokenizer_decoder(torch.tensor([3,256200,137139,252603,250476,250590,1,84778,148897,249568,249352,249947,249050,250520,254508])))
-
-# store all vocab in a file
-# with open("vocab.txt", "w") as f:
-# for i in range(256206):
-# f.write(f"{i}: " + text_tokenizer_decoder(torch.tensor([i]))[0].bytes().decode("utf-8")+ "\n")
-# f.close()
-# exit(0)
-
-
-
-# def decode(
-# self,
-# seqs: Tensor,
-# seq_lens: Optional[Tensor],
-# encoder_output: Tensor,
-# encoder_padding_mask: Optional[Tensor],
-# state_bag: Optional[IncrementalStateBag] = None,
-# ) -> Tuple[Tensor, Optional[Tensor]]:
-# seqs, padding_mask = self.text_decoder_frontend(seqs, seq_lens, state_bag)
-
-# return self.text_decoder( # type: ignore[no-any-return]
-# seqs, padding_mask, encoder_output, encoder_padding_mask, state_bag
-# )
-
-# def decoding(model, feature):
-# seqs, padding_mask = model.speech_encoder_frontend(seqs, seq_lens)
-# speech_encoder(seqs, padding_mask)
-
-# decoder_output, decoder_padding_mask = self.decode(
-# batch.target_seqs,
-# batch.target_seq_lens,
-# encoder_output,
-# encoder_padding_mask,
-# )
-
-# text_logits = model.final_project(decoder_output, decoder_padding_mask)
-
-text_max_len_a = 1
-text_max_len_b = 200
-
-text_opts = SequenceGeneratorOptions(
- beam_size=5, soft_max_seq_len=(text_max_len_a, text_max_len_b)
-)
-
-s2t_model = UnitYX2TModel(
- encoder_frontend=model.speech_encoder_frontend,
- encoder=model.speech_encoder,
- decoder_frontend=model.text_decoder_frontend,
- decoder=model.text_decoder,
- final_proj=model.final_proj,
- pad_idx=model.pad_idx,
-)
-s2t_generator = SequenceToTextGenerator(
- s2t_model, text_tokenizer, target_lang, text_opts
-)
-
-text_output = s2t_generator.generate_ex(source_seqs, source_seq_lens)
-print(text_output.generator_output.results[0][0].seq.cpu().tolist())
-# sentence = text_output.sentences[0]
-# print(sentence, type(sentence))
-# sentence = sentence.bytes().decode("utf-8")
diff --git a/egs/aishell/ASR/seamlessm4t/optim.py b/egs/aishell/ASR/seamlessm4t/optim.py
deleted file mode 100644
index abfb2092c..000000000
--- a/egs/aishell/ASR/seamlessm4t/optim.py
+++ /dev/null
@@ -1,1173 +0,0 @@
-# Copyright 2022 Xiaomi Corp. (authors: Daniel Povey)
-#
-# See ../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import contextlib
-import logging
-import random
-from collections import defaultdict
-from typing import Dict, List, Optional, Tuple, Union
-
-import torch
-from lhotse.utils import fix_random_seed
-from torch import Tensor
-from torch.optim import Optimizer
-
-
-class BatchedOptimizer(Optimizer):
- """
- This class adds to class Optimizer the capability to optimize parameters in batches:
- it will stack the parameters and their grads for you so the optimizer can work
- on tensors with an extra leading dimension. This is intended for speed with GPUs,
- as it reduces the number of kernels launched in the optimizer.
-
- Args:
-      params:
-        The parameters or parameter groups to optimize, as in torch.optim.Optimizer.
-      defaults:
-        A dict of default hyper-parameter values, as in torch.optim.Optimizer.
-    """
-
- def __init__(self, params, defaults):
- super(BatchedOptimizer, self).__init__(params, defaults)
-
- @contextlib.contextmanager
- def batched_params(self, param_group, group_params_names):
- """
-        This function returns (technically, yields) a list of
-        tuples (p, state, p_names), where
- p is a `fake` parameter that is stacked (over axis 0) from real parameters
- that share the same shape, and its gradient is also stacked;
- `state` is the state corresponding to this batch of parameters
- (it will be physically located in the "state" for one of the real
- parameters, the last one that has any particular shape and dtype).
-
- This function is decorated as a context manager so that it can
- write parameters back to their "real" locations.
-
- The idea is, instead of doing:
-
- for p in group["params"]:
- state = self.state[p]
- ...
-
- you can do:
-
-          with self.batched_params(group["params"], group_params_names) as batches:
- for p, state, p_names in batches:
- ...
-
-
- Args:
-          param_group: a parameter group, which is a list of parameters; should be
-                the `params` list of one of self.param_groups.
- group_params_names: name for each parameter in group,
- which is List[str].
- """
- batches = defaultdict(
- list
- ) # `batches` maps from tuple (dtype_as_str,*shape) to list of nn.Parameter
- batches_names = defaultdict(
- list
-        )  # `batches_names` maps from tuple (dtype_as_str,*shape) to list of str
-
- assert len(param_group) == len(group_params_names)
- for p, named_p in zip(param_group, group_params_names):
- key = (str(p.dtype), *p.shape)
- batches[key].append(p)
- batches_names[key].append(named_p)
-
- batches_names_keys = list(batches_names.keys())
- sorted_idx = sorted(
- range(len(batches_names)), key=lambda i: batches_names_keys[i]
- )
- batches_names = [batches_names[batches_names_keys[idx]] for idx in sorted_idx]
- batches = [batches[batches_names_keys[idx]] for idx in sorted_idx]
-
- stacked_params_dict = dict()
-
- # turn batches into a list, in deterministic order.
- # tuples will contain tuples of (stacked_param, state, stacked_params_names),
- # one for each batch in `batches`.
- tuples = []
-
- for batch, batch_names in zip(batches, batches_names):
- p = batch[0]
- # we arbitrarily store the state in the
- # state corresponding to the 1st parameter in the
- # group. class Optimizer will take care of saving/loading state.
- state = self.state[p]
- p_stacked = torch.stack(batch)
- grad = torch.stack(
- [torch.zeros_like(p) if p.grad is None else p.grad for p in batch]
- )
- p_stacked.grad = grad
-            stacked_params_dict[(str(p.dtype), *p.shape)] = p_stacked  # key of this dtype/shape batch
- tuples.append((p_stacked, state, batch_names))
-
- yield tuples # <-- calling code will do the actual optimization here!
-
- for ((stacked_params, _state, _names), batch) in zip(tuples, batches):
- for i, p in enumerate(batch): # batch is list of Parameter
- p.copy_(stacked_params[i])
-
-
-class ScaledAdam(BatchedOptimizer):
- """
- Implements 'Scaled Adam', a variant of Adam where we scale each parameter's update
- proportional to the norm of that parameter; and also learn the scale of the parameter,
- in log space, subject to upper and lower limits (as if we had factored each parameter as
- param = underlying_param * log_scale.exp())
-
-
- Args:
- params: The parameters or param_groups to optimize (like other Optimizer subclasses)
-        Unlike common optimizers, which accept model.parameters() or groups of parameters,
-        this optimizer can also accept model.named_parameters() or groups of named parameters.
- See comments of function _get_names_of_parameters for its 4 possible cases.
- lr: The learning rate. We will typically use a learning rate schedule that starts
- at 0.03 and decreases over time, i.e. much higher than other common
- optimizers.
- clipping_scale: (e.g. 2.0)
- A scale for gradient-clipping: if specified, the normalized gradients
- over the whole model will be clipped to have 2-norm equal to
- `clipping_scale` times the median 2-norm over the most recent period
- of `clipping_update_period` minibatches. By "normalized gradients",
- we mean after multiplying by the rms parameter value for this tensor
- [for non-scalars]; this is appropriate because our update is scaled
- by this quantity.
- betas: beta1,beta2 are momentum constants for regular momentum, and moving sum-sq grad.
-           Must satisfy 0 < beta1 <= beta2 < 1.
-    scalar_lr_scale: A scaling factor on the learning rate, that we use to update the
-           scale of each parameter tensor and scalar parameters of the model.
-           If each parameter were decomposed
-           as p * p_scale.exp(), where (p**2).mean().sqrt() == 1.0, scalar_lr_scale
-           would be the scaling factor on the learning rate of p_scale.
- eps: A general-purpose epsilon to prevent division by zero
- param_min_rms: Minimum root-mean-square value of parameter tensor, for purposes of
- learning the scale on the parameters (we'll constrain the rms of each non-scalar
- parameter tensor to be >= this value)
- param_max_rms: Maximum root-mean-square value of parameter tensor, for purposes of
- learning the scale on the parameters (we'll constrain the rms of each non-scalar
- parameter tensor to be <= this value)
- scalar_max: Maximum absolute value for scalar parameters (applicable if your
- model has any parameters with numel() == 1).
- size_update_period: The periodicity, in steps, with which we update the size (scale)
- of the parameter tensor. This is provided to save a little time
- in the update.
-    clipping_update_period: if clipping_scale is specified, this is the period,
-           in minibatches, over which gradient norms are tracked to estimate the
-           clipping threshold.
- """
-
- def __init__(
- self,
- params,
- lr=3e-02,
- clipping_scale=None,
- betas=(0.9, 0.98),
- scalar_lr_scale=0.1,
- eps=1.0e-08,
- param_min_rms=1.0e-05,
- param_max_rms=3.0,
- scalar_max=10.0,
- size_update_period=4,
- clipping_update_period=100,
- ):
-
- defaults = dict(
- lr=lr,
- clipping_scale=clipping_scale,
- betas=betas,
- scalar_lr_scale=scalar_lr_scale,
- eps=eps,
- param_min_rms=param_min_rms,
- param_max_rms=param_max_rms,
- scalar_max=scalar_max,
- size_update_period=size_update_period,
- clipping_update_period=clipping_update_period,
- )
-
-        # If params only contains parameters or groups of parameters,
-        # i.e. when parameter names are not given,
-        # this flag will be set to False in function _get_names_of_parameters.
- self.show_dominant_parameters = True
- param_groups, parameters_names = self._get_names_of_parameters(params)
- super(ScaledAdam, self).__init__(param_groups, defaults)
- assert len(self.param_groups) == len(parameters_names)
- self.parameters_names = parameters_names
-
- def _get_names_of_parameters(
- self, params_or_named_params
- ) -> Tuple[List[Dict], List[List[str]]]:
- """
- Args:
- params_or_named_params: according to the way ScaledAdam is initialized in train.py,
- this argument could be one of following 4 cases,
- case 1, a generator of parameter, e.g.:
- optimizer = ScaledAdam(model.parameters(), lr=params.base_lr, clipping_scale=3.0)
-
- case 2, a list of parameter groups with different config, e.g.:
- model_param_groups = [
- {'params': model.encoder.parameters(), 'lr': 0.05},
- {'params': model.decoder.parameters(), 'lr': 0.01},
- {'params': model.joiner.parameters(), 'lr': 0.03},
- ]
- optimizer = ScaledAdam(model_param_groups, lr=params.base_lr, clipping_scale=3.0)
-
- case 3, a generator of named_parameter, e.g.:
- optimizer = ScaledAdam(model.named_parameters(), lr=params.base_lr, clipping_scale=3.0)
-
- case 4, a list of named_parameter groups with different config, e.g.:
- model_named_param_groups = [
- {'named_params': model.encoder.named_parameters(), 'lr': 0.05},
- {'named_params': model.decoder.named_parameters(), 'lr': 0.01},
- {'named_params': model.joiner.named_parameters(), 'lr': 0.03},
- ]
- optimizer = ScaledAdam(model_named_param_groups, lr=params.base_lr, clipping_scale=3.0)
-
- For case 1 and case 2, input params is used to initialize the underlying torch.optimizer.
- For case 3 and case 4, firstly, names and params are extracted from input named_params,
- then, these extracted params are used to initialize the underlying torch.optimizer,
- and these extracted names are mainly used by function
- `_show_gradient_dominating_parameter`
-
- Returns:
- Returns a tuple containing 2 elements:
- - `param_groups` with type List[Dict], each Dict element is a parameter group.
- An example of `param_groups` could be:
- [
- {'params': `one iterable of Parameter`, 'lr': 0.05},
- {'params': `another iterable of Parameter`, 'lr': 0.08},
- {'params': `a third iterable of Parameter`, 'lr': 0.1},
- ]
-              - `param_groups_names` with type List[List[str]],
-                each `List[str]` is for a group['params'] in param_groups,
-                and each `str` is the name of a parameter.
-                A dummy name "foo" is assigned to each parameter
-                if the input params have no names, i.e. case 1 or case 2.
- """
- # variable naming convention in this function:
- # p is short for param.
- # np is short for named_param.
- # p_or_np is short for param_or_named_param.
- # cur is short for current.
- # group is a dict, e.g. {'params': iterable of parameter, 'lr': 0.05, other fields}.
- # groups is a List[group]
-
- iterable_or_groups = list(params_or_named_params)
- if len(iterable_or_groups) == 0:
- raise ValueError("optimizer got an empty parameter list")
-
- # The first value of returned tuple. A list of dicts containing at
- # least 'params' as a key.
- param_groups = []
-
- # The second value of returned tuple,
- # a List[List[str]], each sub-List is for a group.
- param_groups_names = []
-
- if not isinstance(iterable_or_groups[0], dict):
- # case 1 or case 3,
- # the input is an iterable of parameter or named parameter.
- param_iterable_cur_group = []
- param_names_cur_group = []
- for p_or_np in iterable_or_groups:
- if isinstance(p_or_np, tuple):
- # case 3
- name, param = p_or_np
- else:
- # case 1
- assert isinstance(p_or_np, torch.Tensor)
- param = p_or_np
- # Assign a dummy name as a placeholder
- name = "foo"
- self.show_dominant_parameters = False
- param_iterable_cur_group.append(param)
- param_names_cur_group.append(name)
- param_groups.append({"params": param_iterable_cur_group})
- param_groups_names.append(param_names_cur_group)
- else:
- # case 2 or case 4
- # the input is groups of parameter or named parameter.
- for cur_group in iterable_or_groups:
-                if "named_params" in cur_group:
-                    # case 4: a group of named parameters.
-                    name_list = [x[0] for x in cur_group["named_params"]]
-                    p_list = [x[1] for x in cur_group["named_params"]]
-                    del cur_group["named_params"]
-                    cur_group["params"] = p_list
-                else:
-                    # case 2: a group of plain parameters, so assign dummy names.
-                    cur_group["params"] = list(cur_group["params"])
-                    name_list = ["foo"] * len(cur_group["params"])
-                    self.show_dominant_parameters = False
- param_groups.append(cur_group)
- param_groups_names.append(name_list)
-
- return param_groups, param_groups_names
-
- def __setstate__(self, state):
- super(ScaledAdam, self).__setstate__(state)
-
- @torch.no_grad()
- def step(self, closure=None):
- """Performs a single optimization step.
-
- Arguments:
- closure (callable, optional): A closure that reevaluates the model
- and returns the loss.
- """
- loss = None
- if closure is not None:
- with torch.enable_grad():
- loss = closure()
-
- batch = True
-
- for group, group_params_names in zip(self.param_groups, self.parameters_names):
-
- with self.batched_params(group["params"], group_params_names) as batches:
-
- # batches is list of pairs (stacked_param, state). stacked_param is like
- # a regular parameter, and will have a .grad, but the 1st dim corresponds to
- # a stacking dim, it is not a real dim.
-
- if (
- len(batches[0][1]) == 0
- ): # if len(first state) == 0: not yet initialized
- clipping_scale = 1
- else:
- clipping_scale = self._get_clipping_scale(group, batches)
-
- for p, state, _ in batches:
- # Perform optimization step.
- # grad is not going to be None, we handled that when creating the batches.
- grad = p.grad
- if grad.is_sparse:
- raise RuntimeError(
- "ScaledAdam optimizer does not support sparse gradients"
- )
- # State initialization
- if len(state) == 0:
- self._init_state(group, p, state)
-
- self._step_one_batch(group, p, state, clipping_scale)
-
- return loss
-
- def _init_state(self, group: dict, p: Tensor, state: dict):
- """
- Initializes state dict for parameter 'p'. Assumes that dim 0 of tensor p
- is actually the batch dimension, corresponding to batched-together
- parameters of a given shape.
-
-
- Args:
- group: Dict to look up configuration values.
- p: The parameter that we are initializing the state for
- state: Dict from string to whatever state we are initializing
- """
- size_update_period = group["size_update_period"]
-
- state["step"] = 0
-
- kwargs = {"device": p.device, "dtype": p.dtype}
-
- # 'delta' implements conventional momentum. There are
- # several different kinds of update going on, so rather than
- # compute "exp_avg" like in Adam, we store and decay a
- # parameter-change "delta", which combines all forms of
- # update. this is equivalent to how it's done in Adam,
- # except for the first few steps.
- state["delta"] = torch.zeros_like(p, memory_format=torch.preserve_format)
-
- batch_size = p.shape[0]
- numel = p.numel() // batch_size
-
- if numel > 1:
- # "param_rms" just periodically records the scalar root-mean-square value of
- # the parameter tensor.
- # it has a shape like (batch_size, 1, 1, 1, 1)
- param_rms = (p**2).mean(dim=list(range(1, p.ndim)), keepdim=True).sqrt()
- state["param_rms"] = param_rms
-
- state["scale_exp_avg_sq"] = torch.zeros_like(param_rms)
- state["scale_grads"] = torch.zeros(
- size_update_period, *param_rms.shape, **kwargs
- )
-
- # exp_avg_sq is the weighted sum of scaled gradients. as in Adam.
- state["exp_avg_sq"] = torch.zeros_like(p, memory_format=torch.preserve_format)
-
- def _get_clipping_scale(
- self, group: dict, tuples: List[Tuple[Tensor, dict, List[str]]]
- ) -> float:
- """
- Returns a scalar factor <= 1.0 that dictates gradient clipping, i.e. we will scale the gradients
- by this amount before applying the rest of the update.
-
- Args:
- group: the parameter group, an item in self.param_groups
- tuples: a list of tuples of (param, state, param_names)
- where param is a batched set of parameters,
- with a .grad (1st dim is batch dim)
- and state is the state-dict where optimization parameters are kept.
- param_names is a List[str] while each str is name for a parameter
- in batched set of parameters "param".
- """
- assert len(tuples) >= 1
- clipping_scale = group["clipping_scale"]
- (first_p, first_state, _) = tuples[0]
- step = first_state["step"]
- if clipping_scale is None or step == 0:
- # no clipping. return early on step == 0 because the other
- # parameters' state won't have been initialized yet.
- return 1.0
- clipping_update_period = group["clipping_update_period"]
-
- tot_sumsq = torch.tensor(0.0, device=first_p.device)
- for (p, state, param_names) in tuples:
- grad = p.grad
- if grad.is_sparse:
- raise RuntimeError(
- "ScaledAdam optimizer does not support sparse gradients"
- )
- if p.numel() == p.shape[0]: # a batch of scalars
- tot_sumsq += (grad**2).sum() # sum() to change shape [1] to []
- else:
- tot_sumsq += ((grad * state["param_rms"]) ** 2).sum()
-
- tot_norm = tot_sumsq.sqrt()
- if "model_norms" not in first_state:
- first_state["model_norms"] = torch.zeros(
- clipping_update_period, device=p.device
- )
- first_state["model_norms"][step % clipping_update_period] = tot_norm
-
- if step % clipping_update_period == 0:
- # Print some stats.
- # We don't reach here if step == 0 because we would have returned
- # above.
- sorted_norms = first_state["model_norms"].sort()[0].to("cpu")
- quartiles = []
- for n in range(0, 5):
- index = min(
- clipping_update_period - 1, (clipping_update_period // 4) * n
- )
- quartiles.append(sorted_norms[index].item())
-
- median = quartiles[2]
- threshold = clipping_scale * median
- first_state["model_norm_threshold"] = threshold
- percent_clipped = (
- first_state["num_clipped"] * 100.0 / clipping_update_period
- if "num_clipped" in first_state
- else 0.0
- )
- first_state["num_clipped"] = 0
- quartiles = " ".join(["%.3e" % x for x in quartiles])
- logging.info(
- f"Clipping_scale={clipping_scale}, grad-norm quartiles {quartiles}, "
- f"threshold={threshold:.3e}, percent-clipped={percent_clipped:.1f}"
- )
-
- if step < clipping_update_period:
- return 1.0 # We have not yet estimated a norm to clip to.
- else:
- try:
- model_norm_threshold = first_state["model_norm_threshold"]
- except KeyError:
- logging.info(
- "Warning: model_norm_threshold not in state: possibly "
- "you changed config when restarting, adding clipping_scale option?"
- )
- return 1.0
- ans = min(1.0, (model_norm_threshold / (tot_norm + 1.0e-20)).item())
- if ans < 1.0:
- first_state["num_clipped"] += 1
- if ans < 0.1:
- logging.warn(
- f"Scaling gradients by {ans}, model_norm_threshold={model_norm_threshold}"
- )
- if self.show_dominant_parameters:
- assert p.shape[0] == len(param_names)
- self._show_gradient_dominating_parameter(tuples, tot_sumsq)
- return ans
-
- def _show_gradient_dominating_parameter(
- self, tuples: List[Tuple[Tensor, dict, List[str]]], tot_sumsq: Tensor
- ):
- """
- Show information of parameter which dominates tot_sumsq.
-
- Args:
- tuples: a list of tuples of (param, state, param_names)
- where param is a batched set of parameters,
- with a .grad (1st dim is batch dim)
- and state is the state-dict where optimization parameters are kept.
- param_names is a List[str] while each str is name for a parameter
- in batched set of parameters "param".
-          tot_sumsq: sumsq of all parameters. Though it could be calculated
-                from tuples, we pass it in to save some time.
- """
- all_sumsq_orig = {}
- for (p, state, batch_param_names) in tuples:
- # p is a stacked batch parameters.
- batch_grad = p.grad
- if p.numel() == p.shape[0]: # a batch of scalars
- batch_sumsq_orig = batch_grad**2
- # Dummy values used by following `zip` statement.
- batch_rms_orig = torch.ones(p.shape[0])
- else:
- batch_rms_orig = state["param_rms"]
- batch_sumsq_orig = ((batch_grad * batch_rms_orig) ** 2).sum(
- dim=list(range(1, batch_grad.ndim))
- )
-
- for name, sumsq_orig, rms, grad in zip(
- batch_param_names, batch_sumsq_orig, batch_rms_orig, batch_grad
- ):
-
- proportion_orig = sumsq_orig / tot_sumsq
- all_sumsq_orig[name] = (proportion_orig, sumsq_orig, rms, grad)
-
- assert torch.isclose(
- sum([value[0] for value in all_sumsq_orig.values()]).cpu(),
- torch.tensor(1.0),
- )
- sorted_by_proportion = {
- k: v
- for k, v in sorted(
- all_sumsq_orig.items(), key=lambda item: item[1][0], reverse=True
- )
- }
- dominant_param_name = next(iter(sorted_by_proportion))
- (
- dominant_proportion,
- dominant_sumsq,
- dominant_rms,
- dominant_grad,
- ) = sorted_by_proportion[dominant_param_name]
- logging.info(
- f"Parameter dominating tot_sumsq {dominant_param_name}"
- f" with proportion {dominant_proportion:.2f},"
- f" where dominant_sumsq=(grad_sumsq*orig_rms_sq)"
- f"={dominant_sumsq:.3e},"
- f" grad_sumsq={(dominant_grad**2).sum():.3e},"
- f" orig_rms_sq={(dominant_rms**2).item():.3e}"
- )
-
- def _step_one_batch(
- self, group: dict, p: Tensor, state: dict, clipping_scale: float
- ):
- """
- Do the step for one parameter, which is actually going to be a batch of
- `real` parameters, with dim 0 as the batch dim.
- Args:
- group: dict to look up configuration values
- p: parameter to update (actually multiple parameters stacked together
- as a batch)
-          state: state-dict for p, to look up the optimizer state
-          clipping_scale: a scale factor (<= 1.0) to multiply the gradients by
-             before applying the rest of the update, as returned by
-             _get_clipping_scale()
- """
- lr = group["lr"]
- size_update_period = group["size_update_period"]
- beta1 = group["betas"][0]
-
- grad = p.grad
- if clipping_scale != 1.0:
- grad = grad * clipping_scale
- step = state["step"]
- delta = state["delta"]
-
- delta.mul_(beta1)
- batch_size = p.shape[0]
- numel = p.numel() // batch_size
- if numel > 1:
- # Update the size/scale of p, and set param_rms
- scale_grads = state["scale_grads"]
- scale_grads[step % size_update_period] = (p * grad).sum(
- dim=list(range(1, p.ndim)), keepdim=True
- )
- if step % size_update_period == size_update_period - 1:
- param_rms = state["param_rms"] # shape: (batch_size, 1, 1, ..)
- param_rms.copy_(
- (p**2).mean(dim=list(range(1, p.ndim)), keepdim=True).sqrt()
- )
- if step > 0:
- # self._size_update() learns the overall scale on the
- # parameter, by shrinking or expanding it.
- self._size_update(group, scale_grads, p, state)
-
- if numel == 1:
- # For parameters with 1 element we just use regular Adam.
- # Updates delta.
- self._step_scalar(group, p, state)
- else:
- self._step(group, p, state)
-
- state["step"] = step + 1
-
- def _size_update(
- self, group: dict, scale_grads: Tensor, p: Tensor, state: dict
- ) -> None:
- """
- Called only where p.numel() > 1, this updates the scale of the parameter.
- If we imagine: p = underlying_param * scale.exp(), and we are doing
- gradient descent on underlying param and on scale, this function does the update
- on `scale`.
-
- Args:
- group: dict to look up configuration values
- scale_grads: a tensor of shape (size_update_period, batch_size, 1, 1,...) containing
- grads w.r.t. the scales.
- p: The parameter to update
- state: The state-dict of p
- """
-
- param_rms = state["param_rms"]
- beta1, beta2 = group["betas"]
- size_lr = group["lr"] * group["scalar_lr_scale"]
- param_min_rms = group["param_min_rms"]
- param_max_rms = group["param_max_rms"]
- eps = group["eps"]
- step = state["step"]
- batch_size = p.shape[0]
-
- size_update_period = scale_grads.shape[0]
- # correct beta2 for the size update period: we will have
- # faster decay at this level.
- beta2_corr = beta2**size_update_period
-
- scale_exp_avg_sq = state["scale_exp_avg_sq"] # shape: (batch_size, 1, 1, ..)
- scale_exp_avg_sq.mul_(beta2_corr).add_(
- (scale_grads**2).mean(dim=0), # mean over dim `size_update_period`
- alpha=1 - beta2_corr,
- ) # shape is (batch_size, 1, 1, ...)
-
- # The 1st time we reach here is when size_step == 1.
- size_step = (step + 1) // size_update_period
- bias_correction2 = 1 - beta2_corr**size_step
- # we don't bother with bias_correction1; this will help prevent divergence
- # at the start of training.
-
- denom = scale_exp_avg_sq.sqrt() + eps
-
- scale_step = (
- -size_lr * (bias_correction2**0.5) * scale_grads.sum(dim=0) / denom
- )
-
- is_too_small = param_rms < param_min_rms
-
- # when the param gets too small, just don't shrink it any further.
- scale_step.masked_fill_(is_too_small, 0.0)
-
- # and ensure the parameter rms after update never exceeds param_max_rms.
- # We have to look at the trained model for parameters at or around the
- # param_max_rms, because sometimes they can indicate a problem with the
- # topology or settings.
-        scale_step = torch.minimum(
-            scale_step, (param_max_rms - param_rms) / param_rms
-        )
-
- delta = state["delta"]
- # the factor of (1-beta1) relates to momentum.
- delta.add_(p * scale_step, alpha=(1 - beta1))
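
Why `(p * grad).sum()` (accumulated into `scale_grads` by `_step_one_batch`) is the right gradient for the scale: writing p = underlying_param * scale.exp(), the chain rule gives d(loss)/d(scale) = sum_i d(loss)/dp_i * p_i. A minimal standalone check of that identity, with made-up values and not part of the deleted file:

import torch

u = torch.randn(8)                       # the "underlying" parameter
s = torch.zeros((), requires_grad=True)  # the log-scale
p = u * s.exp()
p.retain_grad()
loss = (p ** 3).sum()                    # any differentiable loss will do
loss.backward()
# d(loss)/d(log-scale) equals (p * d(loss)/dp).sum(), the quantity kept in scale_grads.
assert torch.allclose(s.grad, (p.detach() * p.grad).sum())
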
-
- def _step(self, group: dict, p: Tensor, state: dict):
- """
- This function does the core update of self.step(), in the case where the members of
- the batch have more than 1 element.
-
- Args:
- group: A dict which will be used to look up configuration values
-            p: The parameter to be updated (its gradient is read from ``p.grad``)
-            state: The state-dict corresponding to parameter p
-
- This function modifies p.
- """
- grad = p.grad
- lr = group["lr"]
- beta1, beta2 = group["betas"]
- eps = group["eps"]
- param_min_rms = group["param_min_rms"]
- step = state["step"]
-
- exp_avg_sq = state["exp_avg_sq"]
- exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=(1 - beta2))
-
- this_step = state["step"] - (state["zero_step"] if "zero_step" in state else 0)
- bias_correction2 = 1 - beta2 ** (this_step + 1)
- if bias_correction2 < 0.99:
- # note: not in-place.
- exp_avg_sq = exp_avg_sq * (1.0 / bias_correction2)
-
- denom = exp_avg_sq.sqrt()
- denom += eps
- grad = grad / denom
-
- alpha = -lr * (1 - beta1) * state["param_rms"].clamp(min=param_min_rms)
-
- delta = state["delta"]
- delta.add_(grad * alpha)
- p.add_(delta)
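
Read as plain tensor algebra, `_step_one_batch` plus `_step` amount to the update sketched below for a non-scalar parameter. This is a simplified, standalone restatement (bias correction, the periodic size update and gradient clipping are left out, and the whole tensor is treated as a single parameter rather than a batch):

import torch

lr, beta1, beta2, eps, param_min_rms = 0.03, 0.9, 0.98, 1e-8, 1e-5
p = torch.randn(4, 3)
delta = torch.zeros_like(p)       # momentum buffer, cf. state["delta"]
exp_avg_sq = torch.zeros_like(p)  # cf. state["exp_avg_sq"]

for _ in range(3):                # a few fake steps
    g = torch.randn_like(p)       # stand-in for p.grad
    exp_avg_sq = beta2 * exp_avg_sq + (1 - beta2) * g * g
    param_rms = (p ** 2).mean().sqrt().clamp(min=param_min_rms)
    delta = beta1 * delta - (1 - beta1) * lr * param_rms * g / (exp_avg_sq.sqrt() + eps)
    p = p + delta

So the step is Adam-normalized, but its size is tied to the parameter's own RMS rather than to an absolute learning rate, which is what makes the optimizer "scaled".
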
-
- def _step_scalar(self, group: dict, p: Tensor, state: dict):
- """
- A simplified form of the core update for scalar tensors, where we cannot get a good
- estimate of the parameter rms.
- """
- beta1, beta2 = group["betas"]
- scalar_max = group["scalar_max"]
- eps = group["eps"]
- lr = group["lr"] * group["scalar_lr_scale"]
- grad = p.grad
-
- exp_avg_sq = state["exp_avg_sq"] # shape: (batch_size,)
- exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
-
- # bias_correction2 is like in Adam. Don't bother with bias_correction1;
- # slower update at the start will help stability anyway.
- bias_correction2 = 1 - beta2 ** (state["step"] + 1)
- denom = (exp_avg_sq / bias_correction2).sqrt() + eps
-
- delta = state["delta"]
- delta.add_(grad / denom, alpha=-lr * (1 - beta1))
- p.clamp_(min=-scalar_max, max=scalar_max)
- p.add_(delta)
-
-
-class LRScheduler(object):
- """
- Base-class for learning rate schedulers where the learning-rate depends on both the
- batch and the epoch.
- """
-
- def __init__(self, optimizer: Optimizer, verbose: bool = False):
- # Attach optimizer
- if not isinstance(optimizer, Optimizer):
- raise TypeError("{} is not an Optimizer".format(type(optimizer).__name__))
- self.optimizer = optimizer
- self.verbose = verbose
-
- for group in optimizer.param_groups:
- group.setdefault("base_lr", group["lr"])
-
- self.base_lrs = [group["base_lr"] for group in optimizer.param_groups]
-
- self.epoch = 0
- self.batch = 0
-
- def state_dict(self):
- """Returns the state of the scheduler as a :class:`dict`.
-
- It contains an entry for every variable in self.__dict__ which
- is not the optimizer.
- """
- return {
- "base_lrs": self.base_lrs,
- "epoch": self.epoch,
- "batch": self.batch,
- }
-
- def load_state_dict(self, state_dict):
- """Loads the schedulers state.
-
- Args:
- state_dict (dict): scheduler state. Should be an object returned
- from a call to :meth:`state_dict`.
- """
- self.__dict__.update(state_dict)
-
- def get_last_lr(self) -> List[float]:
- """Return last computed learning rate by current scheduler. Will be a list of float."""
- return self._last_lr
-
- def get_lr(self):
- # Compute list of learning rates from self.epoch and self.batch and
- # self.base_lrs; this must be overloaded by the user.
- # e.g. return [some_formula(self.batch, self.epoch, base_lr) for base_lr in self.base_lrs ]
- raise NotImplementedError
-
- def step_batch(self, batch: Optional[int] = None) -> None:
- # Step the batch index, or just set it. If `batch` is specified, it
- # must be the batch index from the start of training, i.e. summed over
- # all epochs.
- # You can call this in any order; if you don't provide 'batch', it should
- # of course be called once per batch.
- if batch is not None:
- self.batch = batch
- else:
- self.batch = self.batch + 1
- self._set_lrs()
-
- def step_epoch(self, epoch: Optional[int] = None):
- # Step the epoch index, or just set it. If you provide the 'epoch' arg,
- # you should call this at the start of the epoch; if you don't provide the 'epoch'
- # arg, you should call it at the end of the epoch.
- if epoch is not None:
- self.epoch = epoch
- else:
- self.epoch = self.epoch + 1
- self._set_lrs()
-
- def _set_lrs(self):
- values = self.get_lr()
- assert len(values) == len(self.optimizer.param_groups)
-
- for i, data in enumerate(zip(self.optimizer.param_groups, values)):
- param_group, lr = data
- param_group["lr"] = lr
- self.print_lr(self.verbose, i, lr)
- self._last_lr = [group["lr"] for group in self.optimizer.param_groups]
-
- def print_lr(self, is_verbose, group, lr):
- """Display the current learning rate."""
- if is_verbose:
- logging.info(
- f"Epoch={self.epoch}, batch={self.batch}: adjusting learning rate"
- f" of group {group} to {lr:.4e}."
- )
-
-
-class Eden(LRScheduler):
- """
- Eden scheduler.
- The basic formula (before warmup) is:
- lr = base_lr * (((batch**2 + lr_batches**2) / lr_batches**2) ** -0.25 *
- (((epoch**2 + lr_epochs**2) / lr_epochs**2) ** -0.25)) * warmup
-    where `warmup` increases linearly from `warmup_start` (default 0.5) to 1 over
-    `warmup_batches` batches and then stays constant at 1.
-
-
-    E.g. a base_lr of 0.04 (passed to the optimizer) is suggested when used with
-    ScaledAdam.
-
- Args:
- optimizer: the optimizer to change the learning rates on
- lr_batches: the number of batches after which we start significantly
- decreasing the learning rate, suggest 5000.
- lr_epochs: the number of epochs after which we start significantly
- decreasing the learning rate, suggest 6 if you plan to do e.g.
-           20 to 40 epochs, but you may need a smaller number if the dataset
-           is huge and you will only do a few epochs.
-        warmup_batches: the number of batches over which `warmup` increases
-           linearly from `warmup_start` to 1 (default 500).
-        warmup_start: the initial value of the warmup factor (default 0.5).
- """
-
- def __init__(
- self,
- optimizer: Optimizer,
- lr_batches: Union[int, float],
- lr_epochs: Union[int, float],
- warmup_batches: Union[int, float] = 500.0,
- warmup_start: float = 0.5,
- verbose: bool = False,
- ):
- super(Eden, self).__init__(optimizer, verbose)
- self.lr_batches = lr_batches
- self.lr_epochs = lr_epochs
- self.warmup_batches = warmup_batches
-
- assert 0.0 <= warmup_start <= 1.0, warmup_start
- self.warmup_start = warmup_start
-
- def get_lr(self):
- factor = (
- (self.batch**2 + self.lr_batches**2) / self.lr_batches**2
- ) ** -0.25 * (
- ((self.epoch**2 + self.lr_epochs**2) / self.lr_epochs**2) ** -0.25
- )
- warmup_factor = (
- 1.0
- if self.batch >= self.warmup_batches
- else self.warmup_start + (1.0 - self.warmup_start) * (self.batch / self.warmup_batches)
- # else 0.5 + 0.5 * (self.batch / self.warmup_batches)
- )
-
- return [x * factor * warmup_factor for x in self.base_lrs]
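
As a quick check on the formula above (a standalone sketch; the helper name eden_factor is made up), the decay factor equals 1 at the very start and drops to 2 ** -0.5, i.e. about 0.707 of base_lr, once batch == lr_batches and epoch == lr_epochs, ignoring warmup:

def eden_factor(batch, epoch, lr_batches=5000.0, lr_epochs=6.0):
    # The decay part of Eden.get_lr(), with the warmup term left out.
    batch_term = ((batch ** 2 + lr_batches ** 2) / lr_batches ** 2) ** -0.25
    epoch_term = ((epoch ** 2 + lr_epochs ** 2) / lr_epochs ** 2) ** -0.25
    return batch_term * epoch_term

assert abs(eden_factor(0, 0) - 1.0) < 1e-9
assert abs(eden_factor(5000, 6) - 2 ** -0.5) < 1e-9
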
-
-
-def _test_eden():
- m = torch.nn.Linear(100, 100)
- optim = ScaledAdam(m.parameters(), lr=0.03)
-
- scheduler = Eden(optim, lr_batches=100, lr_epochs=2, verbose=True)
-
- for epoch in range(10):
- scheduler.step_epoch(epoch) # sets epoch to `epoch`
-
- for step in range(20):
- x = torch.randn(200, 100).detach()
- x.requires_grad = True
- y = m(x)
- dy = torch.randn(200, 100).detach()
- f = (y * dy).sum()
- f.backward()
-
- optim.step()
- scheduler.step_batch()
- optim.zero_grad()
-
- logging.info(f"last lr = {scheduler.get_last_lr()}")
- logging.info(f"state dict = {scheduler.state_dict()}")
-
-
-# This is included mostly as a baseline for ScaledAdam.
-class Eve(Optimizer):
- """
- Implements Eve algorithm. This is a modified version of AdamW with a special
- way of setting the weight-decay / shrinkage-factor, which is designed to make the
- rms of the parameters approach a particular target_rms (default: 0.1). This is
- for use with networks with 'scaled' versions of modules (see scaling.py), which
- will be close to invariant to the absolute scale on the parameter matrix.
-
- The original Adam algorithm was proposed in `Adam: A Method for Stochastic Optimization`_.
- The AdamW variant was proposed in `Decoupled Weight Decay Regularization`_.
- Eve is unpublished so far.
-
- Arguments:
- params (iterable): iterable of parameters to optimize or dicts defining
- parameter groups
- lr (float, optional): learning rate (default: 1e-3)
- betas (Tuple[float, float], optional): coefficients used for computing
-            running averages of gradient and its square (default: (0.9, 0.98))
- eps (float, optional): term added to the denominator to improve
- numerical stability (default: 1e-8)
-        weight_decay (float, optional): weight decay coefficient (default: 1e-3;
-            this value means that the weight would decay significantly after
-            about 3k minibatches). It is not multiplied by the learning rate,
-            and it is only applied while the RMS value of the parameter is
-            greater than target_rms.
-        target_rms (float, optional): target root-mean-square value of the
-            parameters; if a parameter's RMS falls below this, weight decay is
-            no longer applied to it.
-
-
- .. _Adam: A Method for Stochastic Optimization:
- https://arxiv.org/abs/1412.6980
- .. _Decoupled Weight Decay Regularization:
- https://arxiv.org/abs/1711.05101
- .. _On the Convergence of Adam and Beyond:
- https://openreview.net/forum?id=ryQu7f-RZ
- """
-
- def __init__(
- self,
- params,
- lr=1e-3,
- betas=(0.9, 0.98),
- eps=1e-8,
- weight_decay=1e-3,
- target_rms=0.1,
- ):
- if not 0.0 <= lr:
- raise ValueError("Invalid learning rate: {}".format(lr))
- if not 0.0 <= eps:
- raise ValueError("Invalid epsilon value: {}".format(eps))
- if not 0.0 <= betas[0] < 1.0:
- raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
- if not 0.0 <= betas[1] < 1.0:
- raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
- if not 0 <= weight_decay <= 0.1:
- raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
- if not 0 < target_rms <= 10.0:
- raise ValueError("Invalid target_rms value: {}".format(target_rms))
- defaults = dict(
- lr=lr,
- betas=betas,
- eps=eps,
- weight_decay=weight_decay,
- target_rms=target_rms,
- )
- super(Eve, self).__init__(params, defaults)
-
- def __setstate__(self, state):
- super(Eve, self).__setstate__(state)
-
- @torch.no_grad()
- def step(self, closure=None):
- """Performs a single optimization step.
-
- Arguments:
- closure (callable, optional): A closure that reevaluates the model
- and returns the loss.
- """
- loss = None
- if closure is not None:
- with torch.enable_grad():
- loss = closure()
-
- for group in self.param_groups:
- for p in group["params"]:
- if p.grad is None:
- continue
-
- # Perform optimization step
- grad = p.grad
- if grad.is_sparse:
-                    raise RuntimeError("Eve does not support sparse gradients")
-
- state = self.state[p]
-
- # State initialization
- if len(state) == 0:
- state["step"] = 0
- # Exponential moving average of gradient values
- state["exp_avg"] = torch.zeros_like(
- p, memory_format=torch.preserve_format
- )
- # Exponential moving average of squared gradient values
- state["exp_avg_sq"] = torch.zeros_like(
- p, memory_format=torch.preserve_format
- )
-
- exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"]
-
- beta1, beta2 = group["betas"]
-
- state["step"] += 1
- bias_correction1 = 1 - beta1 ** state["step"]
- bias_correction2 = 1 - beta2 ** state["step"]
-
- # Decay the first and second moment running average coefficient
- exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
- exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
- denom = (exp_avg_sq.sqrt() * (bias_correction2**-0.5)).add_(
- group["eps"]
- )
-
- step_size = group["lr"] / bias_correction1
- target_rms = group["target_rms"]
- weight_decay = group["weight_decay"]
-
- if p.numel() > 1:
- # avoid applying this weight-decay on "scaling factors"
- # (which are scalar).
- is_above_target_rms = p.norm() > (target_rms * (p.numel() ** 0.5))
- p.mul_(1 - (weight_decay * is_above_target_rms))
-
- p.addcdiv_(exp_avg, denom, value=-step_size)
-
- if random.random() < 0.0005:
- step = (exp_avg / denom) * step_size
- logging.info(
- f"Delta rms = {(step**2).mean().item()}, shape = {step.shape}"
- )
-
- return loss
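
The `is_above_target_rms` test above is an RMS comparison in disguise: since p.norm() == rms(p) * sqrt(p.numel()), the condition p.norm() > target_rms * p.numel() ** 0.5 holds exactly when rms(p) > target_rms, so weight decay only shrinks tensors whose RMS is still above the target. A standalone check with made-up values:

import torch

target_rms = 0.1
p = torch.full((4, 8), 0.2)  # rms(p) == 0.2
norm_test = bool(p.norm() > target_rms * (p.numel() ** 0.5))
rms_test = bool((p ** 2).mean().sqrt() > target_rms)
assert norm_test and rms_test  # both True here; the two tests always agree
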
-
-
-def _test_scaled_adam(hidden_dim: int):
- import timeit
-
- from scaling import ScaledLinear
-
- E = 100
- B = 4
- T = 2
-    logging.info("in _test_scaled_adam")
- # device = torch.device('cuda')
- device = torch.device("cpu")
- dtype = torch.float32
-
- fix_random_seed(42)
-    # these input_magnitudes and output_magnitudes are to test that the
-    # optimizer under test is able to adjust the scales of different dims
-    # differently.
- input_magnitudes = (1.0 * torch.randn(E, dtype=dtype, device=device)).exp()
- output_magnitudes = (1.0 * torch.randn(E, dtype=dtype, device=device)).exp()
-
- for iter in [1, 0]:
- fix_random_seed(42)
- Linear = torch.nn.Linear if iter == 0 else ScaledLinear
-
- m = torch.nn.Sequential(
- Linear(E, hidden_dim),
- torch.nn.PReLU(),
- Linear(hidden_dim, hidden_dim),
- torch.nn.PReLU(),
- Linear(hidden_dim, E),
- ).to(device)
-
- train_pairs = [
- (
- 100.0
- * torch.randn(B, T, E, device=device, dtype=dtype)
- * input_magnitudes,
- torch.randn(B, T, E, device=device, dtype=dtype) * output_magnitudes,
- )
- for _ in range(20)
- ]
-
- if iter == 0:
- optim = Eve(m.parameters(), lr=0.003)
- elif iter == 1:
- optim = ScaledAdam(m.parameters(), lr=0.03, clipping_scale=2.0)
- scheduler = Eden(optim, lr_batches=200, lr_epochs=5, verbose=False)
-
- start = timeit.default_timer()
- avg_loss = 0.0
- for epoch in range(180):
- scheduler.step_epoch()
- # if epoch == 100 and iter in [2,3]:
- # optim.reset_speedup() # check it doesn't crash.
-
- # if epoch == 130:
- # opts = diagnostics.TensorDiagnosticOptions(
- # 2 ** 22
- # ) # allow 4 megabytes per sub-module
- # diagnostic = diagnostics.attach_diagnostics(m, opts)
-
- for n, (x, y) in enumerate(train_pairs):
- y_out = m(x)
- loss = ((y_out - y) ** 2).mean() * 100.0
- if epoch == 0 and n == 0:
- avg_loss = loss.item()
- else:
- avg_loss = 0.98 * avg_loss + 0.02 * loss.item()
- if n == 0 and epoch % 5 == 0:
- # norm1 = '%.2e' % (m[0].weight**2).mean().sqrt().item()
- # norm1b = '%.2e' % (m[0].bias**2).mean().sqrt().item()
- # norm2 = '%.2e' % (m[2].weight**2).mean().sqrt().item()
- # norm2b = '%.2e' % (m[2].bias**2).mean().sqrt().item()
- # scale1 = '%.2e' % (m[0].weight_scale.exp().item())
- # scale1b = '%.2e' % (m[0].bias_scale.exp().item())
- # scale2 = '%.2e' % (m[2].weight_scale.exp().item())
- # scale2b = '%.2e' % (m[2].bias_scale.exp().item())
- lr = scheduler.get_last_lr()[0]
- logging.info(
- f"Iter {iter}, epoch {epoch}, batch {n}, avg_loss {avg_loss:.4g}, lr={lr:.4e}"
- ) # , norms={norm1,norm1b,norm2,norm2b}") # scales={scale1,scale1b,scale2,scale2b}
- loss.log().backward()
- optim.step()
- optim.zero_grad()
- scheduler.step_batch()
-
- # diagnostic.print_diagnostics()
-
- stop = timeit.default_timer()
- logging.info(f"Iter={iter}, Time taken: {stop - start}")
-
- logging.info(f"last lr = {scheduler.get_last_lr()}")
- # logging.info("state dict = ", scheduler.state_dict())
- # logging.info("optim state_dict = ", optim.state_dict())
- logging.info(f"input_magnitudes = {input_magnitudes}")
- logging.info(f"output_magnitudes = {output_magnitudes}")
-
-
-if __name__ == "__main__":
- torch.set_num_threads(1)
- torch.set_num_interop_threads(1)
- logging.getLogger().setLevel(logging.INFO)
- import subprocess
-
- s = subprocess.check_output(
- "git status -uno .; git log -1; git diff HEAD .", shell=True
- )
- logging.info(s)
- import sys
-
- if len(sys.argv) > 1:
- hidden_dim = int(sys.argv[1])
- else:
- hidden_dim = 200
-
- _test_scaled_adam(hidden_dim)
- _test_eden()
diff --git a/egs/aishell/ASR/seamlessm4t/patch/sequence_generator.py b/egs/aishell/ASR/seamlessm4t/patch/sequence_generator.py
deleted file mode 100644
index d13cc08d0..000000000
--- a/egs/aishell/ASR/seamlessm4t/patch/sequence_generator.py
+++ /dev/null
@@ -1,694 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
-
-import math
-from dataclasses import dataclass
-from typing import List, Optional, Tuple, Union, cast
-
-import torch
-from torch import Tensor
-from torch.nn.functional import log_softmax
-
-from fairseq2.data import Collater, SequenceData, VocabularyInfo
-from fairseq2.generation.beam_search import BeamSearch, StandardBeamSearch
-from fairseq2.generation.logits_processor import LogitsProcessor
-from fairseq2.models.encoder_decoder import Seq2SeqDecoder
-from fairseq2.nn.incremental_state import IncrementalStateBag
-from fairseq2.typing import Device
-
-
-@dataclass
-class SequenceGeneratorOptions:
- """Holds the options to pass to a sequence generator."""
-
- beam_size: int = 5
- """The beam size."""
-
- min_seq_len: int = 1
- """The minimum length of generated sequences (including prefix sequence)."""
-
- soft_max_seq_len: Optional[Tuple[int, int]] = (1, 200)
- """The terms ``a`` and ``b`` of ``ax + b`` where ``x`` is the source
- sequence length. The generated sequences (including prefix sequence) will
- have the maximum length of ``min(hard_max_seq_len, ax + b)``. See also
- ``hard_max_seq_len``."""
-
- hard_max_seq_len: int = 1024
- """The hard limit on maximum length of generated sequences."""
-
- len_penalty: float = 1.0
- """The length penalty, where values less than 1.0 favor shorter, values
- greater than 1.0 favor longer sequences."""
-
- unk_penalty: float = 0.0
- """The unknown symbol penalty, where values less than 0 produce more UNKs,
- values greater than 0 produce fewer UNKs."""
-
- normalize_scores: bool = True
- """If ``True``, normalizes scores by the length of generated sequences."""
-
- search: Optional[BeamSearch] = None
- """The beam search algorithm to use."""
-
- logits_processor: Optional[LogitsProcessor] = None
- """Logits processor called before applying beam search step."""
-
-
-class Seq2SeqGenerator:
- """Represents a sequence-to-sequence generator."""
-
- decoder: Seq2SeqDecoder
- opts: SequenceGeneratorOptions
- beam_size: int
- eos_idx: int
- pad_idx: Optional[int]
- unk_idx: Optional[int]
- prefix_seq: Union[int, Tensor]
- prefix_seq_len: int
- search: BeamSearch
- logits_processor: Optional[LogitsProcessor]
- collater: Collater
-
- def __init__(
- self,
- decoder: Seq2SeqDecoder,
- vocab_info: VocabularyInfo,
- prefix_seq: Optional[Union[int, Tensor]],
- opts: Optional[SequenceGeneratorOptions] = None,
- ) -> None:
- """
- :param decoder:
- The decoder to use.
- :param vocab_info:
- The vocabulary information to use.
- :param prefix_seq:
- The prefix sequence, typically one or more control symbols
- indicating the beginning of a sequence. *Shape:* :math:`()` or
- :math:`(S)`, where :math:`S` is the sequence length. If ``None``,
- the EOS symbol will be used as prefix.
- :param opts:
- The generation options.
- """
- self.decoder = decoder
-
- self.opts = opts or SequenceGeneratorOptions()
-
- # Set beam size.
- if vocab_info.pad_idx is None:
- self.beam_size = min(self.opts.beam_size, vocab_info.size)
- else:
- # -1 since we never select PAD.
- self.beam_size = min(self.opts.beam_size, vocab_info.size - 1)
-
- if vocab_info.eos_idx is None:
- raise ValueError(
- "`vocab_info` must have `eos_idx` set for sequence generation."
- )
-
-        # Set vocab info. Note that this patch hard-codes the special token ids
-        # (eos=1, unk=2, pad=0) instead of taking them from `vocab_info`; the
-        # original assignments are kept below, commented out.
-        self.eos_idx = 1
-        # self.eos_idx = vocab_info.eos_idx
-        self.unk_idx = 2
-        # self.unk_idx = vocab_info.unk_idx
-        self.pad_idx = 0
-        # self.pad_idx = vocab_info.pad_idx
-
-        # Set prefix sequence. This patch always takes the EOS-based prefix
-        # branch below, regardless of `prefix_seq`.
-        if 1:
-            # if prefix_seq is None:
- # If `None`, we follow fairseq's convention, and use EOS as the
- # prefix.
- self.prefix_seq, self.prefix_seq_len = self.eos_idx, 1
- else:
- self.prefix_seq = prefix_seq
-
- if isinstance(prefix_seq, Tensor):
- num_dim = prefix_seq.dim()
-
- if num_dim >= 2:
- raise ValueError(
- f"`prefix_seq` must be a scalar or a 1-dimensional tensor, but is {num_dim}-dimensional instead."
- )
-
- self.prefix_seq_len = 1 if num_dim == 0 else prefix_seq.size(0)
- else:
- self.prefix_seq_len = 1
-
- # Set beam search.
- self.search = self.opts.search or StandardBeamSearch()
- self.logits_processor = self.opts.logits_processor
-
- if vocab_info.pad_idx is None:
- self.collater = Collater()
- else:
- self.collater = Collater(self.pad_idx, pad_to_multiple=2)
-
- @torch.inference_mode()
- def __call__(
- self,
- encoder_output: Tensor,
- encoder_padding_mask: Optional[Tensor],
- source_seq_len: Optional[int] = None,
- ) -> "SequenceGeneratorOutput":
- opts = self.opts
-
- num_searches = encoder_output.size(0)
-
- beam_size = opts.beam_size
-
- max_seq_len = self._determine_max_seq_len(source_seq_len)
-
- device = encoder_output.device
-
- encoder_output, encoder_padding_mask = self._fan_out_encoder_output(
- encoder_output, encoder_padding_mask
- )
-
- # Each element contains the id of the search corresponding to a single
- # source sequence and its hypotheses.
- active_searches: List[Tuple[int, List[Hypothesis]]] = [
- (search_idx, []) for search_idx in range(num_searches)
- ]
-
- # Once a source sequence has `beam_size` hypotheses, its search is moved
- # from `active_searches` to `finished_searches`.
- finished_searches: List[List[Hypothesis]] = [[] for i in range(num_searches)]
-
- num_remaining_searches = num_searches
-
- # Initialize buffers.
- # (N x B, S)
- seqs = torch.zeros(
- (num_searches * beam_size, max_seq_len), device=device, dtype=torch.int64
- )
-
- # (N x B, S)
- scores = torch.zeros(
- (num_searches * beam_size, max_seq_len), device=device, dtype=torch.float32
- )
-
- # A list that indicates beams that should be ignored in the next step.
- ignored_beam_mask = torch.full(
- (num_searches, beam_size), False, device=device, dtype=torch.bool
- )
-
- # An offset array for converting between batch-wide and search-local
- # beam indices.
- # (B)
- search_offsets = torch.arange(num_searches, device=device) * beam_size
-
- # (B) -> (B, 1)
- search_offsets.unsqueeze_(-1)
-
- cand_offsets = torch.arange(2 * beam_size, device=device)
-
- state_bag = IncrementalStateBag()
-
- # At this point, the state is fully initialized, kick off the search.
- self._bootstrap_seqs_and_scores(
- seqs, scores, encoder_output, encoder_padding_mask, state_bag
- )
-
- start_step = self.prefix_seq_len - 1
-
- # Holds the indices of beams (a beam can occur more than once) that we
- # should continue with in the next step.
- beam_indices: Optional[Tensor] = None
-
- # Holds the indices of searches that we should continue with in the next
- # step. If not `None`, it means we finalized one or more searches in the
- # last step.
- search_indices: Optional[Tensor] = None
-
- for step_nr in range(start_step, max_seq_len - 1):
- if beam_indices is not None:
- # If not `None`, it means in the last step we finalized one or
- # more searches. We should ensure that we adjust `beam_indices`
- # before reordering `decoder`'s incremental state.
- if search_indices is not None:
- num_searches = search_indices.numel()
-
- # (N)
- delta = search_indices - torch.arange(num_searches, device=device)
-
- # (N) -> (N, 1)
- delta.unsqueeze_(-1)
-
- # Adjust indices to take into account removed searches.
- beam_indices.view(num_searches, beam_size).add_(delta * beam_size)
-
- state_bag.reorder(beam_indices)
-
- decoder_output, decoder_padding_mask = self.decoder.decode(
- seqs[:, step_nr : step_nr + 1],
- None, # We never generate PAD.
- encoder_output,
- encoder_padding_mask,
- state_bag,
- )
-
- state_bag.increment_step()
-
- model_output = self.decoder.project(decoder_output, decoder_padding_mask)
-
- # lprobs: (1, V)
- # model_output: (N, 1, V)
- lprobs = log_softmax(model_output.logits, dim=-1, dtype=torch.float32)
-
- # Do not allow EOS before reaching the minimum sequence length.
- if step_nr < self.opts.min_seq_len:
- lprobs[:, :, self.eos_idx] = -torch.inf
-
- # fmt: off
- # If we have reached the maximum length, force the last step to be
- # EOS.
- if step_nr == max_seq_len - 2:
- lprobs[:, :, : self.eos_idx] = -torch.inf
- lprobs[:, :, self.eos_idx + 1 :] = -torch.inf
- # fmt: on
-
- # Never allow PAD.
- if self.pad_idx is not None:
- lprobs[:, :, self.pad_idx] = -torch.inf
-
- # Apply UNK penalty.
- if self.unk_idx is not None:
- lprobs[:, :, self.unk_idx] -= self.opts.unk_penalty
-
- # update scores in place using logits_processor
- if self.logits_processor is not None:
- self.logits_processor(
- seqs.view(num_searches, beam_size, -1)[:, :, : step_nr + 1],
- lprobs.view(num_searches, beam_size, -1),
- )
-
- # Determine candidates for the next step.
- # (N, 2 x B)
- cand_scores, cand_indices, cand_beam_indices = self.search.step(
- step_nr,
- step_nr == start_step,
- lprobs.view(num_searches, beam_size, -1),
- scores.view(num_searches, beam_size, -1)[:, :, : step_nr + 1],
- )
-
- # Convert search-local beam indices to batch-wide beam indices.
- # (N, 2 x B) + (N) -> (N, 2 x B)
- global_cand_beam_indices = cand_beam_indices + search_offsets
-
- # Finalize beams that reached the minimum length and that end with
- # an EOS.
- # (N, 2 x B)
- eos_mask = (cand_indices == self.eos_idx) & (cand_scores != -math.inf)
-
- # Do not attempt to finalize beams that should be ignored.
- eos_mask[:, :beam_size][ignored_beam_mask] = False
-
- # Only consider EOS when it's among the top `beam_size` indices. Now
- # we know what beam(s) to finalize.
- # (N, B)
- eos_beam_indices = torch.masked_select(
- global_cand_beam_indices[:, :beam_size], mask=eos_mask[:, :beam_size]
- )
-
- if eos_beam_indices.numel() > 0:
- # Select the scores of the finalized beams.
- # (N, B)
- eos_scores = torch.masked_select(
- cand_scores[:, :beam_size], mask=eos_mask[:, :beam_size]
- )
-
- newly_finished_searches = self._finalize_hypothesis(
- step_nr,
- eos_beam_indices,
- eos_scores,
- seqs,
- scores,
- active_searches,
- finished_searches,
- )
-
- num_remaining_searches -= len(newly_finished_searches)
-
- if num_remaining_searches == 0:
- break
- else:
- newly_finished_searches = None
-
- # Remove finished searches (ones for which `beam_size` finalized
- # beams have been generated) from the batch.
- if newly_finished_searches:
- new_num_searches = num_searches - len(newly_finished_searches)
-
- # Construct `search_indices` which holds indices of searches
- # to keep for the next step.
- search_mask = torch.full((num_searches,), True, device=device)
-
- search_mask[newly_finished_searches] = False
-
- search_indices = torch.arange(num_searches, device=device)
-
- search_indices = search_indices.masked_select(search_mask)
-
- # fmt: off
- # Filter out removed batches from state variables.
- # (N, B) -> (N - F, B)
- ignored_beam_mask = ignored_beam_mask[search_indices]
-
- # (N, 2 x B) -> (N - F, 2 x B)
- cand_scores = cand_scores [search_indices]
- cand_indices = cand_indices [search_indices]
- cand_beam_indices = cand_beam_indices[search_indices]
-
- # (N) -> (N - F)
- search_offsets.resize_(new_num_searches, 1)
-
- # (N - F, 2 x B) + (N - F) -> (N - F, 2 x B)
- global_cand_beam_indices = cand_beam_indices + search_offsets
-
- # (N, 2 x B) -> (N - F, 2 x B)
- eos_mask = eos_mask[search_indices]
-
- # (N x B, S) -> (N, B, S)
- seqs = seqs .view(num_searches, -1)
- scores = scores.view(num_searches, -1)
-
- # (N, B, S + 1) -> ((N - F) x B, S)
- seqs = seqs [search_indices].view(new_num_searches * beam_size, -1)
- scores = scores[search_indices].view(new_num_searches * beam_size, -1)
-
- # (N x B, S_enc, M) -> (N, B, S_enc, M)
- encoder_output = encoder_output.unflatten(0, (num_searches, -1))
-
- # (N, B, S_enc, M) -> ((N - F) x B, S_enc, M)
- encoder_output = encoder_output[search_indices].flatten(0, 1)
-
- if encoder_padding_mask is not None:
- # (N x B, S_enc, M) -> (N, B, S_enc, M)
- padding_mask = encoder_padding_mask.unflatten(0, (num_searches, -1))
-
- # (N, B, S_enc, M) -> ((N - F) x B, S_enc, M)
- encoder_padding_mask = padding_mask[search_indices].flatten(0, 1)
- # fmt: on
-
- num_searches = new_num_searches
- else:
- search_indices = None
-
- eos_mask[:, :beam_size][ignored_beam_mask] = True
-
- # Set `beam_weights` so that values greater than or equal to 2 x
- # `beam_size` indicate finished beams (i.e. end with EOS) and values
- # less than 2 x `beam_size` indicate active beams.
- # (N, 2 x B)
- beam_weights = cand_offsets + (eos_mask * (2 * beam_size))
-
- # Get the top `beam_size` active beams, which are the beams with the
- # smallest weights in `active_beam_weights`.
- # (N, B)
- active_beam_weights, active_beams = torch.topk(
- beam_weights, k=beam_size, dim=1, largest=False
- )
-
- # Update to ignore finalized beams in the next step.
- # (N, B)
- ignored_beam_mask = active_beam_weights >= 2 * beam_size
-
- # We should always have at least one active beam in each search.
- assert (~ignored_beam_mask).any(dim=1).all()
-
- # Denotes which beams are continued for each new hypothesis (a beam
- # can be selected more than once).
- # (N, B)
- beam_indices = torch.gather(
- global_cand_beam_indices, dim=1, index=active_beams
- )
-
- # (N, B) -> (N x B)
- beam_indices = beam_indices.view(-1)
-
- # fmt: off
- # Reorder beams in the `seq` and `score` buffers. The same beam can
- # be selected more than once.
- if step_nr > start_step:
- seqs [:, : step_nr + 1] = torch.index_select(
- seqs [:, : step_nr + 1], dim=0, index=beam_indices
- )
- scores[:, : step_nr + 1] = torch.index_select(
- scores[:, : step_nr + 1], dim=0, index=beam_indices
- )
-
- # (N x B, S) -> (N, B, S)
- seqs_view = seqs .view(num_searches, beam_size, -1)
- scores_view = scores.view(num_searches, beam_size, -1)
-
- seqs_view [:, :, step_nr + 1] = torch.gather(cand_indices, dim=1, index=active_beams)
- scores_view[:, :, step_nr + 1] = torch.gather(cand_scores, dim=1, index=active_beams)
- # fmt: on
-
- # Ensure that hypotheses are sorted by their scores before returning.
- for batch in finished_searches:
- batch.sort(key=lambda b: b.score, reverse=True) # type: ignore[arg-type, return-value]
-
- return SequenceGeneratorOutput(
- results=finished_searches, device=device, collater=self.collater
- )
-
- def _determine_max_seq_len(self, source_seq_len: Optional[int]) -> int:
- opts = self.opts
-
- if source_seq_len is None or opts.soft_max_seq_len is None:
- max_seq_len = opts.hard_max_seq_len
- else:
- at, bt = opts.soft_max_seq_len
-
- max_seq_len = min(opts.hard_max_seq_len, int(at * source_seq_len + bt))
-
- if opts.min_seq_len > max_seq_len:
- raise ValueError(
- f"The effective maximum sequence length must be greater than or equal to `min_seq_len` ({opts.min_seq_len}), but is {max_seq_len} instead. Adjust your soft and hard maximum sequence length limits."
- )
-
- if self.prefix_seq_len >= max_seq_len:
- raise ValueError(
- f"The effective maximum sequence length must be greater than `prefix_seq_len` ({self.prefix_seq_len}), but is {max_seq_len} instead."
- )
-
- return max_seq_len
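
A quick numerical illustration of the rule implemented above, using the defaults soft_max_seq_len=(1, 200) and hard_max_seq_len=1024 (the source lengths below are made up):

a, b = 1, 200    # SequenceGeneratorOptions.soft_max_seq_len
hard_max = 1024  # SequenceGeneratorOptions.hard_max_seq_len
for src_len in (500, 700, 1200):
    print(src_len, min(hard_max, int(a * src_len + b)))  # -> 700, 900 and 1024
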
-
- def _fan_out_encoder_output(
- self, encoder_output: Tensor, encoder_padding_mask: Optional[Tensor]
- ) -> Tuple[Tensor, Optional[Tensor]]:
- num_searches = encoder_output.size(0) # i.e. batch size
-
- # Fan out `encoder_output` to `num_searches` x `beam_size`.
- # (N)
- fan_out_indices = torch.arange(num_searches, device=encoder_output.device)
-
- # (N) -> (N x B)
- fan_out_indices = fan_out_indices.repeat_interleave(self.beam_size)
-
- # (N, S_enc, M) -> (N x B, S_enc, M)
- encoder_output = encoder_output.index_select(dim=0, index=fan_out_indices)
-
- # (N, S_enc, M) -> (N x B, S_enc, M)
- if encoder_padding_mask is not None:
- encoder_padding_mask = encoder_padding_mask.index_select(
- dim=0, index=fan_out_indices
- )
-
- return encoder_output, encoder_padding_mask
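
Shape-wise, the fan-out above is a repeat_interleave over the batch dimension; an isolated example with toy sizes (2 source sequences, beam size 3, not taken from the recipe):

import torch

enc = torch.randn(2, 7, 16)                          # (N, S_enc, M)
beam_size = 3
idx = torch.arange(2).repeat_interleave(beam_size)   # tensor([0, 0, 0, 1, 1, 1])
fanned = enc.index_select(dim=0, index=idx)
assert fanned.shape == (2 * beam_size, 7, 16)        # (N x B, S_enc, M)
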
-
- def _bootstrap_seqs_and_scores(
- self,
- seqs: Tensor,
- scores: Tensor,
- encoder_output: Tensor,
- encoder_padding_mask: Optional[Tensor],
- state_bag: IncrementalStateBag,
- ) -> None:
- assert self.prefix_seq_len > 0
-
- seqs[:, : self.prefix_seq_len] = self.prefix_seq
-
- if self.prefix_seq_len == 1:
- return
-
- assert isinstance(self.prefix_seq, Tensor)
-
- # We have to bootstrap the model with the already fanned-out encoder
- # output to correctly initialize its incremental state. This causes some
- # redundancy as we have to expand `decoder_input` to match the shape of
- # `encoder_output`.
- # (S_pfx) -> (N x B, S_pfx - 1)
- decoder_input = self.prefix_seq[:-1].expand(encoder_output.size(0), -1)
-
- # Bootstrap the model state with prefix sequence.
- decoder_output, decoder_padding_mask = self.decoder.decode(
- decoder_input,
- None,
- encoder_output,
- encoder_padding_mask,
- state_bag,
- )
-
- state_bag.increment_step(self.prefix_seq_len - 1)
-
- model_output = self.decoder.project(decoder_output, decoder_padding_mask)
-
- # lprobs: (S_pfx - 1, V)
- # model_output: (N, S_pfx - 1, V) -> (S_pfx - 1, V)
- lprobs = log_softmax(model_output.logits[0], dim=-1, dtype=torch.float32)
-
- # Fetch scores of next steps.
- # (S_pfx - 1, 1)
- prefix_scores = torch.take_along_dim(
- lprobs, indices=self.prefix_seq[1:].unsqueeze(1), dim=-1
- )
-
- # (S_pfx - 1, 1) -> (S_pfx - 1)
- prefix_scores.squeeze_(1).cumsum_(dim=0)
-
- # First step (e.g. EOS)'s score is always 0.
- scores[:, 1 : self.prefix_seq_len] = prefix_scores
-
- def _finalize_hypothesis(
- self,
- step_nr: int,
- eos_beam_indices: Tensor,
- eos_scores: Tensor,
- seqs: Tensor,
- scores: Tensor,
- active_searches: List[Tuple[int, List["Hypothesis"]]],
- finished_searches: List[List["Hypothesis"]],
- ) -> List[int]:
- # fmt: off
- finalized_seqs = seqs .index_select(dim=0, index=eos_beam_indices)
- finalized_scores = scores.index_select(dim=0, index=eos_beam_indices)
-
- finalized_seqs = finalized_seqs [:, : step_nr + 2]
- finalized_scores = finalized_scores[:, : step_nr + 2]
-
- # Finalize beams.
- finalized_seqs [:, -1] = self.eos_idx
- finalized_scores[:, -1] = eos_scores
- # fmt: on
-
- # Convert from cumulative to per-step scores.
- finalized_scores[:, 1:] = finalized_scores[:, 1:] - finalized_scores[:, :-1]
-
- # Skip first EOS since it is always 0 and skews normalization.
- if self.opts.normalize_scores:
- eos_scores /= (step_nr + 1) ** self.opts.len_penalty
-
- # Holds the ids of finished searches.
- newly_finished: List[int] = []
-
- active_search_indices = (eos_beam_indices // self.beam_size).tolist()
-
- for beam_idx, search_idx in enumerate(active_search_indices):
- search_id, hypotheses = active_searches[search_idx]
-
- # We might have more than one beam finalized in one step that would
- # potentially exceed `beam_size` hypotheses.
- if len(hypotheses) == self.beam_size:
- continue
-
- hypotheses.append(
- Hypothesis(
- seq=finalized_seqs[beam_idx],
- score=eos_scores[beam_idx],
- step_scores=finalized_scores[beam_idx],
- )
- )
-
- if len(hypotheses) == self.beam_size:
- # We have `beam_size` hypotheses for this particular search, so
- # we finish it now.
- newly_finished.append(search_idx)
-
- finished_searches[search_id] = hypotheses
-
- newly_finished.sort()
-
- # Remove finished searches from the active list.
- for idx in reversed(newly_finished):
- del active_searches[idx]
-
- return newly_finished
-
-
-@dataclass
-class SequenceGeneratorOutput:
- """Holds the output of a sequence generator."""
-
- results: List[List["Hypothesis"]]
- """The list of hypothesis generated per search, ordered by score."""
-
- device: Device
- """The device on which generated sequences reside."""
-
- collater: Optional[Collater] = None
- """The collater to use in :meth:`collate`."""
-
- def collate(
- self, hypo_idx: int = 0, skip_batch: bool = False
- ) -> Tuple[Tensor, Optional[Tensor]]:
- """Collate the generated sequences at index ``hypo_idx`` in each search
- result into a single tensor.
-
- :param hypo_idx:
- The index of hypothesis to extract from each search result.
- :param skip_batch:
- If ``True``, if a search result has no hypothesis at index `hypo_idx`,
- it will be skipped instead of raising an error.
-
- :returns:
- - The collated sequences. *Shape:* :math:`(N,S)`, where :math:`N` is
- the number of search results and :math:`S` is the sequence length.
- - An array where each element represents the length of the sequence at
- the same index in the first returned value. *Shape:* :math:`(N)`,
- where :math:`N` is the number of search results.
- """
- if self.collater is None:
- raise RuntimeError("The output has no associated `Collater` instance.")
-
- if not self.results and not skip_batch:
- raise ValueError("The output must contain at least one search result.")
-
- seqs = []
-
- for search_idx, result in enumerate(self.results):
- if hypo_idx >= len(result):
- if not skip_batch:
- raise ValueError(
- f"Each search result must have at least {hypo_idx + 1} hypotheses, but search {search_idx} has only {len(result)}."
- )
-
- continue
-
- seqs.append(result[hypo_idx].seq)
-
- if not seqs:
-            # Return an empty (zero-element) tensor rather than a scalar.
- return torch.empty((0,), device=self.device, dtype=torch.int64), None
-
- output = cast(SequenceData, self.collater(seqs))
-
- return output["seqs"], output["seq_lens"] if output["is_ragged"] else None
-
-
-@dataclass
-class Hypothesis:
- """Represents a hypothesis produced by a sequence generator."""
-
- seq: Tensor
- """The generated sequence."""
-
- score: Tensor
- """The score of the hypothesis."""
-
- step_scores: Tensor
- """The score of each individual sequence step."""
diff --git a/egs/aishell/ASR/seamlessm4t/requirements.txt b/egs/aishell/ASR/seamlessm4t/requirements.txt
deleted file mode 100644
index 7647735da..000000000
--- a/egs/aishell/ASR/seamlessm4t/requirements.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-#k2
-kaldialign
-lhotse
-sentencepiece
-tensorboard
-fairseq2
diff --git a/egs/aishell/ASR/seamlessm4t/tokenizer.py b/egs/aishell/ASR/seamlessm4t/tokenizer.py
deleted file mode 100644
index 1012cd8a0..000000000
--- a/egs/aishell/ASR/seamlessm4t/tokenizer.py
+++ /dev/null
@@ -1,43 +0,0 @@
-
-#import sentencepiece as spm
-
-class CharTokenizer(object):
- def __init__(self, tokenizer_file):
- self.id2symbol = {}
- self.symbol2id = {}
- with open(tokenizer_file, 'r') as f:
- for line in f:
- line = line.strip()
- if line:
- symbol, id = line.split()
- id = int(id)
- self.id2symbol[id] = symbol
- self.symbol2id[symbol] = id
- self.vocab_size = len(self.id2symbol)
-
- def encode(self, text):
-        # If a symbol is not in self.symbol2id, fall back to id 2
-        # (the id reserved for unknown symbols).
- return [self.symbol2id.get(symbol, 2) for symbol in text]
-
- def decode(self, ids):
- return ''.join([self.id2symbol[id] for id in ids])
-
-if __name__ == '__main__':
- # config_file = './config.yaml'
- # config = read_yaml(config_file)
- # converter = TokenIDConverter(config['token_list'])
- # ids = converter.tokens2ids(['', '你', '好', '吗', '', 'microsoft', 'world'])
- # print(ids)
- # print(converter.ids2tokens(ids))
-
-
- tokenizer = CharTokenizer('./tokens.txt')
- ids = tokenizer.encode('今天 天气不错')
- print(ids)
- print(tokenizer.decode(ids+[1]))
- # sp = spm.SentencePieceProcessor()
- # sp.Load('../../../librispeech/ASR/k2fsa-zipformer-chinese-english-mixed/data/lang_char_bpe/bpe.model')
- # texts = ['MICROSOFT WORLD']
- # y = sp.encode(texts, out_type=int)
- # x = sp.decode(y)
- # print(y, x)
\ No newline at end of file
diff --git a/egs/aishell/ASR/seamlessm4t/tokens.txt b/egs/aishell/ASR/seamlessm4t/tokens.txt
deleted file mode 100644
index 980dd6cd2..000000000
--- a/egs/aishell/ASR/seamlessm4t/tokens.txt
+++ /dev/null
@@ -1,6257 +0,0 @@
- 0
- 1
- 2
-▁AS 3
-▁ONE 4
-▁OF 5
-▁A 6
-▁COMP 7
-AN 8
-Y 9
-▁SOME 10
-▁HUNDRED 11
-▁AND 12
-▁FI 13
-F 14
-T 15
-▁ME 16
-N 17
-▁WO 18
-M 19
-EN 20
-▁NOT 21
-▁LA 22
-BO 23
-UR 24
-ING 25
-▁UNDER 26
-▁ANY 27
-▁SU 28
-S 29
-P 30
-IC 31
-ION 32
-▁ 33
-LU 34
-AC 35
-▁THAT 36
-▁THE 37
-▁CO 38
-▁CHA 39
-CE 40
-RY 41
-▁THOUGH 42
-▁SH 43
-IN 44
-B 45
-J 46
-E 47
-C 48
-▁MUCH 49
-▁PO 50
-UL 51
-AR 52
-▁PRE 53
-U 54
-DI 55
-▁AT 56
-▁WHICH 57
-▁POINT 58
-▁I 59
-▁JU 60
-D 61
-GE 62
-▁E 63
-▁HAD 64
-▁CA 65
-▁IN 66
-▁MY 67
-▁DI 68
-RE 69
-CTION 70
-▁WAS 71
-L 72
-MO 73
-ATE 74
-▁THERE 75
-▁BEEN 76
-▁HE 77
-IT 78
-TED 79
-▁T 80
-RI 81
-VI 82
-AL 83
-▁B 84
-LE 85
-ISH 86
-▁OR 87
-▁SO 88
-▁IT 89
-▁RA 90
-TE 91
-▁PRO 92
-G 93
-▁BUT 94
-▁THIS 95
-▁EX 96
-A 97
-ER 98
-ATED 99
-▁EN 100
-TI 101
-LY 102
-OW 103
-▁TO 104
-▁PA 105
-R 106
-IM 107
-ON 108
-▁PUBLIC 109
-▁GU 110
-IL 111
-▁APP 112
-ED 113
-▁UN 114
-ENT 115
-▁MOST 116
-▁DE 117
-TER 118
-▁MAN 119
-▁ON 120
-▁BY 121
-▁NO 122
-▁MEAN 123
-▁NUMBER 124
-O 125
-▁BELIEVE 126
-▁RI 127
-CH 128
-▁SECOND 129
-▁OTHER 130
-▁K 131
-▁WILL 132
-▁DO 133
-▁WELL 134
-▁SEE 135
-UND 136
-▁JO 137
-KE 138
-▁BE 139
-▁BO 140
-OK 141
-▁COULD 142
-▁THEM 143
-▁THESE 144
-▁P 145
-AGE 146
-▁SHA 147
-ME 148
-IOUS 149
-▁IS 150
-▁ONLY 151
-▁WORD 152
-▁RE 153
-K 154
-I 155
-▁SHOULD 156
-▁HAVE 157
-OR 158
-ITY 159
-▁WHAT 160
-LL 161
-▁SP 162
-TA 163
-NE 164
-OUS 165
-▁COM 166
-US 167
-TION 168
-▁HAS 169
-▁S 170
-TH 171
-▁MI 172
-RO 173
-▁GOOD 174
-▁FRIEND 175
-▁LE 176
-W 177
-ES 178
-▁WITH 179
-▁WHO 180
-▁THINK 181
-▁MU 182
-IG 183
-▁MO 184
-▁MIGHT 185
-▁AN 186
-QU 187
-ATION 188
-▁FROM 189
-▁NA 190
-TURE 191
-▁ALL 192
-IES 193
-▁LET 194
-▁TIME 195
-▁WHEN 196
-▁EVEN 197
-▁NEED 198
-▁O 199
-VE 200
-▁WI 201
-FUL 202
-▁NE 203
-LI 204
-AD 205
-ERS 206
-▁BEFORE 207
-▁WORK 208
-▁LIKE 209
-▁HAND 210
-▁THEN 211
-SH 212
-▁WERE 213
-OM 214
-▁KNOW 215
-EL 216
-MA 217
-▁OVER 218
-▁LO 219
-TING 220
-▁LI 221
-OL 222
-▁HA 223
-MP 224
-LA 225
-ABLE 226
-MB 227
-▁WE 228
-▁ST 229
-ET 230
-▁IF 231
-▁WA 232
-▁NEW 233
-IR 234
-▁FA 235
-▁G 236
-IAN 237
-ND 238
-VER 239
-IS 240
-▁DIS 241
-CO 242
-UN 243
-▁SEVEN 244
-▁TH 245
-▁AFTER 246
-WARD 247
-▁RO 248
-▁WOULD 249
-▁SA 250
-▁FOR 251
-▁FE 252
-▁LONG 253
-▁STILL 254
-▁CON 255
-X 256
-▁HERE 257
-▁EVERYTHING 258
-▁SE 259
-ANT 260
-ALLY 261
-RU 262
-▁THAN 263
-ANCE 264
-▁ARE 265
-▁NEXT 266
-▁DOWN 267
-▁CHI 268
-▁MA 269
-CK 270
-▁DR 271
-Z 272
-▁F 273
-ROW 274
-▁GO 275
-▁INTO 276
-INE 277
-▁SIX 278
-▁YEAR 279
-▁HIS 280
-CI 281
-ENCE 282
-▁CLOSE 283
-▁LAST 284
-▁C 285
-▁MORE 286
-▁THOUSAND 287
-LO 288
-▁UP 289
-▁WANT 290
-▁JA 291
-▁GA 292
-▁THROUGH 293
-▁PLACE 294
-▁HU 295
-MAN 296
-UGH 297
-IST 298
-▁HO 299
-▁BETTER 300
-▁THEIR 301
-▁VERY 302
-NG 303
-▁ANOTHER 304
-▁UM 305
-AS 306
-▁TE 307
-V 308
-▁HER 309
-▁HIGH 310
-QUI 311
-▁BECAUSE 312
-▁SHOW 313
-▁WHERE 314
-▁DAY 315
-▁BRO 316
-▁OLD 317
-MENT 318
-▁HARD 319
-▁THOSE 320
-CU 321
-AT 322
-▁INTEREST 323
-▁EVERY 324
-HE 325
-▁GIVE 326
-▁FACT 327
-▁FOUND 328
-▁HEAR 329
-▁NEVER 330
-▁CAN 331
-▁COME 332
-▁SORT 333
-ITION 334
-EST 335
-IGHT 336
-SHIP 337
-SIDE 338
-▁GREAT 339
-▁MAR 340
-▁GET 341
-▁OUT 342
-▁STA 343
-OP 344
-AIN 345
-▁PI 346
-RA 347
-ID 348
-▁DA 349
-▁LOOK 350
-END 351
-▁BA 352
-VO 353
-H 354
-▁LITTLE 355
-▁THEY 356
-▁ALWAYS 357
-CA 358
-▁HOUSE 359
-TIC 360
-IVE 361
-▁RU 362
-HO 363
-▁RIGHT 364
-ICAL 365
-HA 366
-▁AGAIN 367
-▁EXP 368
-IBLE 369
-▁CERTAIN 370
-▁SHE 371
-▁PRI 372
-▁PLAY 373
-▁VA 374
-LIC 375
-▁TWO 376
-▁THREE 377
-▁DOES 378
-▁YOU 379
-▁TALK 380
-▁ABOUT 381
-UT 382
-▁WITHOUT 383
-▁PEOPLE 384
-FF 385
-▁DON 386
-▁HAPPEN 387
-▁PERSON 388
-▁MADE 389
-▁PART 390
-▁HOW 391
-▁WHY 392
-▁MAKE 393
-▁HIM 394
-▁STATE 395
-▁GRA 396
-▁TRA 397
-FOR 398
-▁LIFE 399
-▁PER 400
-AM 401
-▁DU 402
-PORT 403
-IA 404
-▁BUSINESS 405
-▁UNDERSTAND 406
-▁PLAN 407
-▁KEEP 408
-▁GOT 409
-▁MONEY 410
-▁OWN 411
-▁MANY 412
-ONE 413
-▁BU 414
-▁REALLY 415
-▁COURSE 416
-▁THINGS 417
-▁SAID 418
-▁US 419
-▁WAY 420
-▁TAKE 421
-▁WORLD 422
-▁THING 423
-▁SAY 424
-▁TA 425
-▁SOMETHING 426
-▁LEARN 427
-DE 428
-▁QUESTION 429
-DAY 430
-▁NOW 431
-▁BACK 432
-▁YOUR 433
-ACH 434
-▁SPEAK 435
-▁TURN 436
-ATIVE 437
-▁OUR 438
-▁JE 439
-▁VI 440
-▁HI 441
-▁OP 442
-▁WEEK 443
-▁IDEA 444
-▁AWAY 445
-▁COUNT 446
-NESS 447
-▁REASON 448
-PH 449
-▁TWENTY 450
-▁QUITE 451
-▁CHANGE 452
-▁LOVE 453
-DUC 454
-▁SAME 455
-▁ENOUGH 456
-▁YES 457
-▁FEEL 458
-▁FIRST 459
-▁WHILE 460
-IZE 461
-▁ANYTHING 462
-▁DID 463
-▁KIND 464
-▁POWER 465
-▁JUST 466
-IF 467
-▁PH 468
-▁ANSWER 469
-▁FIND 470
-▁THANK 471
-▁BUILD 472
-▁GOING 473
-▁CAME 474
-▁TOGETHER 475
-▁IMPORTANT 476
-▁HELP 477
-▁FOUR 478
-▁DIFFERENT 479
-▁AROUND 480
-▁AMERICA 481
-▁ALSO 482
-▁NINE 483
-▁AH 484
-▁LOT 485
-▁BETWEEN 486
-▁START 487
-▁SCHOOL 488
-▁SYSTEM 489
-▁MAYBE 490
-▁ACTUALLY 491
-▁PROBLEM 492
-Q 493
-▁MR 494
-▁YEAH 495
-▁OKAY 496
-以 497
-后 498
-你 499
-是 500
-男 501
-孩 502
-子 503
-曾 504
-丽 505
-婷 506
-妈 507
-很 508
-想 509
-兰 510
-州 511
-哪 512
-有 513
-买 514
-路 515
-虎 516
-汽 517
-车 518
-的 519
-我 520
-家 521
-狗 522
-叫 523
-什 524
-么 525
-名 526
-字 527
-现 528
-在 529
-网 530
-络 531
-怎 532
-样 533
-鞍 534
-山 535
-到 536
-郑 537
-大 538
-巴 539
-上 540
-没 541
-厕 542
-所 543
-英 544
-文 545
-给 546
-一 547
-个 548
-惊 549
-喜 550
-看 551
-日 552
-程 553
-表 554
-形 555
-容 556
-人 557
-强 558
-势 559
-面 560
-前 561
-不 562
-敢 563
-吭 564
-声 565
-词 566
-儿 567
-些 568
-办 569
-法 570
-说 571
-话 572
-唱 573
-首 574
-老 575
-与 576
-海 577
-婆 578
-笨 579
-蛋 580
-为 581
-欢 582
-女 583
-呢 584
-杭 585
-里 586
-可 587
-打 588
-炮 589
-朋 590
-友 591
-啊 592
-对 593
-练 594
-语 595
-听 596
-力 597
-帮 598
-助 599
-歌 600
-请 601
-推 602
-荐 603
-几 604
-谢 605
-明 606
-天 607
-早 608
-七 609
-点 610
-起 611
-床 612
-时 613
-候 614
-睡 615
-觉 616
-会 617
-了 618
-吗 619
-查 620
-本 621
-地 622
-气 623
-公 624
-快 625
-吧 626
-注 627
-意 628
-安 629
-全 630
-要 631
-错 632
-过 633
-四 634
-川 635
-再 636
-讲 637
-笑 638
-好 639
-午 640
-连 641
-烟 642
-台 643
-轮 644
-船 645
-史 646
-记 647
-商 648
-君 649
-列 650
-传 651
-孙 652
-鞅 653
-乃 654
-遂 655
-西 656
-入 657
-秦 658
-翻 659
-译 660
-做 661
-下 662
-提 663
-醒 664
-发 665
-送 666
-排 667
-班 668
-邮 669
-件 670
-刘 671
-俊 672
-峰 673
-电 674
-播 675
-放 676
-曲 677
-最 678
-炫 679
-民 680
-族 681
-风 682
-还 683
-年 684
-中 685
-泰 686
-拉 687
-石 688
-光 689
-剑 690
-任 691
-务 692
-玛 693
-雅 694
-预 695
-言 696
-真 697
-视 698
-频 699
-爱 700
-范 701
-冰 702
-出 703
-演 704
-剧 705
-六 706
-三 707
-十 708
-分 709
-退 710
-当 711
-模 712
-式 713
-附 714
-近 715
-洗 716
-浴 717
-方 718
-交 719
-和 720
-玩 721
-见 722
-工 723
-作 724
-干 725
-就 726
-能 727
-疖 728
-吃 729
-饭 730
-或 731
-者 732
-其 733
-他 734
-东 735
-国 736
-移 737
-动 738
-通 739
-信 740
-限 741
-司 742
-介 743
-绍 744
-讯 745
-录 746
-载 747
-装 748
-跟 749
-位 750
-置 751
-离 752
-莞 753
-长 754
-站 755
-多 756
-远 757
-福 758
-建 759
-高 760
-速 761
-服 762
-少 763
-把 764
-这 765
-定 766
-成 767
-彩 768
-铃 769
-手 770
-机 771
-别 772
-忘 773
-盒 774
-拿 775
-回 776
-校 777
-区 778
-万 779
-口 780
-坐 781
-冷 782
-漠 783
-醉 784
-红 785
-颜 786
-来 787
-猪 788
-张 789
-波 790
-炎 791
-亚 792
-纶 793
-媳 794
-妇 795
-马 796
-志 797
-华 798
-短 799
-清 800
-图 801
-片 802
-生 803
-五 804
-钟 805
-开 806
-启 807
-蓝 808
-牙 809
-锂 810
-池 811
-初 812
-次 813
-充 814
-让 815
-今 816
-号 817
-顷 818
-等 819
-于 820
-平 821
-也 822
-呀 823
-聊 824
-问 825
-主 826
-结 827
-婚 828
-恭 829
-情 830
-流 831
-返 832
-洒 833
-热 834
-泪 835
-诗 836
-那 837
-去 838
-眠 839
-药 840
-功 841
-使 842
-用 843
-象 844
-间 845
-估 846
-计 847
-厚 848
-德 849
-物 850
-思 851
-搞 852
-们 853
-只 854
-知 855
-道 856
-奥 857
-特 858
-曼 859
-越 860
-野 861
-比 862
-较 863
-重 864
-新 865
-陈 866
-奕 867
-迅 868
-泉 869
-湾 870
-票 871
-呵 872
-活 873
-经 874
-历 875
-摇 876
-痴 877
-此 878
-刻 879
-呈 880
-报 881
-脑 882
-总 883
-着 884
-她 885
-阿 886
-斯 887
-顿 888
-丁 889
-影 890
-终 891
-幻 892
-雨 893
-否 894
-带 895
-伞 896
-休 897
-息 898
-值 899
-百 900
-每 901
-月 902
-找 903
-扎 904
-屯 905
-线 906
-仙 907
-奇 908
-侠 909
-业 910
-写 911
-完 912
-姐 913
-稍 914
-烤 915
-鹅 916
-肠 917
-团 918
-狩 919
-猎 920
-美 921
-忍 922
-向 923
-冲 924
-常 925
-熟 926
-度 927
-折 928
-扣 929
-二 930
-乘 931
-教 932
-实 933
-认 934
-证 935
-考 936
-试 937
-答 938
-案 939
-费 940
-脸 941
-自 942
-恋 943
-广 944
-府 945
-待 946
-赶 947
-集 948
-告 949
-诉 950
-太 951
-懂 952
-坏 953
-泡 954
-妞 955
-伊 956
-更 957
-健 958
-康 959
-检 960
-乙 961
-肝 962
-厦 963
-门 964
-急 965
-控 966
-毒 967
-产 968
-头 969
-脚 970
-轻 971
-感 972
-空 973
-订 974
-从 975
-碑 976
-店 977
-北 978
-京 979
-火 980
-鸡 981
-白 982
-态 983
-啥 984
-幺 985
-哈 986
-尔 987
-滨 988
-至 989
-硬 990
-座 991
-换 992
-姿 993
-亲 994
-级 995
-紫 996
-秋 997
-叶 998
-刀 999
-爆 1000
-率 1001
-馨 1002
-予 1003
-帅 1004
-汇 1005
-询 1006
-小 1007
-唉 1008
-性 1009
-挺 1010
-厉 1011
-害 1012
-八 1013
-米 1014
-跑 1015
-步 1016
-达 1017
-标 1018
-规 1019
-准 1020
-码 1021
-音 1022
-拨 1023
-器 1024
-油 1025
-价 1026
-卡 1027
-尿 1028
-防 1029
-杀 1030
-死 1031
-倍 1032
-晋 1033
-映 1034
-部 1035
-韩 1036
-己 1037
-右 1038
-键 1039
-编 1040
-辑 1041
-您 1042
-房 1043
-租 1044
-具 1045
-体 1046
-牌 1047
-瑞 1048
-士 1049
-浪 1050
-琴 1051
-简 1052
-永 1053
-拥 1054
-灿 1055
-烂 1056
-朝 1057
-阳 1058
-解 1059
-赢 1060
-事 1061
-滚 1062
-周 1063
-吕 1064
-忠 1065
-页 1066
-心 1067
-份 1068
-被 1069
-龙 1070
-肖 1071
-半 1072
-喊 1073
-改 1074
-堵 1075
-饿 1076
-瞌 1077
-叔 1078
-戏 1079
-兴 1080
-化 1081
-备 1082
-萍 1083
-乡 1084
-钓 1085
-鱼 1086
-岛 1087
-消 1088
-忻 1089
-襄 1090
-县 1091
-况 1092
-得 1093
-王 1094
-属 1095
-假 1096
-期 1097
-像 1098
-脾 1099
-零 1100
-九 1101
-关 1102
-晚 1103
-陪 1104
-危 1105
-苦 1106
-难 1107
-例 1108
-顺 1109
-序 1110
-盖 1111
-茨 1112
-邦 1113
-故 1114
-警 1115
-戒 1116
-搜 1117
-索 1118
-黄 1119
-照 1120
-底 1121
-识 1122
-武 1123
-汉 1124
-第 1125
-博 1126
-基 1127
-尼 1128
-删 1129
-除 1130
-沃 1131
-狐 1132
-场 1133
-卖 1134
-便 1135
-宜 1136
-营 1137
-厅 1138
-资 1139
-料 1140
-哥 1141
-句 1142
-员 1143
-随 1144
-导 1145
-航 1146
-学 1147
-星 1148
-脱 1149
-毛 1150
-膏 1151
-种 1152
-乐 1153
-贵 1154
-妃 1155
-酒 1156
-内 1157
-条 1158
-联 1159
-系 1160
-希 1161
-望 1162
-非 1163
-苹 1164
-果 1165
-银 1166
-登 1167
-户 1168
-密 1169
-师 1170
-沟 1171
-莫 1172
-才 1173
-喝 1174
-累 1175
-彭 1176
-裘 1177
-莹 1178
-珠 1179
-徐 1180
-管 1181
-爸 1182
-敏 1183
-腰 1184
-闪 1185
-舟 1186
-凯 1187
-肯 1188
-水 1189
-济 1190
-闻 1191
-约 1192
-伦 1193
-乱 1194
-舞 1195
-春 1196
-叮 1197
-咚 1198
-昨 1199
-又 1200
-宵 1201
-省 1202
-途 1203
-无 1204
-院 1205
-吉 1206
-农 1207
-股 1208
-胶 1209
-布 1210
-谁 1211
-免 1212
-疫 1213
-逗 1214
-闹 1215
-闭 1216
-青 1217
-景 1218
-花 1219
-园 1220
-富 1221
-桥 1222
-哭 1223
-节 1224
-树 1225
-茂 1226
-盛 1227
-边 1228
-余 1229
-姚 1230
-走 1231
-原 1232
-行 1233
-都 1234
-旅 1235
-馆 1236
-宾 1237
-根 1238
-修 1239
-理 1240
-厂 1241
-板 1242
-诊 1243
-专 1244
-复 1245
-鹰 1246
-队 1247
-桌 1248
-财 1249
-港 1250
-色 1251
-诚 1252
-勿 1253
-扰 1254
-持 1255
-孟 1256
-古 1257
-医 1258
-研 1259
-究 1260
-取 1261
-卫 1262
-目 1263
-铁 1264
-麻 1265
-将 1266
-浏 1267
-览 1268
-湿 1269
-朱 1270
-沱 1271
-合 1272
-江 1273
-何 1274
-祈 1275
-祷 1276
-义 1277
-酷 1278
-派 1279
-如 1280
-咩 1281
-正 1282
-算 1283
-楼 1284
-距 1285
-震 1286
-借 1287
-政 1288
-策 1289
-温 1290
-宝 1291
-沙 1292
-角 1293
-肚 1294
-疼 1295
-林 1296
-宿 1297
-舍 1298
-阴 1299
-晴 1300
-圆 1301
-缺 1302
-微 1303
-辽 1304
-刚 1305
-牛 1306
-偷 1307
-吓 1308
-跳 1309
-城 1310
-吴 1311
-称 1312
-呼 1313
-爷 1314
-埋 1315
-怨 1316
-缠 1317
-柴 1318
-钱 1319
-极 1320
-先 1321
-辞 1322
-职 1323
-哦 1324
-啦 1325
-售 1326
-保 1327
-黑 1328
-客 1329
-立 1330
-足 1331
-之 1332
-鬼 1333
-留 1334
-辟 1335
-邪 1336
-谱 1337
-减 1338
-肥 1339
-皇 1340
-萨 1341
-舅 1342
-幽 1343
-默 1344
-细 1345
-胞 1346
-溪 1347
-懒 1348
-书 1349
-杨 1350
-慧 1351
-芝 1352
-屁 1353
-画 1354
-晨 1355
-宁 1356
-侦 1357
-探 1358
-柯 1359
-南 1360
-涛 1361
-应 1362
-该 1363
-弑 1364
-神 1365
-魔 1366
-创 1367
-世 1368
-游 1369
-犯 1370
-已 1371
-泽 1372
-村 1373
-变 1374
-奏 1375
-杰 1376
-偶 1377
-命 1378
-乔 1379
-恩 1380
-并 1381
-阅 1382
-读 1383
-左 1384
-般 1385
-低 1386
-调 1387
-阔 1388
-烦 1389
-球 1390
-峡 1391
-界 1392
-霆 1393
-锋 1394
-柏 1395
-需 1396
-艺 1397
-术 1398
-弄 1399
-章 1400
-悲 1401
-咒 1402
-版 1403
-统 1404
-羊 1405
-肉 1406
-斤 1407
-嘿 1408
-郁 1409
-闷 1410
-进 1411
-姨 1412
-庆 1413
-威 1414
-两 1415
-岸 1416
-普 1417
-琪 1418
-玫 1419
-瑰 1420
-香 1421
-碟 1422
-眼 1423
-收 1424
-湖 1425
-禽 1426
-涮 1427
-汪 1428
-盘 1429
-夹 1430
-存 1431
-枕 1432
-指 1433
-针 1434
-仓 1435
-库 1436
-餐 1437
-块 1438
-咪 1439
-毕 1440
-薪 1441
-赛 1442
-纪 1443
-末 1444
-典 1445
-娱 1446
-傻 1447
-嘲 1448
-彪 1449
-升 1450
-润 1451
-核 1452
-遍 1453
-接 1454
-元 1455
-鼓 1456
-屿 1457
-爹 1458
-胡 1459
-雪 1460
-招 1461
-喂 1462
-齿 1463
-侣 1464
-土 1465
-豆 1466
-铿 1467
-锵 1468
-聪 1469
-但 1470
-飞 1471
-鹤 1472
-壁 1473
-摞 1474
-久 1475
-冬 1476
-骏 1477
-然 1478
-讨 1479
-论 1480
-腾 1481
-易 1482
-筋 1483
-转 1484
-弯 1485
-耳 1486
-齐 1487
-阶 1488
-革 1489
-代 1490
-许 1491
-圣 1492
-诞 1493
-吸 1494
-血 1495
-燕 1496
-松 1497
-鼠 1498
-确 1499
-凤 1500
-凰 1501
-由 1502
-翔 1503
-段 1504
-超 1505
-精 1506
-支 1507
-扶 1508
-室 1509
-包 1510
-菜 1511
-田 1512
-骂 1513
-洋 1514
-舒 1515
-衡 1516
-款 1517
-嘴 1518
-菲 1519
-嘛 1520
-嗨 1521
-鸟 1522
-玉 1523
-数 1524
-贝 1525
-郝 1526
-漫 1527
-诺 1528
-衣 1529
-嫖 1530
-娼 1531
-穿 1532
-骨 1533
-拆 1534
-伤 1535
-获 1536
-奖 1537
-稿 1538
-锅 1539
-购 1540
-治 1541
-痘 1542
-遗 1543
-疤 1544
-痕 1545
-饱 1546
-寻 1547
-瓷 1548
-妹 1549
-徽 1550
-参 1551
-格 1552
-题 1553
-凭 1554
-住 1555
-剖 1556
-腹 1557
-祝 1558
-贺 1559
-加 1560
-跪 1561
-潞 1562
-云 1563
-端 1564
-木 1565
-烁 1566
-朵 1567
-赵 1568
-潘 1569
-津 1570
-滋 1571
-燥 1572
-利 1573
-负 1574
-昆 1575
-因 1576
-森 1577
-及 1578
-病 1579
-固 1580
-市 1581
-烧 1582
-番 1583
-茄 1584
-炒 1585
-而 1586
-凉 1587
-冒 1588
-量 1589
-夸 1590
-尾 1591
-崔 1592
-另 1593
-处 1594
-铺 1595
-沈 1596
-哎 1597
-身 1598
-哟 1599
-习 1600
-虞 1601
-瞧 1602
-烈 1603
-皮 1604
-鞋 1605
-深 1606
-圳 1607
-委 1608
-胖 1609
-猴 1610
-军 1611
-素 1612
-楷 1613
-补 1614
-偿 1615
-屏 1616
-散 1617
-效 1618
-丹 1619
-念 1620
-绝 1621
-艳 1622
-够 1623
-狼 1624
-且 1625
-龄 1626
-乌 1627
-蓉 1628
-厘 1629
-含 1630
-庚 1631
-澈 1632
-犬 1633
-致 1634
-运 1635
-慢 1636
-钻 1637
-李 1638
-轩 1639
-育 1640
-项 1641
-咱 1642
-误 1643
-弟 1644
-依 1645
-尽 1646
-河 1647
-夫 1648
-沁 1649
-始 1650
-芳 1651
-禺 1652
-旧 1653
-坑 1654
-胜 1655
-酱 1656
-漂 1657
-亮 1658
-战 1659
-斗 1660
-严 1661
-娟 1662
-逼 1663
-添 1664
-盈 1665
-萝 1666
-莉 1667
-肌 1668
-唐 1669
-兵 1670
-辆 1671
-双 1672
-佛 1673
-傅 1674
-劲 1675
-直 1676
-测 1677
-苏 1678
-迁 1679
-沭 1680
-祥 1681
-婴 1682
-品 1683
-销 1684
-禹 1685
-哲 1686
-嗯 1687
-趟 1688
-拐 1689
-金 1690
-满 1691
-套 1692
-倒 1693
-千 1694
-迎 1695
-淇 1696
-驾 1697
-拟 1698
-良 1699
-揭 1700
-杯 1701
-淄 1702
-睛 1703
-制 1704
-枪 1705
-抢 1706
-狸 1707
-泥 1708
-造 1709
-哇 1710
-羯 1711
-庙 1712
-逃 1713
-朗 1714
-领 1715
-悟 1716
-湛 1717
-贸 1718
-垃 1719
-圾 1720
-软 1721
-莲 1722
-味 1723
-旺 1724
-旦 1725
-潮 1726
-奶 1727
-央 1728
-惜 1729
-续 1730
-咨 1731
-茅 1732
-父 1733
-母 1734
-笔 1735
-封 1736
-同 1737
-黎 1738
-共 1739
-科 1740
-相 1741
-镇 1742
-贤 1743
-宏 1744
-洲 1745
-瓦 1746
-寨 1747
-受 1748
-梦 1749
-呗 1750
-苍 1751
-丘 1752
-避 1753
-孕 1754
-灰 1755
-尤 1756
-击 1757
-腮 1758
-腺 1759
-兆 1760
-坨 1761
-屎 1762
-忧 1763
-草 1764
-赠 1765
-外 1766
-险 1767
-某 1768
-磁 1769
-贷 1770
-反 1771
-罚 1772
-昌 1773
-饰 1774
-辉 1775
-权 1776
-材 1777
-炯 1778
-签 1779
-追 1780
-求 1781
-催 1782
-痛 1783
-盱 1784
-眙 1785
-围 1786
-淘 1787
-幸 1788
-扫 1789
-旭 1790
-切 1791
-磋 1792
-冠 1793
-征 1794
-郎 1795
-骗 1796
-哄 1797
-释 1798
-蒙 1799
-涯 1800
-碧 1801
-斜 1802
-嫁 1803
-幕 1804
-哼 1805
-钢 1806
-碳 1807
-纤 1808
-维 1809
-决 1810
-妻 1811
-未 1812
-绳 1813
-断 1814
-寺 1815
-妙 1816
-伯 1817
-蹲 1818
-宅 1819
-吵 1820
-娃 1821
-兄 1822
-廷 1823
-夺 1824
-社 1825
-示 1826
-按 1827
-饺 1828
-甜 1829
-蜜 1830
-咖 1831
-啡 1832
-优 1833
-逛 1834
-街 1835
-著 1836
-杂 1837
-址 1838
-荷 1839
-塘 1840
-拜 1841
-币 1842
-迟 1843
-货 1844
-粉 1845
-刮 1846
-破 1847
-射 1848
-狂 1849
-苗 1850
-罗 1851
-设 1852
-困 1853
-湘 1854
-潭 1855
-评 1856
-娘 1857
-涉 1858
-采 1859
-芙 1860
-夜 1861
-捷 1862
-斩 1863
-摩 1864
-托 1865
-泳 1866
-琳 1867
-律 1868
-官 1869
-劫 1870
-蛮 1871
-替 1872
-架 1873
-悬 1874
-浮 1875
-窗 1876
-顶 1877
-敦 1878
-善 1879
-哉 1880
-桂 1881
-勇 1882
-荆 1883
-镜 1884
-监 1885
-怕 1886
-呐 1887
-劳 1888
-莱 1889
-狮 1890
-宽 1891
-袋 1892
-囊 1893
-秀 1894
-卸 1895
-链 1896
-嵩 1897
-韭 1898
-葆 1899
-额 1900
-翼 1901
-忙 1902
-瓶 1903
-梅 1904
-堰 1905
-粒 1906
-汤 1907
-谋 1908
-樊 1909
-恨 1910
-愿 1911
-锡 1912
-申 1913
-护 1914
-庄 1915
-临 1916
-源 1917
-环 1918
-境 1919
-礼 1920
-恐 1921
-晒 1922
-虫 1923
-划 1924
-鸣 1925
-怖 1926
-伍 1927
-佰 1928
-岁 1929
-组 1930
-响 1931
-类 1932
-韶 1933
-克 1934
-洛 1935
-玲 1936
-裤 1937
-柔 1938
-疆 1939
-篮 1940
-伟 1941
-扔 1942
-掉 1943
-媒 1944
-涨 1945
-透 1946
-纯 1947
-怀 1948
-坊 1949
-麦 1950
-菠 1951
-养 1952
-晕 1953
-群 1954
-展 1955
-厌 1956
-拒 1957
-单 1958
-静 1959
-刷 1960
-插 1961
-肛 1962
-互 1963
-蘑 1964
-菇 1965
-姑 1966
-桐 1967
-辛 1968
-察 1969
-毫 1970
-质 1971
-差 1972
-翰 1973
-爽 1974
-欣 1975
-议 1976
-铜 1977
-籍 1978
-争 1979
-喆 1980
-孔 1981
-堤 1982
-薇 1983
-茵 1984
-席 1985
-琼 1986
-杠 1987
-衔 1988
-概 1989
-往 1990
-邢 1991
-惠 1992
-烫 1993
-绑 1994
-崇 1995
-帝 1996
-据 1997
-貌 1998
-似 1999
-胸 2000
-罩 2001
-构 2002
-尊 2003
-秘 2004
-它 2005
-详 2006
-悠 2007
-闲 2008
-违 2009
-陆 2010
-割 2011
-绩 2012
-企 2013
-绥 2014
-辐 2015
-舌 2016
-寂 2017
-寞 2018
-宇 2019
-携 2020
-拳 2021
-观 2022
-魏 2023
-郭 2024
-磊 2025
-副 2026
-梁 2027
-斌 2028
-须 2029
-僧 2030
-徒 2031
-季 2032
-灯 2033
-梯 2034
-墙 2035
-付 2036
-坦 2037
-殊 2038
-曹 2039
-操 2040
-捡 2041
-赤 2042
-盗 2043
-废 2044
-蒋 2045
-浙 2046
-食 2047
-咯 2048
-童 2049
-坡 2050
-剪 2051
-唯 2052
-疗 2053
-状 2054
-暴 2055
-缓 2056
-誉 2057
-衰 2058
-宋 2059
-娜 2060
-雄 2061
-谛 2062
-糖 2063
-羽 2064
-棋 2065
-滩 2066
-佳 2067
-臭 2068
-帆 2069
-岳 2070
-疲 2071
-惫 2072
-滴 2073
-倾 2074
-盆 2075
-谷 2076
-施 2077
-晶 2078
-赚 2079
-澡 2080
-遇 2081
-鲁 2082
-祭 2083
-灶 2084
-独 2085
-谈 2086
-承 2087
-蜀 2088
-丰 2089
-归 2090
-辜 2091
-扇 2092
-渴 2093
-羡 2094
-慕 2095
-裸 2096
-宗 2097
-纬 2098
-亦 2099
-儒 2100
-霸 2101
-翡 2102
-翠 2103
-芭 2104
-抱 2105
-歉 2106
-邱 2107
-夏 2108
-隆 2109
-灵 2110
-珍 2111
-浩 2112
-乾 2113
-坤 2114
-培 2115
-训 2116
-压 2117
-偏 2118
-骤 2119
-熙 2120
-葬 2121
-姆 2122
-兽 2123
-筑 2124
-丝 2125
-若 2126
-诡 2127
-异 2128
-侯 2129
-摆 2130
-俗 2131
-缚 2132
-束 2133
-愁 2134
-盟 2135
-却 2136
-显 2137
-肤 2138
-茹 2139
-荣 2140
-增 2141
-宫 2142
-局 2143
-适 2144
-楚 2145
-驻 2146
-纽 2147
-秒 2148
-辣 2149
-虾 2150
-甘 2151
-肃 2152
-粕 2153
-喻 2154
-敬 2155
-谨 2156
-慎 2157
-竭 2158
-止 2159
-际 2160
-寓 2161
-勤 2162
-挫 2163
-泣 2164
-奈 2165
-圭 2166
-焰 2167
-猩 2168
-守 2169
-允 2170
-兔 2171
-篇 2172
-敌 2173
-辕 2174
-猫 2175
-柠 2176
-檬 2177
-橘 2178
-卜 2179
-妓 2180
-既 2181
-闯 2182
-胆 2183
-刁 2184
-竟 2185
-竞 2186
-冯 2187
-陇 2188
-赣 2189
-呆 2190
-滞 2191
-停 2192
-邯 2193
-郸 2194
-域 2195
-徕 2196
-患 2197
-甲 2198
-亡 2199
-鼻 2200
-背 2201
-戴 2202
-幼 2203
-伙 2204
-括 2205
-邓 2206
-谐 2207
-担 2208
-浑 2209
-抖 2210
-耍 2211
-综 2212
-失 2213
-蕾 2214
-鸭 2215
-莘 2216
-选 2217
-糸 2218
-桶 2219
-弃 2220
-暗 2221
-卓 2222
-榜 2223
-拼 2224
-壅 2225
-丈 2226
-锈 2227
-恢 2228
-刺 2229
-嘻 2230
-顾 2231
-投 2232
-晓 2233
-巨 2234
-抽 2235
-档 2236
-乳 2237
-迪 2238
-蠢 2239
-裹 2240
-唤 2241
-焦 2242
-择 2243
-俺 2244
-技 2245
-暧 2246
-昧 2247
-怪 2248
-坛 2249
-眉 2250
-嘉 2251
-逸 2252
-课 2253
-栏 2254
-撬 2255
-框 2256
-液 2257
-凝 2258
-暑 2259
-型 2260
-烘 2261
-簿 2262
-扬 2263
-汁 2264
-诸 2265
-迹 2266
-禁 2267
-株 2268
-泸 2269
-屠 2270
-宰 2271
-忽 2272
-炼 2273
-必 2274
-妆 2275
-飘 2276
-鹿 2277
-敲 2278
-拾 2279
-躺 2280
-歇 2281
-狠 2282
-沾 2283
-畅 2284
-镕 2285
-贪 2286
-污 2287
-斧 2288
-巡 2289
-弹 2290
-盐 2291
-枝 2292
-渝 2293
-壑 2294
-郴 2295
-落 2296
-牡 2297
-钛 2298
-剩 2299
-俄 2300
-抵 2301
-押 2302
-郊 2303
-弱 2304
-授 2305
-蟹 2306
-糕 2307
-败 2308
-各 2309
-伴 2310
-享 2311
-居 2312
-障 2313
-棕 2314
-旁 2315
-屌 2316
-绒 2317
-酸 2318
-隔 2319
-瞄 2320
-俩 2321
-柳 2322
-册 2323
-弊 2324
-逮 2325
-绵 2326
-挣 2327
-闵 2328
-勒 2329
-陶 2330
-寒 2331
-吻 2332
-桃 2333
-悍 2334
-绮 2335
-贞 2336
-疾 2337
-诫 2338
-菌 2339
-则 2340
-谭 2341
-咏 2342
-麟 2343
-棠 2344
-抬 2345
-棺 2346
-均 2347
-纸 2348
-碱 2349
-沧 2350
-董 2351
-挤 2352
-虚 2353
-钠 2354
-胃 2355
-躁 2356
-智 2357
-畔 2358
-墨 2359
-堂 2360
-喔 2361
-宣 2362
-丑 2363
-嚣 2364
-辈 2365
-孤 2366
-鞭 2367
-验 2368
-夕 2369
-印 2370
-欧 2371
-阵 2372
-咋 2373
-驹 2374
-挂 2375
-轿 2376
-拍 2377
-洁 2378
-凑 2379
-蕉 2380
-诱 2381
-惑 2382
-颐 2383
-箭 2384
-樱 2385
-辖 2386
-捕 2387
-炸 2388
-斋 2389
-恒 2390
-沉 2391
-侧 2392
-跌 2393
-暇 2394
-掌 2395
-筷 2396
-彬 2397
-稚 2398
-傲 2399
-腔 2400
-藏 2401
-浦 2402
-瓣 2403
-捆 2404
-卧 2405
-欠 2406
-犀 2407
-甩 2408
-敷 2409
-衍 2410
-谅 2411
-积 2412
-怡 2413
-阁 2414
-趣 2415
-掰 2416
-耽 2417
-蒜 2418
-菱 2419
-葛 2420
-聚 2421
-露 2422
-帐 2423
-紧 2424
-郓 2425
-聘 2426
-桑 2427
-众 2428
-圈 2429
-渡 2430
-鲜 2431
-杜 2432
-甫 2433
-遵 2434
-骚 2435
-吹 2436
-蚊 2437
-塔 2438
-赏 2439
-荒 2440
-欺 2441
-揍 2442
-锁 2443
-恼 2444
-忐 2445
-忑 2446
-输 2447
-描 2448
-触 2449
-糊 2450
-涂 2451
-熊 2452
-妮 2453
-抄 2454
-裙 2455
-塑 2456
-橡 2457
-阜 2458
-獒 2459
-励 2460
-黔 2461
-臣 2462
-憔 2463
-悴 2464
-昂 2465
-党 2466
-沐 2467
-浓 2468
-灾 2469
-捐 2470
-柿 2471
-瑟 2472
-翁 2473
-侨 2474
-督 2475
-振 2476
-鹏 2477
-乒 2478
-乓 2479
-巷 2480
-贡 2481
-祖 2482
-即 2483
-绿 2484
-搭 2485
-配 2486
-骑 2487
-届 2488
-举 2489
-伸 2490
-整 2491
-突 2492
-陌 2493
-糟 2494
-惩 2495
-硕 2496
-茫 2497
-趋 2498
-仁 2499
-钙 2500
-雕 2501
-井 2502
-撒 2503
-岩 2504
-悄 2505
-搁 2506
-浅 2507
-救 2508
-饮 2509
-佩 2510
-赌 2511
-涕 2512
-薯 2513
-令 2514
-泌 2515
-蔬 2516
-批 2517
-攀 2518
-怜 2519
-淮 2520
-寝 2521
-填 2522
-卿 2523
-萱 2524
-寄 2525
-窝 2526
-纳 2527
-洱 2528
-惹 2529
-锦 2530
-浒 2531
-欲 2532
-棉 2533
-箱 2534
-仅 2535
-述 2536
-摸 2537
-纲 2538
-澳 2539
-染 2540
-兼 2541
-岭 2542
-淋 2543
-肿 2544
-旗 2545
-嘞 2546
-乖 2547
-酮 2548
-颠 2549
-覆 2550
-誓 2551
-递 2552
-蛟 2553
-占 2554
-乎 2555
-融 2556
-甸 2557
-幂 2558
-钥 2559
-匙 2560
-酬 2561
-皆 2562
-胎 2563
-腐 2564
-痿 2565
-绣 2566
-枫 2567
-蝴 2568
-蝶 2569
-抛 2570
-撞 2571
-植 2572
-僵 2573
-尸 2574
-巾 2575
-煌 2576
-逊 2577
-引 2578
-兑 2579
-荫 2580
-朔 2581
-丢 2582
-扩 2583
-摄 2584
-龟 2585
-鑫 2586
-谦 2587
-豪 2588
-噬 2589
-眷 2590
-挑 2591
-仲 2592
-穷 2593
-玻 2594
-璃 2595
-岗 2596
-姥 2597
-横 2598
-蚌 2599
-埠 2600
-邀 2601
-蔚 2602
-虹 2603
-降 2604
-疣 2605
-鱿 2606
-喵 2607
-囧 2608
-茶 2609
-猜 2610
-玮 2611
-莎 2612
-冼 2613
-榕 2614
-媛 2615
-瓜 2616
-煮 2617
-耕 2618
-镶 2619
-虽 2620
-驳 2621
-霍 2622
-仗 2623
-窍 2624
-魅 2625
-访 2626
-邻 2627
-抗 2628
-莆 2629
-涵 2630
-筒 2631
-疯 2632
-赖 2633
-豌 2634
-碍 2635
-症 2636
-卤 2637
-翅 2638
-膀 2639
-蓬 2640
-咸 2641
-尚 2642
-瘦 2643
-缸 2644
-爬 2645
-鄂 2646
-塞 2647
-稻 2648
-召 2649
-荡 2650
-桨 2651
-税 2652
-呃 2653
-渠 2654
-骥 2655
-伏 2656
-枥 2657
-邑 2658
-净 2659
-弦 2660
-蔽 2661
-诀 2662
-咳 2663
-嗽 2664
-芯 2665
-储 2666
-缘 2667
-冻 2668
-厨 2669
-鉴 2670
-擦 2671
-棒 2672
-损 2673
-暂 2674
-殖 2675
-焊 2676
-募 2677
-邵 2678
-饶 2679
-梭 2680
-鄙 2681
-骄 2682
-蔡 2683
-辄 2684
-努 2685
-洽 2686
-宙 2687
-鲈 2688
-葫 2689
-芦 2690
-梧 2691
-燎 2692
-缴 2693
-薄 2694
-执 2695
-垫 2696
-靠 2697
-拢 2698
-萧 2699
-醋 2700
-脊 2701
-慰 2702
-攻 2703
-狱 2704
-吝 2705
-啬 2706
-煤 2707
-楞 2708
-脏 2709
-迷 2710
-椒 2711
-侄 2712
-璇 2713
-耐 2714
-庵 2715
-帽 2716
-崎 2717
-峻 2718
-援 2719
-娶 2720
-丫 2721
-犹 2722
-豫 2723
-罪 2724
-恶 2725
-陛 2726
-樟 2727
-截 2728
-巧 2729
-驰 2730
-轨 2731
-继 2732
-葱 2733
-蘸 2734
-汕 2735
-蜘 2736
-蛛 2737
-聋 2738
-俱 2739
-捉 2740
-卢 2741
-骆 2742
-氓 2743
-耶 2744
-仔 2745
-激 2746
-渊 2747
-钾 2748
-暖 2749
-钰 2750
-裁 2751
-判 2752
-略 2753
-墓 2754
-洪 2755
-凌 2756
-符 2757
-壮 2758
-陵 2759
-挥 2760
-夷 2761
-尘 2762
-沪 2763
-榆 2764
-涧 2765
-析 2766
-孝 2767
-弘 2768
-椅 2769
-贴 2770
-蛇 2771
-浣 2772
-镯 2773
-枣 2774
-佐 2775
-柑 2776
-谓 2777
-洞 2778
-漳 2779
-撕 2780
-叉 2781
-诛 2782
-糯 2783
-粽 2784
-碎 2785
-幅 2786
-赘 2787
-浆 2788
-循 2789
-偕 2790
-诙 2791
-阚 2792
-摘 2793
-串 2794
-悉 2795
-蜕 2796
-残 2797
-诅 2798
-祁 2799
-仪 2800
-璐 2801
-瑶 2802
-楠 2803
-崂 2804
-供 2805
-掖 2806
-椎 2807
-铆 2808
-钉 2809
-铐 2810
-镣 2811
-栋 2812
-潇 2813
-抓 2814
-屋 2815
-鸦 2816
-玄 2817
-芜 2818
-钨 2819
-毯 2820
-矿 2821
-缩 2822
-酶 2823
-焕 2824
-埃 2825
-霞 2826
-噢 2827
-韵 2828
-艾 2829
-虐 2830
-俘 2831
-颗 2832
-巩 2833
-牵 2834
-汝 2835
-搅 2836
-廉 2837
-啤 2838
-苑 2839
-辍 2840
-缝 2841
-纫 2842
-膜 2843
-娄 2844
-倩 2845
-魂 2846
-姜 2847
-彻 2848
-扉 2849
-镁 2850
-氢 2851
-铝 2852
-淀 2853
-雹 2854
-妍 2855
-鼎 2856
-碗 2857
-亭 2858
-闽 2859
-献 2860
-耻 2861
-畜 2862
-蚯 2863
-蚓 2864
-杆 2865
-靓 2866
-颖 2867
-瘾 2868
-腿 2869
-咧 2870
-嗦 2871
-忆 2872
-卑 2873
-鸽 2874
-藤 2875
-滑 2876
-蝇 2877
-蚂 2878
-蚁 2879
-迢 2880
-蝎 2881
-斑 2882
-赞 2883
-氧 2884
-姝 2885
-擎 2886
-憋 2887
-屈 2888
-讶 2889
-袜 2890
-吐 2891
-秽 2892
-哑 2893
-蓦 2894
-阑 2895
-珊 2896
-披 2897
-巫 2898
-妖 2899
-坪 2900
-疏 2901
-抒 2902
-炉 2903
-舰 2904
-贱 2905
-搬 2906
-遥 2907
-燃 2908
-咽 2909
-喉 2910
-熔 2911
-婵 2912
-奔 2913
-汗 2914
-蓄 2915
-辰 2916
-肩 2917
-洮 2918
-琅 2919
-径 2920
-廊 2921
-姬 2922
-衬 2923
-雯 2924
-滁 2925
-泗 2926
-筹 2927
-诵 2928
-奴 2929
-跨 2930
-娴 2931
-绯 2932
-惯 2933
-谎 2934
-蹈 2935
-潜 2936
-搂 2937
-逆 2938
-钞 2939
-辅 2940
-凶 2941
-橱 2942
-柜 2943
-婕 2944
-矮 2945
-邹 2946
-嫂 2947
-饼 2948
-撸 2949
-壶 2950
-握 2951
-鸳 2952
-鸯 2953
-寸 2954
-堕 2955
-哀 2956
-械 2957
-蜈 2958
-蚣 2959
-袁 2960
-鸿 2961
-穆 2962
-泊 2963
-衮 2964
-弗 2965
-雷 2966
-谜 2967
-俞 2968
-灏 2969
-毅 2970
-迈 2971
-蜂 2972
-辨 2973
-沂 2974
-灭 2975
-腊 2976
-脍 2977
-炙 2978
-卦 2979
-霄 2980
-扯 2981
-泾 2982
-脂 2983
-肪 2984
-淹 2985
-灌 2986
-辱 2987
-丸 2988
-账 2989
-秤 2990
-褐 2991
-芬 2992
-窖 2993
-慈 2994
-益 2995
-亿 2996
-颈 2997
-糜 2998
-隋 2999
-霉 3000
-署 3001
-狄 3002
-酪 3003
-旋 3004
-蔷 3005
-皱 3006
-纹 3007
-枯 3008
-粤 3009
-拔 3010
-菩 3011
-驱 3012
-咦 3013
-掀 3014
-菊 3015
-涩 3016
-耀 3017
-娥 3018
-奘 3019
-眯 3020
-芊 3021
-绪 3022
-沛 3023
-锐 3024
-姓 3025
-氏 3026
-垂 3027
-迫 3028
-絮 3029
-藕 3030
-捎 3031
-蓓 3032
-沫 3033
-奎 3034
-贩 3035
-泵 3036
-疑 3037
-岐 3038
-拓 3039
-詹 3040
-韦 3041
-粥 3042
-瞎 3043
-层 3044
-寿 3045
-淑 3046
-琦 3047
-履 3048
-痣 3049
-蔓 3050
-延 3051
-措 3052
-氰 3053
-胺 3054
-炭 3055
-鸥 3056
-谊 3057
-宛 3058
-悦 3059
-谣 3060
-茎 3061
-堆 3062
-鲤 3063
-坚 3064
-澎 3065
-溜 3066
-贫 3067
-擅 3068
-锌 3069
-竹 3070
-苟 3071
-磷 3072
-庐 3073
-嵌 3074
-潍 3075
-悚 3076
-岑 3077
-稀 3078
-奋 3079
-呦 3080
-梳 3081
-伐 3082
-芒 3083
-吶 3084
-凡 3085
-臂 3086
-驴 3087
-殿 3088
-雁 3089
-粹 3090
-凋 3091
-葵 3092
-烛 3093
-肾 3094
-尝 3095
-磨 3096
-晏 3097
-甄 3098
-嬛 3099
-盼 3100
-肇 3101
-咬 3102
-洼 3103
-匪 3104
-啰 3105
-硅 3106
-铅 3107
-矛 3108
-盾 3109
-贼 3110
-霜 3111
-螺 3112
-漏 3113
-帕 3114
-杉 3115
-矶 3116
-耗 3117
-责 3118
-靖 3119
-呸 3120
-驶 3121
-吞 3122
-睁 3123
-笼 3124
-茉 3125
-赈 3126
-纱 3127
-艘 3128
-炖 3129
-仿 3130
-瞬 3131
-嵊 3132
-澄 3133
-丞 3134
-摔 3135
-宠 3136
-爪 3137
-笋 3138
-庭 3139
-蜡 3140
-戈 3141
-锻 3142
-粗 3143
-糙 3144
-混 3145
-荚 3146
-曙 3147
-凄 3148
-抚 3149
-瀑 3150
-挖 3151
-掘 3152
-垮 3153
-奢 3154
-侈 3155
-揉 3156
-穹 3157
-钦 3158
-蛙 3159
-荧 3160
-悔 3161
-彦 3162
-忏 3163
-祸 3164
-攒 3165
-慌 3166
-簸 3167
-箕 3168
-繁 3169
-尖 3170
-芋 3171
-铠 3172
-沿 3173
-扮 3174
-隐 3175
-促 3176
-庾 3177
-葡 3178
-萄 3179
-硝 3180
-溶 3181
-淡 3182
-炅 3183
-昕 3184
-尧 3185
-妊 3186
-娠 3187
-曰 3188
-祛 3189
-枚 3190
-卒 3191
-陕 3192
-昭 3193
-龚 3194
-债 3195
-嗓 3196
-陷 3197
-阱 3198
-庞 3199
-盲 3200
-侵 3201
-匣 3202
-愤 3203
-怒 3204
-瞅 3205
-遭 3206
-脉 3207
-馒 3208
-愉 3209
-栗 3210
-鲍 3211
-挎 3212
-匆 3213
-缕 3214
-昵 3215
-鳄 3216
-阙 3217
-坟 3218
-捏 3219
-喽 3220
-雀 3221
-贯 3222
-苇 3223
-鹄 3224
-愈 3225
-裂 3226
-伪 3227
-劣 3228
-歹 3229
-溅 3230
-雌 3231
-猛 3232
-逞 3233
-饥 3234
-愚 3235
-牧 3236
-碰 3237
-帜 3238
-佝 3239
-偻 3240
-讪 3241
-馍 3242
-役 3243
-栈 3244
-唾 3245
-缆 3246
-袄 3247
-闸 3248
-织 3249
-筐 3250
-婉 3251
-昏 3252
-拖 3253
-毙 3254
-咙 3255
-褪 3256
-驼 3257
-壳 3258
-孽 3259
-审 3260
-脖 3261
-恳 3262
-孢 3263
-矫 3264
-臻 3265
-兖 3266
-俏 3267
-棍 3268
-唻 3269
-肘 3270
-俭 3271
-冕 3272
-葩 3273
-佑 3274
-鬓 3275
-柚 3276
-赴 3277
-崖 3278
-塌 3279
-厢 3280
-窈 3281
-窕 3282
-逑 3283
-卷 3284
-拂 3285
-蟑 3286
-螂 3287
-契 3288
-羞 3289
-函 3290
-逐 3291
-拌 3292
-肺 3293
-阻 3294
-纵 3295
-痰 3296
-狙 3297
-惋 3298
-枰 3299
-崽 3300
-胚 3301
-骡 3302
-萎 3303
-泄 3304
-呜 3305
-籁 3306
-濮 3307
-阆 3308
-琵 3309
-琶 3310
-跃 3311
-筝 3312
-勃 3313
-楂 3314
-奉 3315
-础 3316
-吆 3317
-壹 3318
-飙 3319
-虑 3320
-脆 3321
-黛 3322
-栓 3323
-逻 3324
-螃 3325
-轰 3326
-仑 3327
-券 3328
-逢 3329
-疮 3330
-私 3331
-窃 3332
-儋 3333
-泼 3334
-熬 3335
-焚 3336
-梨 3337
-吟 3338
-棱 3339
-稳 3340
-翘 3341
-祠 3342
-遮 3343
-瘤 3344
-稣 3345
-唇 3346
-阖 3347
-堡 3348
-禾 3349
-钗 3350
-爵 3351
-赐 3352
-绕 3353
-粘 3354
-癌 3355
-矜 3356
-虱 3357
-婧 3358
-坝 3359
-菏 3360
-隶 3361
-尺 3362
-滕 3363
-竿 3364
-恰 3365
-喱 3366
-冤 3367
-枉 3368
-叠 3369
-穴 3370
-搏 3371
-窦 3372
-栀 3373
-踪 3374
-昼 3375
-氯 3376
-陋 3377
-铭 3378
-禅 3379
-屑 3380
-巢 3381
-咻 3382
-喇 3383
-叭 3384
-棵 3385
-吊 3386
-诈 3387
-娇 3388
-绘 3389
-圩 3390
-仰 3391
-疙 3392
-瘩 3393
-桦 3394
-妾 3395
-丧 3396
-昊 3397
-湄 3398
-靴 3399
-迭 3400
-劝 3401
-溧 3402
-靡 3403
-梗 3404
-倪 3405
-刍 3406
-芽 3407
-篱 3408
-笆 3409
-漯 3410
-镖 3411
-协 3412
-叙 3413
-汾 3414
-豚 3415
-锷 3416
-瑙 3417
-瑜 3418
-伽 3419
-彰 3420
-扒 3421
-麝 3422
-赔 3423
-焉 3424
-亏 3425
-煅 3426
-翱 3427
-哽 3428
-煦 3429
-喷 3430
-舱 3431
-惨 3432
-哗 3433
-躲 3434
-佘 3435
-憾 3436
-旷 3437
-芹 3438
-簧 3439
-疹 3440
-簇 3441
-羹 3442
-刊 3443
-鹦 3444
-鹉 3445
-狡 3446
-猾 3447
-锯 3448
-呛 3449
-泛 3450
-汶 3451
-毗 3452
-衫 3453
-猕 3454
-祺 3455
-悸 3456
-昙 3457
-莽 3458
-杏 3459
-钮 3460
-叛 3461
-锄 3462
-砸 3463
-囤 3464
-犁 3465
-溃 3466
-疡 3467
-迦 3468
-轲 3469
-噜 3470
-犒 3471
-薰 3472
-薛 3473
-哺 3474
-竖 3475
-氟 3476
-渐 3477
-柒 3478
-贰 3479
-捌 3480
-傍 3481
-抹 3482
-褶 3483
-仇 3484
-偎 3485
-馅 3486
-旱 3487
-渭 3488
-昔 3489
-癣 3490
-挚 3491
-姻 3492
-炽 3493
-嘎 3494
-矢 3495
-汀 3496
-馋 3497
-淌 3498
-莓 3499
-貂 3500
-啧 3501
-茱 3502
-萸 3503
-涌 3504
-臀 3505
-恃 3506
-蒲 3507
-朴 3508
-嘟 3509
-扁 3510
-剂 3511
-歧 3512
-啪 3513
-啵 3514
-匠 3515
-帖 3516
-痒 3517
-睿 3518
-踢 3519
-衿 3520
-叽 3521
-崩 3522
-顽 3523
-嫌 3524
-扛 3525
-浔 3526
-拯 3527
-戊 3528
-戚 3529
-蛔 3530
-醇 3531
-笛 3532
-氛 3533
-沦 3534
-婊 3535
-仍 3536
-镍 3537
-渤 3538
-舶 3539
-哆 3540
-睹 3541
-萤 3542
-弧 3543
-辙 3544
-旯 3545
-纷 3546
-熄 3547
-挽 3548
-帘 3549
-蒸 3550
-橄 3551
-榄 3552
-滥 3553
-掩 3554
-兮 3555
-庸 3556
-玟 3557
-垦 3558
-惟 3559
-朕 3560
-脯 3561
-歪 3562
-吾 3563
-碘 3564
-锰 3565
-矾 3566
-拙 3567
-践 3568
-纠 3569
-赡 3570
-暨 3571
-凳 3572
-雾 3573
-缔 3574
-啫 3575
-毁 3576
-宥 3577
-邛 3578
-崃 3579
-禧 3580
-醛 3581
-滤 3582
-嘀 3583
-缪 3584
-萌 3585
-芥 3586
-胀 3587
-鲨 3588
-腩 3589
-勾 3590
-裳 3591
-雍 3592
-蹭 3593
-匹 3594
-髓 3595
-砍 3596
-孰 3597
-辩 3598
-唰 3599
-慷 3600
-慨 3601
-畏 3602
-坠 3603
-钝 3604
-箫 3605
-愧 3606
-劈 3607
-嘶 3608
-粮 3609
-轼 3610
-蟒 3611
-翊 3612
-澧 3613
-揽 3614
-烹 3615
-饪 3616
-踏 3617
-弛 3618
-婢 3619
-奸 3620
-掏 3621
-泓 3622
-袖 3623
-笈 3624
-刑 3625
-俑 3626
-浇 3627
-骊 3628
-蛀 3629
-蚤 3630
-杵 3631
-兹 3632
-晰 3633
-癫 3634
-痫 3635
-逝 3636
-炬 3637
-讼 3638
-陂 3639
-蚕 3640
-绸 3641
-槽 3642
-纨 3643
-牢 3644
-晃 3645
-窄 3646
-蒂 3647
-湃 3648
-硫 3649
-眨 3650
-耸 3651
-浠 3652
-梵 3653
-纺 3654
-贾 3655
-膨 3656
-阀 3657
-堀 3658
-扭 3659
-捂 3660
-扑 3661
-椭 3662
-鳟 3663
-丙 3664
-烯 3665
-冈 3666
-衷 3667
-牟 3668
-郫 3669
-畴 3670
-腥 3671
-亩 3672
-淤 3673
-禄 3674
-倘 3675
-烷 3676
-仆 3677
-刨 3678
-炜 3679
-挨 3680
-鳅 3681
-奚 3682
-峪 3683
-呻 3684
-佣 3685
-渔 3686
-肢 3687
-霏 3688
-旨 3689
-爰 3690
-吨 3691
-珑 3692
-隽 3693
-橙 3694
-箍 3695
-岚 3696
-啸 3697
-倌 3698
-剃 3699
-御 3700
-沸 3701
-棘 3702
-瘫 3703
-痪 3704
-仕 3705
-闺 3706
-炳 3707
-乏 3708
-拱 3709
-墅 3710
-铢 3711
-痤 3712
-琥 3713
-珈 3714
-荟 3715
-翩 3716
-搓 3717
-阮 3718
-芸 3719
-抠 3720
-弓 3721
-锣 3722
-赫 3723
-挡 3724
-侃 3725
-诶 3726
-沽 3727
-绫 3728
-濑 3729
-龈 3730
-乞 3731
-丐 3732
-宴 3733
-馁 3734
-牲 3735
-闰 3736
-亢 3737
-辫 3738
-铲 3739
-嫦 3740
-卵 3741
-佚 3742
-谬 3743
-倡 3744
-抑 3745
-赋 3746
-跆 3747
-削 3748
-氮 3749
-嫩 3750
-噻 3751
-蜗 3752
-鹂 3753
-靶 3754
-妥 3755
-衢 3756
-腻 3757
-砖 3758
-翎 3759
-拈 3760
-卉 3761
-皂 3762
-曦 3763
-荔 3764
-晤 3765
-曜 3766
-趵 3767
-纣 3768
-捞 3769
-蕲 3770
-猿 3771
-榈 3772
-憎 3773
-媚 3774
-绞 3775
-峙 3776
-饲 3777
-瑾 3778
-寡 3779
-釜 3780
-凸 3781
-凹 3782
-嫉 3783
-妒 3784
-婪 3785
-驸 3786
-荤 3787
-弥 3788
-蹦 3789
-驮 3790
-汞 3791
-唠 3792
-叨 3793
-袈 3794
-裟 3795
-毽 3796
-蔗 3797
-蹄 3798
-犍 3799
-珞 3800
-谚 3801
-煎 3802
-腋 3803
-瞳 3804
-丛 3805
-挪 3806
-榴 3807
-钩 3808
-梓 3809
-骁 3810
-烙 3811
-舜 3812
-暮 3813
-擀 3814
-兜 3815
-癜 3816
-姗 3817
-藜 3818
-擒 3819
-歼 3820
-冉 3821
-倚 3822
-漱 3823
-嫣 3824
-椰 3825
-隘 3826
-掐 3827
-栾 3828
-巍 3829
-咔 3830
-稽 3831
-惆 3832
-怅 3833
-镑 3834
-娲 3835
-芷 3836
-藻 3837
-伺 3838
-忌 3839
-桔 3840
-绅 3841
-坂 3842
-澜 3843
-嚓 3844
-苔 3845
-诣 3846
-倔 3847
-酿 3848
-槟 3849
-榔 3850
-粪 3851
-渺 3852
-馗 3853
-峨 3854
-碚 3855
-阎 3856
-巅 3857
-颊 3858
-戬 3859
-吒 3860
-鸵 3861
-岂 3862
-廖 3863
-娅 3864
-旬 3865
-猥 3866
-琐 3867
-扈 3868
-滔 3869
-枷 3870
-崴 3871
-捣 3872
-泻 3873
-甙 3874
-俯 3875
-撑 3876
-芮 3877
-舆 3878
-邂 3879
-逅 3880
-宪 3881
-晖 3882
-岔 3883
-哒 3884
-酵 3885
-痔 3886
-苓 3887
-捶 3888
-睫 3889
-裕 3890
-彤 3891
-潢 3892
-酉 3893
-聂 3894
-氨 3895
-嗷 3896
-皎 3897
-焖 3898
-袭 3899
-惦 3900
-惘 3901
-隙 3902
-彝 3903
-鞘 3904
-厄 3905
-殷 3906
-罕 3907
-嚏 3908
-拇 3909
-尹 3910
-蔻 3911
-颂 3912
-皖 3913
-霖 3914
-屉 3915
-崛 3916
-砣 3917
-穗 3918
-枸 3919
-杞 3920
-竣 3921
-勋 3922
-坍 3923
-溢 3924
-廓 3925
-煽 3926
-囚 3927
-涪 3928
-墩 3929
-琢 3930
-胳 3931
-膊 3932
-彼 3933
-陀 3934
-汹 3935
-柱 3936
-颁 3937
-闫 3938
-熠 3939
-叹 3940
-婿 3941
-娩 3942
-藓 3943
-岷 3944
-婺 3945
-桓 3946
-赁 3947
-罢 3948
-姊 3949
-瓢 3950
-桩 3951
-淫 3952
-堪 3953
-艰 3954
-枢 3955
-枞 3956
-晗 3957
-泷 3958
-逍 3959
-筱 3960
-烽 3961
-渍 3962
-蒿 3963
-殴 3964
-玖 3965
-罐 3966
-剿 3967
-喀 3968
-磕 3969
-铵 3970
-蕊 3971
-篓 3972
-痞 3973
-磅 3974
-礴 3975
-磐 3976
-拘 3977
-瘙 3978
-惕 3979
-孜 3980
-杖 3981
-撇 3982
-敖 3983
-踩 3984
-刹 3985
-蹿 3986
-坎 3987
-氦 3988
-汨 3989
-垣 3990
-垢 3991
-胁 3992
-趴 3993
-苷 3994
-镒 3995
-幢 3996
-鞠 3997
-逾 3998
-鬃 3999
-尉 4000
-韧 4001
-锤 4002
-嘘 4003
-呷 4004
-噎 4005
-煲 4006
-恍 4007
-粱 4008
-亳 4009
-鳞 4010
-懦 4011
-酚 4012
-酞 4013
-哨 4014
-祀 4015
-刃 4016
-蕴 4017
-晟 4018
-菀 4019
-甬 4020
-鼾 4021
-鳖 4022
-螳 4023
-稼 4024
-栽 4025
-蝗 4026
-颌 4027
-咀 4028
-掠 4029
-嘱 4030
-甚 4031
-菅 4032
-奂 4033
-讽 4034
-秸 4035
-釉 4036
-坞 4037
-雇 4038
-绢 4039
-捧 4040
-狈 4041
-桀 4042
-骜 4043
-摊 4044
-臆 4045
-竺 4046
-栅 4047
-贬 4048
-飒 4049
-浸 4050
-噩 4051
-晾 4052
-绐 4053
-殡 4054
-挠 4055
-於 4056
-茁 4057
-瞪 4058
-窠 4059
-汰 4060
-魁 4061
-忒 4062
-璋 4063
-怠 4064
-莺 4065
-冶 4066
-绰 4067
-邈 4068
-圻 4069
-湮 4070
-亨 4071
-躬 4072
-砂 4073
-鹭 4074
-浊 4075
-楹 4076
-珉 4077
-撵 4078
-筏 4079
-荨 4080
-鳝 4081
-沥 4082
-邳 4083
-殉 4084
-憨 4085
-啼 4086
-熏 4087
-蜃 4088
-毋 4089
-彗 4090
-噪 4091
-绛 4092
-祟 4093
-蝙 4094
-蝠 4095
-漆 4096
-酰 4097
-锑 4098
-栖 4099
-肆 4100
-邕 4101
-弋 4102
-绽 4103
-嚼 4104
-霹 4105
-雳 4106
-谍 4107
-恹 4108
-怏 4109
-倦 4110
-轶 4111
-曛 4112
-疚 4113
-棚 4114
-漕 4115
-浃 4116
-勘 4117
-暄 4118
-趁 4119
-斥 4120
-苞 4121
-膳 4122
-赎 4123
-崭 4124
-笙 4125
-摁 4126
-嗅 4127
-瞒 4128
-舵 4129
-铸 4130
-咫 4131
-涅 4132
-瘪 4133
-潼 4134
-粑 4135
-漾 4136
-噶 4137
-鸠 4138
-铉 4139
-豹 4140
-遛 4141
-襟 4142
-壤 4143
-甭 4144
-吮 4145
-耒 4146
-钊 4147
-泞 4148
-拦 4149
-昱 4150
-腑 4151
-惧 4152
-韬 4153
-焗 4154
-窘 4155
-喳 4156
-溏 4157
-鲛 4158
-慵 4159
-菁 4160
-攥 4161
-埔 4162
-呕 4163
-蓑 4164
-笠 4165
-孑 4166
-咕 4167
-觐 4168
-漓 4169
-碾 4170
-浜 4171
-嬉 4172
-迂 4173
-笃 4174
-勉 4175
-锥 4176
-篷 4177
-亥 4178
-龌 4179
-龊 4180
-煞 4181
-蓟 4182
-皓 4183
-惰 4184
-勺 4185
-缨 4186
-峥 4187
-苯 4188
-豁 4189
-颓 4190
-拽 4191
-啄 4192
-麒 4193
-雎 4194
-鲢 4195
-睬 4196
-渣 4197
-唔 4198
-桧 4199
-癞 4200
-蛤 4201
-蟆 4202
-撩 4203
-酯 4204
-戳 4205
-舔 4206
-孺 4207
-怂 4208
-恿 4209
-臃 4210
-戟 4211
-惭 4212
-耿 4213
-徵 4214
-柬 4215
-朽 4216
-磺 4217
-媲 4218
-懿 4219
-悼 4220
-绎 4221
-缅 4222
-茜 4223
-瞻 4224
-炀 4225
-脓 4226
-罄 4227
-秃 4228
-拎 4229
-譬 4230
-榉 4231
-拭 4232
-玥 4233
-崆 4234
-峒 4235
-胛 4236
-糗 4237
-佗 4238
-佬 4239
-袍 4240
-炊 4241
-仞 4242
-霎 4243
-掺 4244
-匀 4245
-姹 4246
-妯 4247
-娌 4248
-帷 4249
-岢 4250
-柄 4251
-阪 4252
-玺 4253
-窑 4254
-肽 4255
-涡 4256
-窟 4257
-阉 4258
-硼 4259
-蛳 4260
-呤 4261
-砚 4262
-偌 4263
-贿 4264
-芗 4265
-蹊 4266
-跷 4267
-雏 4268
-膝 4269
-嗜 4270
-扦 4271
-涟 4272
-殆 4273
-郡 4274
-洵 4275
-酋 4276
-匡 4277
-胤 4278
-撤 4279
-辗 4280
-冀 4281
-捺 4282
-吏 4283
-衩 4284
-腕 4285
-灸 4286
-绔 4287
-瓯 4288
-蜻 4289
-蜓 4290
-窜 4291
-躯 4292
-髦 4293
-诏 4294
-缄 4295
-筠 4296
-沌 4297
-酐 4298
-皋 4299
-隧 4300
-鹊 4301
-傀 4302
-儡 4303
-诲 4304
-嘏 4305
-寅 4306
-骇 4307
-喧 4308
-癀 4309
-瑚 4310
-碉 4311
-羔 4312
-掂 4313
-痹 4314
-孚 4315
-绡 4316
-馊 4317
-虏 4318
-悖 4319
-漪 4320
-琉 4321
-缉 4322
-冥 4323
-饯 4324
-蔺 4325
-瘆 4326
-榨 4327
-盯 4328
-鄞 4329
-妨 4330
-哐 4331
-寇 4332
-鹃 4333
-卞 4334
-喘 4335
-藩 4336
-踹 4337
-粟 4338
-陨 4339
-遣 4340
-鳌 4341
-烨 4342
-抉 4343
-臧 4344
-墉 4345
-疽 4346
-拷 4347
-赃 4348
-哮 4349
-馥 4350
-砰 4351
-拗 4352
-汐 4353
-矣 4354
-沅 4355
-裴 4356
-阐 4357
-蟋 4358
-蟀 4359
-蚀 4360
-恁 4361
-恙 4362
-蝉 4363
-荀 4364
-彧 4365
-銮 4366
-侮 4367
-驿 4368
-婶 4369
-檀 4370
-哩 4371
-镐 4372
-轴 4373
-扳 4374
-飓 4375
-麓 4376
-牺 4377
-垛 4378
-稞 4379
-桴 4380
-痧 4381
-揣 4382
-殇 4383
-邬 4384
-撅 4385
-邸 4386
-鼬 4387
-剥 4388
-胥 4389
-撼 4390
-溟 4391
-鄱 4392
-鲫 4393
-觅 4394
-犊 4395
-恕 4396
-铂 4397
-褔 4398
-淼 4399
-骝 4400
-藉 4401
-裔 4402
-痨 4403
-颤 4404
-尴 4405
-尬 4406
-癖 4407
-拄 4408
-蠕 4409
-虻 4410
-迄 4411
-攸 4412
-浚 4413
-盔 4414
-肮 4415
-侬 4416
-锏 4417
-憧 4418
-憬 4419
-镰 4420
-懈 4421
-挟 4422
-缤 4423
-涎 4424
-睾 4425
-惶 4426
-褚 4427
-藐 4428
-眺 4429
-艇 4430
-昀 4431
-妄 4432
-祗 4433
-壬 4434
-浯 4435
-衲 4436
-來 4437
-黯 4438
-芩 4439
-敞 4440
-绊 4441
-娣 4442
-掷 4443
-茯 4444
-琍 4445
-蛹 4446
-钧 4447
-瘘 4448
-蜥 4449
-蜴 4450
-唬 4451
-驭 4452
-阂 4453
-诃 4454
-疟 4455
-潦 4456
-谀 4457
-肱 4458
-黏 4459
-甥 4460
-眶 4461
-秩 4462
-庇 4463
-钏 4464
-咝 4465
-肴 4466
-宸 4467
-湟 4468
-沣 4469
-煊 4470
-盂 4471
-弈 4472
-瞩 4473
-聆 4474
-疥 4475
-腼 4476
-腆 4477
-胭 4478
-匕 4479
-讳 4480
-戮 4481
-茧 4482
-趾 4483
-亵 4484
-吖 4485
-漩 4486
-逵 4487
-寰 4488
-滇 4489
-渎 4490
-寮 4491
-嘁 4492
-珂 4493
-珀 4494
-稠 4495
-羌 4496
-徘 4497
-徊 4498
-苛 4499
-蕨 4500
-薏 4501
-苡 4502
-戌 4503
-卯 4504
-馈 4505
-溥 4506
-熹 4507
-屡 4508
-巳 4509
-璜 4510
-铮 4511
-踊 4512
-锚 4513
-濠 4514
-噫 4515
-怦 4516
-蓥 4517
-碌 4518
-霓 4519
-牦 4520
-妤 4521
-屹 4522
-缈 4523
-蹉 4524
-驷 4525
-菡 4526
-谔 4527
-琛 4528
-吡 4529
-喹 4530
-呲 4531
-溺 4532
-鳗 4533
-慑 4534
-秆 4535
-骋 4536
-脐 4537
-涤 4538
-荞 4539
-淅 4540
-罘 4541
-焱 4542
-孵 4543
-斟 4544
-酌 4545
-痊 4546
-秉 4547
-砌 4548
-瘁 4549
-胱 4550
-笫 4551
-燮 4552
-衅 4553
-腱 4554
-垒 4555
-锟 4556
-缀 4557
-疵 4558
-墟 4559
-盏 4560
-舂 4561
-侗 4562
-琨 4563
-唧 4564
-怆 4565
-沮 4566
-敛 4567
-瑕 4568
-奠 4569
-汴 4570
-衙 4571
-歆 4572
-嘹 4573
-饽 4574
-拧 4575
-濒 4576
-锭 4577
-嬴 4578
-吱 4579
-靳 4580
-眸 4581
-渲 4582
-睦 4583
-蝼 4584
-瞿 4585
-剁 4586
-紊 4587
-翟 4588
-攘 4589
-蹂 4590
-躏 4591
-淞 4592
-跎 4593
-侍 4594
-铛 4595
-绷 4596
-仟 4597
-瀚 4598
-赉 4599
-俪 4600
-魄 4601
-吼 4602
-酗 4603
-嚒 4604
-彷 4605
-徨 4606
-煜 4607
-曝 4608
-嗑 4609
-俅 4610
-嵘 4611
-隍 4612
-唆 4613
-郜 4614
-栩 4615
-尻 4616
-咗 4617
-茗 4618
-疱 4619
-斐 4620
-菘 4621
-芎 4622
-帼 4623
-枭 4624
-矩 4625
-仨 4626
-幄 4627
-鲸 4628
-猬 4629
-梢 4630
-槐 4631
-璧 4632
-坷 4633
-逯 4634
-踝 4635
-濡 4636
-樵 4637
-肓 4638
-劵 4639
-羚 4640
-髫 4641
-笄 4642
-俾 4643
-匿 4644
-帛 4645
-孀 4646
-焙 4647
-瘟 4648
-籽 4649
-萦 4650
-灼 4651
-箴 4652
-筵 4653
-窒 4654
-裆 4655
-旎 4656
-砝 4657
-妲 4658
-恺 4659
-覃 4660
-寐 4661
-酝 4662
-啃 4663
-塬 4664
-醴 4665
-蜿 4666
-蜒 4667
-愣 4668
-恤 4669
-撂 4670
-瘸 4671
-檐 4672
-琰 4673
-狒 4674
-摧 4675
-诠 4676
-孪 4677
-嘚 4678
-鼹 4679
-囡 4680
-茴 4681
-噤 4682
-僻 4683
-钕 4684
-锴 4685
-渗 4686
-嗫 4687
-撮 4688
-缭 4689
-粼 4690
-咄 4691
-挝 4692
-蛾 4693
-恪 4694
-皙 4695
-莒 4696
-叼 4697
-诽 4698
-妩 4699
-叱 4700
-咤 4701
-挞 4702
-萼 4703
-饵 4704
-澹 4705
-惺 4706
-呶 4707
-铤 4708
-佟 4709
-丕 4710
-靛 4711
-伶 4712
-涣 4713
-桢 4714
-狭 4715
-卅 4716
-蟠 4717
-蟾 4718
-朦 4719
-胧 4720
-咆 4721
-滦 4722
-岖 4723
-篙 4724
-痍 4725
-胰 4726
-谏 4727
-坳 4728
-樯 4729
-橹 4730
-孬 4731
-潴 4732
-厥 4733
-椐 4734
-谩 4735
-恬 4736
-琬 4737
-遁 4738
-褥 4739
-咎 4740
-羁 4741
-苣 4742
-殁 4743
-懵 4744
-褒 4745
-蜚 4746
-蛊 4747
-筛 4748
-耙 4749
-耨 4750
-嬷 4751
-驯 4752
-赅 4753
-畲 4754
-滢 4755
-伎 4756
-庹 4757
-踉 4758
-戎 4759
-膛 4760
-嗡 4761
-吔 4762
-唏 4763
-喏 4764
-哧 4765
-缇 4766
-蚝 4767
-璀 4768
-璨 4769
-捅 4770
-妁 4771
-曳 4772
-吩 4773
-咐 4774
-罂 4775
-垌 4776
-揪 4777
-壕 4778
-跺 4779
-辘 4780
-轳 4781
-噔 4782
-斓 4783
-厮 4784
-叁 4785
-仄 4786
-沼 4787
-鸢 4788
-醪 4789
-郢 4790
-圃 4791
-碜 4792
-鲅 4793
-嚯 4794
-淳 4795
-迩 4796
-诋 4797
-鬟 4798
-汲 4799
-艮 4800
-跤 4801
-麋 4802
-橇 4803
-悱 4804
-恻 4805
-啷 4806
-惮 4807
-樨 4808
-毓 4809
-裱 4810
-堇 4811
-埸 4812
-叵 4813
-腚 4814
-畀 4815
-钼 4816
-赦 4817
-悯 4818
-谴 4819
-稷 4820
-嘢 4821
-盎 4822
-跶 4823
-窥 4824
-瑄 4825
-谤 4826
-柘 4827
-垄 4828
-蠡 4829
-邝 4830
-娆 4831
-俐 4832
-铷 4833
-肋 4834
-涿 4835
-俎 4836
-捜 4837
-罡 4838
-嗝 4839
-唛 4840
-酣 4841
-鹬 4842
-瑀 4843
-帚 4844
-镭 4845
-搽 4846
-钣 4847
-蜇 4848
-嗞 4849
-颉 4850
-耘 4851
-忡 4852
-噼 4853
-睐 4854
-簋 4855
-镚 4856
-朐 4857
-戛 4858
-扪 4859
-鹩 4860
-稹 4861
-嗣 4862
-睇 4863
-弩 4864
-侥 4865
-绚 4866
-虔 4867
-溴 4868
-毂 4869
-漉 4870
-郧 4871
-杈 4872
-埭 4873
-哝 4874
-纾 4875
-箔 4876
-蚍 4877
-呋 4878
-喃 4879
-旌 4880
-袅 4881
-嫡 4882
-2 4883
-睢 4884
-榭 4885
-濉 4886
-雉 4887
-糍 4888
-谙 4889
-坻 4890
-遨 4891
-囔 4892
-鹜 4893
-垩 4894
-嵋 4895
-葑 4896
-叻 4897
-剌 4898
-铀 4899
-鲟 4900
-珏 4901
-唑 4902
-拴 4903
-乍 4904
-镊 4905
-歩 4906
-姘 4907
-戍 4908
-娈 4909
-槿 4910
-魇 4911
-叩 4912
-啾 4913
-腈 4914
-骞 4915
-殃 4916
-髋 4917
-嶙 4918
-璟 4919
-嚷 4920
-鹳 4921
-嗬 4922
-梆 4923
-晁 4924
-龛 4925
-嚎 4926
-熨 4927
-倭 4928
-峦 4929
-蜍 4930
-桉 4931
-齁 4932
-搀 4933
-铬 4934
-刽 4935
-謝 4936
-沒 4937
-簪 4938
-邺 4939
-嵬 4940
-馄 4941
-饨 4942
-蜢 4943
-嗒 4944
-芨 4945
-弶 4946
-晞 4947
-搔 4948
-昴 4949
-夙 4950
-徙 4951
-霾 4952
-嗖 4953
-碴 4954
-秧 4955
-芍 4956
-匝 4957
-泫 4958
-琯 4959
-扼 4960
-砒 4961
-栎 4962
-卟 4963
-琊 4964
-怯 4965
-侩 4966
-峯 4967
-忿 4968
-藁 4969
-蹼 4970
-毡 4971
-埤 4972
-膘 4973
-噗 4974
-阕 4975
-嘭 4976
-椿 4977
-涸 4978
-祯 4979
-芵 4980
-螨 4981
-寥 4982
-梶 4983
-嘈 4984
-泠 4985
-侏 4986
-棂 4987
-缶 4988
-捋 4989
-钜 4990
-璞 4991
-媞 4992
-唢 4993
-邰 4994
-蚱 4995
-薜 4996
-牒 4997
-缥 4998
-咿 4999
-遐 5000
-蕙 5001
-惬 5002
-惚 5003
-硚 5004
-麽 5005
-踌 5006
-褂 5007
-蜉 5008
-蝣 5009
-腌 5010
-熘 5011
-缮 5012
-锢 5013
-犽 5014
-蹬 5015
-皈 5016
-剔 5017
-芪 5018
-妪 5019
-钇 5020
-仃 5021
-荏 5022
-苒 5023
-塾 5024
-阡 5025
-瑨 5026
-冢 5027
-匈 5028
-庶 5029
-荃 5030
-茬 5031
-妗 5032
-暹 5033
-犷 5034
-嵴 5035
-鳃 5036
-羲 5037
-岱 5038
-烩 5039
-勐 5040
-霁 5041
-厝 5042
-飚 5043
-瀛 5044
-炕 5045
-桅 5046
-垓 5047
-晌 5048
-黒 5049
-蚩 5050
-夔 5051
-垚 5052
-烊 5053
-眀 5054
-荼 5055
-蘼 5056
-尅 5057
-舫 5058
-拣 5059
-蹋 5060
-劭 5061
-耆 5062
-陡 5063
-樽 5064
-谒 5065
-觞 5066
-箩 5067
-槛 5068
-傈 5069
-僳 5070
-爻 5071
-皑 5072
-滘 5073
-嬅 5074
-丶 5075
-邋 5076
-遢 5077
-讴 5078
-隅 5079
-邃 5080
-谑 5081
-哔 5082
-矬 5083
-姣 5084
-凛 5085
-冽 5086
-殒 5087
-眈 5088
-鹧 5089
-鸪 5090
-飕 5091
-亘 5092
-篝 5093
-嘅 5094
-乜 5095
-黜 5096
-颇 5097
-鄄 5098
-蔫 5099
-贻 5100
-猝 5101
-绌 5102
-芈 5103
-隼 5104
-戆 5105
-鹫 5106
-霑 5107
-宕 5108
-凇 5109
-铨 5110
-町 5111
-礁 5112
-蕃 5113
-淖 5114
-搐 5115
-饴 5116
-榛 5117
-晔 5118
-祢 5119
-酥 5120
-丨 5121
-赂 5122
-噘 5123
-黍 5124
-幌 5125
-骅 5126
-黝 5127
-帧 5128
-胯 5129
-埙 5130
-敕 5131
-涓 5132
-掣 5133
-圪 5134
-榻 5135
-濛 5136
-擞 5137
-篡 5138
-榷 5139
-亟 5140
-渌 5141
-锹 5142
-啐 5143
-捍 5144
-嘣 5145
-跻 5146
-桠 5147
-贮 5148
-蛰 5149
-猖 5150
-骸 5151
-溉 5152
-铎 5153
-吁 5154
-溯 5155
-踞 5156
-俨 5157
-茌 5158
-蒯 5159
-篆 5160
-膺 5161
-垭 5162
-匮 5163
-撰 5164
-擂 5165
-倜 5166
-傥 5167
-蔑 5168
-弼 5169
-珮 5170
-颢 5171
-钿 5172
-迸 5173
-凿 5174
-湫 5175
-焯 5176
-硒 5177
-畈 5178
-觑 5179
-揶 5180
-禀 5181
-宦 5182
-杷 5183
-讷 5184
-踮 5185
-掳 5186
-窿 5187
-捻 5188
-褴 5189
-褛 5190
-瑛 5191
-胫 5192
-喋 5193
-沓 5194
-汛 5195
-掴 5196
-魉 5197
-馀 5198
-隗 5199
-咘 5200
-呱 5201
-獭 5202
-畊 5203
-莜 5204
-祐 5205
-轧 5206
-魍 5207
-昶 5208
-诓 5209
-囗 5210
-莠 5211
-岌 5212
-潸 5213
-涞 5214
-綦 5215
-畸 5216
-阄 5217
-遏 5218
-啶 5219
-冇 5220
-懋 5221
-煨 5222
-羱 5223
-诟 5224
-枳 5225
-鲶 5226
-燊 5227
-猷 5228
-铄 5229
-缰 5230
-搪 5231
-赊 5232
-诩 5233
-佼 5234
-钵 5235
-谌 5236
-嬗 5237
-砥 5238
-砺 5239
-觊 5240
-觎 5241
-颅 5242
-怵 5243
-疸 5244
-锆 5245
-缢 5246
-棣 5247
-蛎 5248
-鄯 5249
-茸 5250
-谶 5251
-蹶 5252
-侑 5253
-滂 5254
-襁 5255
-褓 5256
-杳 5257
-臊 5258
-摒 5259
-袂 5260
-掸 5261
-鹞 5262
-忱 5263
-湉 5264
-汩 5265
-剽 5266
-槌 5267
-塍 5268
-喟 5269
-讹 5270
-抡 5271
-烃 5272
-咁 5273
-珺 5274
-槎 5275
-砼 5276
-泯 5277
-泮 5278
-遴 5279
-匾 5280
-沏 5281
-悌 5282
-麾 5283
-垡 5284
-鏖 5285
-垅 5286
-斛 5287
-镂 5288
-骷 5289
-髅 5290
-豺 5291
-诿 5292
-狰 5293
-狞 5294
-泱 5295
-榫 5296
-嗤 5297
-瞥 5298
-揄 5299
-哌 5300
-婀 5301
-恸 5302
-蛐 5303
-镀 5304
-霈 5305
-钒 5306
-踱 5307
-淆 5308
-薹 5309
-纭 5310
-瘠 5311
-戾 5312
-夭 5313
-铰 5314
-渚 5315
-犇 5316
-舀 5317
-傣 5318
-獗 5319
-瞭 5320
-兢 5321
-犟 5322
-袒 5323
-铖 5324
-颚 5325
-徜 5326
-徉 5327
-囍 5328
-酆 5329
-铡 5330
-睽 5331
-裨 5332
-饕 5333
-躇 5334
-噱 5335
-赓 5336
-懊 5337
-蟊 5338
-趸 5339
-鄢 5340
-埝 5341
-椟 5342
-粳 5343
-跛 5344
-莴 5345
-娉 5346
-嗄 5347
-邙 5348
-渑 5349
-佶 5350
-颍 5351
-溆 5352
-诧 5353
-抨 5354
-憷 5355
-涠 5356
-痼 5357
-砀 5358
-剐 5359
-缙 5360
-鞑 5361
-坭 5362
-烬 5363
-唁 5364
-臼 5365
-瓮 5366
-袱 5367
-珩 5368
-蝌 5369
-蚪 5370
-诬 5371
-迥 5372
-楸 5373
-皿 5374
-蜷 5375
-遑 5376
-啖 5377
-篪 5378
-崮 5379
-讧 5380
-盹 5381
-瞑 5382
-鲳 5383
-谟 5384
-拮 5385
-琏 5386
-瞰 5387
-憩 5388
-馏 5389
-炷 5390
-眩 5391
-羿 5392
-洙 5393
-珲 5394
-愫 5395
-佯 5396
-舸 5397
-祎 5398
-旮 5399
-翌 5400
-畿 5401
-桎 5402
-梏 5403
-钳 5404
-鳍 5405
-犸 5406
-祉 5407
-缜 5408
-硌 5409
-殓 5410
-砾 5411
-酩 5412
-酊 5413
-兀 5414
-矸 5415
-髙 5416
-疝 5417
-膑 5418
-哂 5419
-僚 5420
-耷 5421
-窨 5422
-孳 5423
-鲠 5424
-淝 5425
-搡 5426
-伢 5427
-鲷 5428
-谕 5429
-頫 5430
-泺 5431
-谧 5432
-煳 5433
-萁 5434
-馕 5435
-鹌 5436
-鹑 5437
-钴 5438
-埇 5439
-摈 5440
-踵 5441
-冗 5442
-铣 5443
-萃 5444
-忤 5445
-揩 5446
-铧 5447
-矗 5448
-闾 5449
-柞 5450
-貉 5451
-撺 5452
-掇 5453
-灞 5454
-醍 5455
-痱 5456
-粲 5457
-糠 5458
-讣 5459
-蹴 5460
-茆 5461
-螈 5462
-旻 5463
-蔼 5464
-咣 5465
-麸 5466
-涝 5467
-渥 5468
-垤 5469
-咭 5470
-玳 5471
-瑁 5472
-郏 5473
-纂 5474
-扞 5475
-峭 5476
-铩 5477
-锨 5478
-坩 5479
-埚 5480
-瑭 5481
-札 5482
-舛 5483
-臬 5484
-郯 5485
-晦 5486
-耄 5487
-耋 5488
-俚 5489
-鲭 5490
-柩 5491
-黟 5492
-骼 5493
-蛆 5494
-跋 5495
-俸 5496
-幡 5497
-愕 5498
-噙 5499
-峋 5500
-厩 5501
-夯 5502
-擢 5503
-枋 5504
-葳 5505
-偃 5506
-赝 5507
-昝 5508
-镉 5509
-嫔 5510
-潋 5511
-娓 5512
-郅 5513
-瘀 5514
-奄 5515
-荇 5516
-咂 5517
-痉 5518
-挛 5519
-祚 5520
-庖 5521
-纰 5522
-簌 5523
-淬 5524
-掮 5525
-俟 5526
-臾 5527
-雒 5528
-吋 5529
-颧 5530
-嗔 5531
-诘 5532
-焘 5533
-獾 5534
-氤 5535
-氲 5536
-鲲 5537
-麂 5538
-罹 5539
-澍 5540
-镳 5541
-囱 5542
-玷 5543
-嗳 5544
-擘 5545
-濂 5546
-逡 5547
-骛 5548
-镔 5549
-湍 5550
-讥 5551
-蹁 5552
-跹 5553
-淦 5554
-骰 5555
-疃 5556
-腓 5557
-嵇 5558
-怄 5559
-谯 5560
-啕 5561
-坯 5562
-钎 5563
-锒 5564
-伉 5565
-佻 5566
-腴 5567
-怼 5568
-浐 5569
-摹 5570
-僮 5571
-芾 5572
-矍 5573
-泔 5574
-蚬 5575
-屐 5576
-翕 5577
-唿 5578
-苋 5579
-氪 5580
-楔 5581
-莪 5582
-掬 5583
-舷 5584
-骐 5585
-嗲 5586
-荻 5587
-缱 5588
-绻 5589
-嫚 5590
-铟 5591
-饷 5592
-醐 5593
-伫 5594
-澶 5595
-郇 5596
-蹚 5597
-藿 5598
-鳕 5599
-蝈 5600
-钯 5601
-铍 5602
-骠 5603
-盅 5604
-蜊 5605
-腭 5606
-谘 5607
-孛 5608
-豇 5609
-囫 5610
-囵 5611
-抿 5612
-楣 5613
-廾 5614
-貔 5615
-貅 5616
-蛉 5617
-猹 5618
-蚴 5619
-轱 5620
-葚 5621
-胗 5622
-鸮 5623
-篦 5624
-谆 5625
-篑 5626
-莅 5627
-砷 5628
-蝾 5629
-疴 5630
-葺 5631
-瘴 5632
-滹 5633
-砭 5634
-噌 5635
-鸾 5636
-珙 5637
-碣 5638
-餮 5639
-荸 5640
-荠 5641
-犄 5642
-歙 5643
-樾 5644
-淙 5645
-痢 5646
-濯 5647
-轫 5648
-琮 5649
-啜 5650
-闳 5651
-椁 5652
-蓼 5653
-垴 5654
-唷 5655
-炔 5656
-峁 5657
-囹 5658
-尕 5659
-嗪 5660
-缎 5661
-拚 5662
-稔 5663
-牍 5664
-赳 5665
-忪 5666
-菖 5667
-佃 5668
-埂 5669
-宓 5670
-瞠 5671
-洹 5672
-锲 5673
-睑 5674
-攫 5675
-竽 5676
-蹩 5677
-慜 5678
-锉 5679
-羧 5680
-崧 5681
-醺 5682
-舐 5683
-讫 5684
-熵 5685
-▁GONNA 5686
-瘢 5687
-秭 5688
-跄 5689
-绀 5690
-懑 5691
-弭 5692
-萋 5693
-篁 5694
-缛 5695
-茭 5696
-吠 5697
-鲑 5698
-幔 5699
-潺 5700
-鹈 5701
-鹕 5702
-椴 5703
-哕 5704
-剜 5705
-湎 5706
-玑 5707
-槃 5708
-暌 5709
-蹒 5710
-跚 5711
-恣 5712
-磬 5713
-悭 5714
-劾 5715
-唳 5716
-绉 5717
-枇 5718
-蜱 5719
-瞟 5720
-膈 5721
-磴 5722
-嶂 5723
-苫 5724
-邡 5725
-骈 5726
-惴 5727
-硖 5728
-鳜 5729
-羸 5730
-秣 5731
-殚 5732
-桷 5733
-罔 5734
-颦 5735
-桁 5736
-鸩 5737
-孱 5738
-伥 5739
-愎 5740
-圄 5741
-贲 5742
-旖 5743
-荥 5744
-徇 5745
-镌 5746
-偈 5747
-敝 5748
-刎 5749
-跬 5750
-欸 5751
-髌 5752
-椤 5753
-觥 5754
-踟 5755
-斡 5756
-陉 5757
-谡 5758
-龅 5759
-鸨 5760
-豢 5761
-豉 5762
-悻 5763
-曈 5764
-茼 5765
-谗 5766
-忖 5767
-牯 5768
-痂 5769
-虢 5770
-馓 5771
-跖 5772
-聿 5773
-箅 5774
-塅 5775
-丼 5776
-獐 5777
-肏 5778
-逄 5779
-钡 5780
-叒 5781
-霭 5782
-鲮 5783
-凫 5784
-鹥 5785
-鳙 5786
-玦 5787
-蒡 5788
-嘬 5789
-鹗 5790
-鬄 5791
-鎏 5792
-嘤 5793
-绦 5794
-涔 5795
-齑 5796
-蒌 5797
-墘 5798
-俠 5799
-蛭 5800
-薅 5801
-叕 5802
-砧 5803
-嘧 5804
-媺 5805
-蚵 5806
-楽 5807
-浄 5808
-厍 5809
-鳊 5810
-泂 5811
-龋 5812
-瓒 5813
-瑧 5814
-邨 5815
-峣 5816
-蚺 5817
-鲉 5818
-滟 5819
-堑 5820
-豳 5821
-骧 5822
-艹 5823
-柾 5824
-鬣 5825
-眦 5826
-畦 5827
-虬 5828
-睨 5829
-飨 5830
-蘖 5831
-羟 5832
-瓤 5833
-岫 5834
-惇 5835
-鲵 5836
-痦 5837
-笤 5838
-憙 5839
-痩 5840
-煋 5841
-媤 5842
-佤 5843
-羮 5844
-鏊 5845
-昇 5846
-蛱 5847
-珅 5848
-庋 5849
-搵 5850
-旸 5851
-岿 5852
-亓 5853
-揸 5854
-谂 5855
-淠 5856
-糅 5857
-儆 5858
-苕 5859
-刿 5860
-呒 5861
-岙 5862
-荜 5863
-玧 5864
-鄠 5865
-讬 5866
-祕 5867
-箦 5868
-醚 5869
-膻 5870
-笕 5871
-蛏 5872
-哞 5873
-饸 5874
-饹 5875
-愻 5876
-汫 5877
-鹇 5878
-栉 5879
-沇 5880
-擤 5881
-徳 5882
-黢 5883
-狍 5884
-錫 5885
-暝 5886
-機 5887
-鉅 5888
-菓 5889
-廋 5890
-橛 5891
-羣 5892
-笊 5893
-魃 5894
-掼 5895
-魑 5896
-靥 5897
-酔 5898
-铱 5899
-峄 5900
-哋 5901
-畹 5902
-鍪 5903
-髀 5904
-嚄 5905
-秾 5906
-苾 5907
-孓 5908
-汆 5909
-嗟 5910
-锺 5911
-睥 5912
-炝 5913
-怔 5914
-咛 5915
-巉 5916
-墒 5917
-岘 5918
-禛 5919
-陟 5920
-皲 5921
-萘 5922
-妣 5923
-芃 5924
-煸 5925
-郦 5926
-蒗 5927
-仝 5928
-抻 5929
-苜 5930
-蓿 5931
-鎵 5932
-減 5933
-燧 5934
-娭 5935
-毑 5936
-诂 5937
-烔 5938
-猗 5939
-哏 5940
-氙 5941
-匯 5942
-颛 5943
-鞣 5944
-笺 5945
-枖 5946
-忾 5947
-黉 5948
-埯 5949
-敩 5950
-玏 5951
-钺 5952
-纥 5953
-佈 5954
-沖 5955
-蚜 5956
-莨 5957
-菟 5958
-麇 5959
-沤 5960
-耦 5961
-赭 5962
-祊 5963
-璄 5964
-旼 5965
-浞 5966
-痄 5967
-蔸 5968
-璎 5969
-屛 5970
-錤 5971
-弢 5972
-绨 5973
-員 5974
-誕 5975
-祜 5976
-勍 5977
-浉 5978
-娑 5979
-呓 5980
-啉 5981
-嗐 5982
-弁 5983
-绺 5984
-撷 5985
-崑 5986
-诌 5987
-標 5988
-甯 5989
-俣 5990
-趔 5991
-趄 5992
-垠 5993
-赟 5994
-馐 5995
-畑 5996
-給 5997
-幣 5998
-產 5999
-恵 6000
-併 6001
-蒹 6002
-葭 6003
-後 6004
-瀍 6005
-愠 6006
-莛 6007
-蝰 6008
-鹮 6009
-逶 6010
-侪 6011
-蒽 6012
-巽 6013
-瓴 6014
-鲱 6015
-薙 6016
-過 6017
-億 6018
-車 6019
-鲇 6020
-淨 6021
-嗎 6022
-诨 6023
-靚 6024
-內 6025
-糁 6026
-錾 6027
-刈 6028
-滯 6029
-炆 6030
-徂 6031
-傩 6032
-鲺 6033
-叟 6034
-埗 6035
-篠 6036
-焐 6037
-暻 6038
-盃 6039
-髻 6040
-樘 6041
-墈 6042
-菉 6043
-巯 6044
-嘌 6045
-遒 6046
-鼋 6047
-匍 6048
-匐 6049
-臜 6050
-馔 6051
-鲎 6052
-獠 6053
-蟇 6054
-栄 6055
-騎 6056
-賽 6057
-場 6058
-幾 6059
-鐘 6060
-镛 6061
-鸶 6062
-镆 6063
-窸 6064
-庠 6065
-蒺 6066
-溱 6067
-倮 6068
-楪 6069
-帀 6070
-躶 6071
-洰 6072
-圉 6073
-圊 6074
-捨 6075
-谝 6076
-呔 6077
-勖 6078
-揖 6079
-喈 6080
-霰 6081
-觋 6082
-嫪 6083
-毐 6084
-繇 6085
-珐 6086
-馃 6087
-孃 6088
-逖 6089
-骶 6090
-喬 6091
-奧 6092
-風 6093
-裵 6094
-胍 6095
-確 6096
-揠 6097
-榀 6098
-聒 6099
-谪 6100
-歘 6101
-粿 6102
-舾 6103
-聩 6104
-嫘 6105
-砟 6106
-侉 6107
-捯 6108
-饬 6109
-囏 6110
-喙 6111
-笥 6112
-燿 6113
-鮀 6114
-芡 6115
-蛄 6116
-铳 6117
-挲 6118
-笞 6119
-廿 6120
-蠹 6121
-湋 6122
-暎 6123
-霙 6124
-颔 6125
-苁 6126
-啮 6127
-囖 6128
-寤 6129
-炟 6130
-乩 6131
-熥 6132
-桡 6133
-阈 6134
-孖 6135
-鐢 6136
-衾 6137
-怍 6138
-沆 6139
-囿 6140
-胬 6141
-陲 6142
-缦 6143
-誇 6144
-醮 6145
-箬 6146
-盥 6147
-鹘 6148
-诳 6149
-氡 6150
-狎 6151
-枧 6152
-谄 6153
-芣 6154
-苢 6155
-俤 6156
-誊 6157
-殄 6158
-辋 6159
-係 6160
-迤 6161
-謦 6162
-簰 6163
-滓 6164
-嬢 6165
-倏 6166
-睺 6167
-滏 6168
-脘 6169
-嗙 6170
-谥 6171
-歃 6172
-锃 6173
-欻 6174
-挼 6175
-襙 6176
-檄 6177
-龇 6178
-楫 6179
-咵 6180
-徭 6181
-闱 6182
-嚅 6183
-鳑 6184
-鲏 6185
-佞 6186
-箜 6187
-篌 6188
-蹑 6189
-喑 6190
-胄 6191
-鞥 6192
-蟥 6193
-骢 6194
-蹙 6195
-柰 6196
-蕤 6197
-癸 6198
-哙 6199
-睚 6200
-绾 6201
-篾 6202
-鳏 6203
-谲 6204
-袤 6205
-翳 6206
-蹰 6207
-槊 6208
-黠 6209
-姒 6210
-锱 6211
-猢 6212
-狲 6213
-粝 6214
-戕 6215
-茕 6216
-瀣 6217
-踽 6218
-绶 6219
-媾 6220
-舢 6221
-螯 6222
-茏 6223
-廪 6224
-诰 6225
-辇 6226
-琚 6227
-汜 6228
-洇 6229
-還 6230
-遽 6231
-槁 6232
-靼 6233
-髡 6234
-鸬 6235
-鹚 6236
-捭 6237
-黩 6238
-俶 6239
-個 6240
-圜 6241
-颞 6242
-苻 6243
-恽 6244
-腧 6245
-甾 6246
-辎 6247
-顼 6248
-阗 6249
-鬻 6250
-鬶 6251
-沔 6252
-狃 6253
-#0 6254
-#1 6255
-#2 6256
diff --git a/egs/aishell/ASR/seamlessm4t/train.py b/egs/aishell/ASR/seamlessm4t/train.py
deleted file mode 100644
index 4802473c9..000000000
--- a/egs/aishell/ASR/seamlessm4t/train.py
+++ /dev/null
@@ -1,1254 +0,0 @@
-#!/usr/bin/env python3
-# Copyright 2023 Xiaomi Corp. (authors: Xiaoyu Yang)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Usage:
-
-./prepare.sh
-
-export CUDA_VISIBLE_DEVICES="0,1,2,3"
-
-./seamlessm4t/train.py \
-  --num-epochs 30 \
-  --start-epoch 1 \
-  --use-fp16 1 \
-  --exp-dir seamlessm4t/exp \
-  --max-duration 550
-"""
-
-
-import argparse
-import copy
-import logging
-import random
-import warnings
-from pathlib import Path
-from shutil import copyfile
-from typing import Any, Dict, Optional, Tuple, Union
-
-import k2
-import optim
-import torch
-import torch.multiprocessing as mp
-import torch.nn as nn
-from typing import List
-#from aishell import AIShell
-#from asr_datamodule import AsrDataModule
-from asr_datamodule import AishellAsrDataModule
-#from decoder import Decoder
-#from joiner import Joiner
-from lhotse import CutSet, load_manifest
-from lhotse.cut import Cut
-from lhotse.dataset.sampling.base import CutSampler
-from lhotse.utils import fix_random_seed
-#from model import Transducer
-from optim import Eden, ScaledAdam
-from torch import Tensor
-from torch.cuda.amp import GradScaler
-from torch.nn.parallel import DistributedDataParallel as DDP
-from torch.nn.functional import pad as pad_tensor
-from torch.utils.tensorboard import SummaryWriter
-#from zipformer import Zipformer
-
-from icefall import diagnostics
-#from icefall.char_graph_compiler import CharCtcTrainingGraphCompiler
-from icefall.checkpoint import load_checkpoint, remove_checkpoints
-from icefall.checkpoint import save_checkpoint as save_checkpoint_impl
-from icefall.checkpoint import (
- save_checkpoint_with_global_batch_idx,
- update_averaged_model,
-)
-from icefall.dist import cleanup_dist, setup_dist, get_world_size, get_rank, get_local_rank
-from icefall.env import get_env_info
-from icefall.hooks import register_inf_check_hooks
-from icefall.lexicon import Lexicon
-from icefall.utils import (
- AttributeDict,
- MetricsTracker,
- filter_uneven_sized_batch,
- setup_logger,
- str2bool,
-)
-
-from seamless_communication.models.unity import (
- UnitTokenizer,
- UnitYModel,
- load_unity_model,
- load_unity_text_tokenizer,
- load_unity_unit_tokenizer,
-)
-from fairseq2.generation import (
- Seq2SeqGenerator,
- SequenceGeneratorOptions,
- SequenceGeneratorOutput,
- SequenceToTextGenerator,
- SequenceToTextOutput,
-)
-from fairseq2.data.text import (
- SentencePieceDecoder,
- SentencePieceEncoder,
- SentencePieceModel,
- TextTokenDecoder,
- TextTokenEncoder,
- TextTokenizer,
- vocabulary_from_sentencepiece,
-)
-
-from label_smoothing import LabelSmoothingLoss
-
-LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
-
-
-def set_batch_count(model: Union[nn.Module, DDP], batch_count: float) -> None:
- if isinstance(model, DDP):
- # get underlying nn.Module
- model = model.module
- for module in model.modules():
- if hasattr(module, "batch_count"):
- module.batch_count = batch_count
-
-
-def add_model_arguments(parser: argparse.ArgumentParser):
- parser.add_argument(
- "--num-encoder-layers",
- type=str,
- default="2,4,3,2,4",
- help="Number of zipformer encoder layers, comma separated.",
- )
-
- parser.add_argument(
- "--feedforward-dims",
- type=str,
- default="1024,1024,2048,2048,1024",
- help="Feedforward dimension of the zipformer encoder layers, comma separated.",
- )
-
- parser.add_argument(
- "--nhead",
- type=str,
- default="8,8,8,8,8",
- help="Number of attention heads in the zipformer encoder layers.",
- )
-
- parser.add_argument(
- "--encoder-dims",
- type=str,
- default="384,384,384,384,384",
- help="Embedding dimension in the 2 blocks of zipformer encoder layers, comma separated",
- )
-
- parser.add_argument(
- "--attention-dims",
- type=str,
- default="192,192,192,192,192",
- help="""Attention dimension in the 2 blocks of zipformer encoder layers, comma separated;
- not the same as embedding dimension.""",
- )
-
- parser.add_argument(
- "--encoder-unmasked-dims",
- type=str,
- default="256,256,256,256,256",
- help="Unmasked dimensions in the encoders, relates to augmentation during training. "
-        "Must be <= each of encoder_dims. Empirically, less than 256 seems to make "
-        "performance worse.",
- )
-
- parser.add_argument(
- "--zipformer-downsampling-factors",
- type=str,
- default="1,2,4,8,2",
- help="Downsampling factor for each stack of encoder layers.",
- )
-
- parser.add_argument(
- "--cnn-module-kernels",
- type=str,
- default="31,31,31,31,31",
- help="Sizes of kernels in convolution modules",
- )
-
- parser.add_argument(
- "--decoder-dim",
- type=int,
- default=512,
- help="Embedding dimension in the decoder model.",
- )
-
- parser.add_argument(
- "--joiner-dim",
- type=int,
- default=512,
- help="""Dimension used in the joiner model.
- Outputs from the encoder and decoder model are projected
- to this dimension before adding.
- """,
- )
-
-
-def get_parser():
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter
- )
-
- parser.add_argument(
- "--master-port",
- type=int,
- default=12354,
- help="Master port to use for DDP training.",
- )
-
- parser.add_argument(
- "--tensorboard",
- type=str2bool,
- default=True,
- help="Should various information be logged in tensorboard.",
- )
-
- parser.add_argument(
- "--num-epochs",
- type=int,
- default=30,
- help="Number of epochs to train.",
- )
-
- parser.add_argument(
- "--start-epoch",
- type=int,
- default=1,
- help="""Resume training from this epoch. It should be positive.
- If larger than 1, it will load checkpoint from
- exp-dir/epoch-{start_epoch-1}.pt
- """,
- )
-
- parser.add_argument(
- "--start-batch",
- type=int,
- default=0,
- help="""If positive, --start-epoch is ignored and
- it loads the checkpoint from exp-dir/checkpoint-{start_batch}.pt
- """,
- )
-
- parser.add_argument(
- "--exp-dir",
- type=str,
- default="pruned_transducer_stateless7/exp",
- help="""The experiment dir.
- It specifies the directory where all training related
- files, e.g., checkpoints, log, etc, are saved
- """,
- )
-
- parser.add_argument(
- "--lang-dir",
- type=str,
- default="data/lang_char",
- help="""The lang dir
- It contains language related input files such as
- "lexicon.txt"
- """,
- )
-
- parser.add_argument(
- "--base-lr", type=float, default=0.05, help="The base learning rate."
- )
-
- parser.add_argument(
- "--lr-batches",
- type=float,
- default=5000,
- help="""Number of steps that affects how rapidly the learning rate
- decreases. We suggest not to change this.""",
- )
-
- parser.add_argument(
- "--lr-epochs",
- type=float,
- default=6,
- help="""Number of epochs that affects how rapidly the learning rate decreases.
- """,
- )
-
- parser.add_argument(
- "--context-size",
- type=int,
- default=1,
- help="The context size in the decoder. 1 means bigram; 2 means tri-gram",
- )
-
- parser.add_argument(
- "--prune-range",
- type=int,
- default=5,
-        help="The prune range for rnnt loss, it means how many symbols (of context) "
-        "we are using to compute the loss",
- )
-
- parser.add_argument(
- "--lm-scale",
- type=float,
- default=0.25,
- help="The scale to smooth the loss with lm "
- "(output of prediction network) part.",
- )
-
- parser.add_argument(
- "--am-scale",
- type=float,
- default=0.0,
- help="The scale to smooth the loss with am (output of encoder network) part.",
- )
-
- parser.add_argument(
- "--simple-loss-scale",
- type=float,
- default=0.5,
-        help="To get pruning ranges, we will calculate a simple version of the "
-        "loss (the joiner is just an addition); this simple loss is also used for "
-        "training (as a regularization term). We will scale the simple loss "
-        "with this parameter before adding it to the final loss.",
- )
-
- parser.add_argument(
- "--seed",
- type=int,
- default=42,
- help="The seed for random generators intended for reproducibility",
- )
-
- parser.add_argument(
- "--print-diagnostics",
- type=str2bool,
- default=False,
- help="Accumulate stats on activations, print them and exit.",
- )
-
- parser.add_argument(
- "--inf-check",
- type=str2bool,
- default=False,
- help="Add hooks to check for infinite module outputs and gradients.",
- )
-
- parser.add_argument(
- "--save-every-n",
- type=int,
- default=4000,
-        help="""Save checkpoint after processing this number of batches
- periodically. We save checkpoint to exp-dir/ whenever
- params.batch_idx_train % save_every_n == 0. The checkpoint filename
- has the form: f'exp-dir/checkpoint-{params.batch_idx_train}.pt'
- Note: It also saves checkpoint to `exp-dir/epoch-xxx.pt` at the
-        end of each epoch where `xxx` is the epoch number counting from 1.
- """,
- )
-
- parser.add_argument(
- "--keep-last-k",
- type=int,
- default=30,
- help="""Only keep this number of checkpoints on disk.
- For instance, if it is 3, there are only 3 checkpoints
- in the exp-dir with filenames `checkpoint-xxx.pt`.
- It does not affect checkpoints with name `epoch-xxx.pt`.
- """,
- )
-
- parser.add_argument(
- "--average-period",
- type=int,
- default=200,
- help="""Update the averaged model, namely `model_avg`, after processing
- this number of batches. `model_avg` is a separate version of model,
- in which each floating-point parameter is the average of all the
- parameters from the start of training. Each time we take the average,
- we do: `model_avg = model * (average_period / batch_idx_train) +
- model_avg * ((batch_idx_train - average_period) / batch_idx_train)`.
- """,
- )
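-    # A minimal numeric illustration of the averaging rule described above
-    # (hypothetical values, not taken from any run): with --average-period 200,
-    # the k-th update happens at batch_idx_train = 200 * k, so the weights are
-    #     model:     200 / (200 * k) = 1 / k
-    #     model_avg: (200 * k - 200) / (200 * k) = (k - 1) / k
-    # i.e. `model_avg` is (approximately) the uniform average of the model
-    # snapshots taken every `average-period` batches since the start of training.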
-
- parser.add_argument(
- "--use-fp16",
- type=str2bool,
- default=False,
- help="Whether to use half precision training.",
- )
-
- add_model_arguments(parser)
-
- return parser
-
-
-def get_params() -> AttributeDict:
- """Return a dict containing training parameters.
-
- All training related parameters that are not passed from the commandline
- are saved in the variable `params`.
-
- Commandline options are merged into `params` after they are parsed, so
- you can also access them via `params`.
-
- Explanation of options saved in `params`:
-
- - best_train_loss: Best training loss so far. It is used to select
- the model that has the lowest training loss. It is
- updated during the training.
-
- - best_valid_loss: Best validation loss so far. It is used to select
- the model that has the lowest validation loss. It is
- updated during the training.
-
- - best_train_epoch: It is the epoch that has the best training loss.
-
- - best_valid_epoch: It is the epoch that has the best validation loss.
-
-    - batch_idx_train: Used for writing statistics to tensorboard. It
-        contains the number of batches trained so far across
-        epochs.
-
-    - log_interval: Print training loss if batch_idx % log_interval is 0
-
- - reset_interval: Reset statistics if batch_idx % reset_interval is 0
-
- - valid_interval: Run validation if batch_idx % valid_interval is 0
-
- - feature_dim: The model input dim. It has to match the one used
- in computing features.
-
- - subsampling_factor: The subsampling factor for the model.
-
- - encoder_dim: Hidden dim for multi-head attention model.
-
- - num_decoder_layers: Number of decoder layer of transformer decoder.
-
- - warm_step: The warmup period that dictates the decay of the
- scale on "simple" (un-pruned) loss.
- """
- params = AttributeDict(
- {
- "frame_shift_ms": 10.0,
- "allowed_excess_duration_ratio": 0.1,
- "best_train_loss": float("inf"),
- "best_valid_loss": float("inf"),
- "best_train_epoch": -1,
- "best_valid_epoch": -1,
- "batch_idx_train": 0,
- "log_interval": 50,
- "reset_interval": 200,
- "valid_interval": 3000, # For the 100h subset, use 800
- # parameters for zipformer
- "feature_dim": 80,
- "subsampling_factor": 4, # not passed in, this is fixed.
- "warm_step": 100,
- "env_info": get_env_info(),
- }
- )
-
- return params
-
-
-# def get_transducer_model(params: AttributeDict) -> nn.Module:
-# encoder = get_encoder_model(params)
-# decoder = get_decoder_model(params)
-# joiner = get_joiner_model(params)
-
-# model = Transducer(
-# encoder=encoder,
-# decoder=decoder,
-# joiner=joiner,
-# encoder_dim=int(params.encoder_dims.split(",")[-1]),
-# decoder_dim=params.decoder_dim,
-# joiner_dim=params.joiner_dim,
-# vocab_size=params.vocab_size,
-# )
-# return model
-
-
-def load_checkpoint_if_available(
- params: AttributeDict,
- model: nn.Module,
- model_avg: nn.Module = None,
- optimizer: Optional[torch.optim.Optimizer] = None,
- scheduler: Optional[LRSchedulerType] = None,
-) -> Optional[Dict[str, Any]]:
- """Load checkpoint from file.
-
- If params.start_batch is positive, it will load the checkpoint from
- `params.exp_dir/checkpoint-{params.start_batch}.pt`. Otherwise, if
- params.start_epoch is larger than 1, it will load the checkpoint from
- `params.start_epoch - 1`.
-
- Apart from loading state dict for `model` and `optimizer` it also updates
- `best_train_epoch`, `best_train_loss`, `best_valid_epoch`,
- and `best_valid_loss` in `params`.
-
- Args:
- params:
- The return value of :func:`get_params`.
- model:
- The training model.
- model_avg:
- The stored model averaged from the start of training.
- optimizer:
- The optimizer that we are using.
- scheduler:
- The scheduler that we are using.
- Returns:
- Return a dict containing previously saved training info.
- """
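-    # For example (hypothetical values): with --start-batch 0 and --start-epoch 5,
-    # training resumes from `exp-dir/epoch-4.pt`; with --start-batch 8000 it would
-    # instead resume from `exp-dir/checkpoint-8000.pt`, regardless of --start-epoch.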
- if params.start_batch > 0:
- filename = params.exp_dir / f"checkpoint-{params.start_batch}.pt"
- elif params.start_epoch > 1:
- filename = params.exp_dir / f"epoch-{params.start_epoch-1}.pt"
- else:
- return None
-
- assert filename.is_file(), f"{filename} does not exist!"
-
- saved_params = load_checkpoint(
- filename,
- model=model,
- model_avg=model_avg,
- optimizer=optimizer,
- scheduler=scheduler,
- )
-
- keys = [
- "best_train_epoch",
- "best_valid_epoch",
- "batch_idx_train",
- "best_train_loss",
- "best_valid_loss",
- ]
- for k in keys:
- params[k] = saved_params[k]
-
- if params.start_batch > 0:
- if "cur_epoch" in saved_params:
- params["start_epoch"] = saved_params["cur_epoch"]
-
- return saved_params
-
-
-def save_checkpoint(
- params: AttributeDict,
- model: Union[nn.Module, DDP],
- model_avg: Optional[nn.Module] = None,
- optimizer: Optional[torch.optim.Optimizer] = None,
- scheduler: Optional[LRSchedulerType] = None,
- sampler: Optional[CutSampler] = None,
- scaler: Optional[GradScaler] = None,
- rank: int = 0,
-) -> None:
- """Save model, optimizer, scheduler and training stats to file.
-
- Args:
- params:
- It is returned by :func:`get_params`.
- model:
- The training model.
- model_avg:
- The stored model averaged from the start of training.
- optimizer:
- The optimizer used in the training.
- sampler:
- The sampler for the training dataset.
- scaler:
-        The scaler used for mixed precision training.
- """
- if rank != 0:
- return
- filename = params.exp_dir / f"epoch-{params.cur_epoch}.pt"
- save_checkpoint_impl(
- filename=filename,
- model=model,
- model_avg=model_avg,
- params=params,
- optimizer=optimizer,
- scheduler=scheduler,
- sampler=sampler,
- scaler=scaler,
- rank=rank,
- )
-
- if params.best_train_epoch == params.cur_epoch:
- best_train_filename = params.exp_dir / "best-train-loss.pt"
- copyfile(src=filename, dst=best_train_filename)
-
- if params.best_valid_epoch == params.cur_epoch:
- best_valid_filename = params.exp_dir / "best-valid-loss.pt"
- copyfile(src=filename, dst=best_valid_filename)
-
-def compute_loss(
- params: AttributeDict,
- model: Union[nn.Module, DDP],
- text_tokenizer_encoder: SentencePieceEncoder,
- batch: dict,
- is_training: bool,
-) -> Tuple[Tensor, MetricsTracker]:
- """
-    Compute the label-smoothed cross-entropy loss of the text decoder, given
-    the model and its inputs.
-
-    Args:
-      params:
-        Parameters for training. See :func:`get_params`.
-      model:
-        The model for training. In our case it is a SeamlessM4T (UnitY)
-        speech-to-text model.
-      text_tokenizer_encoder:
-        The SentencePiece encoder used to convert transcripts into token ids.
-      batch:
-        A batch of data. See `lhotse.dataset.K2SpeechRecognitionDataset()`
-        for the content in it.
-      is_training:
-        True for training. False for validation. When it is True, this
-        function enables autograd during computation; when it is False, it
-        disables autograd.
- """
- # For the uneven-sized batch, the total duration after padding would possibly
- # cause OOM. Hence, for each batch, which is sorted descendingly by length,
- # we simply drop the last few shortest samples, so that the retained total frames
- # (after padding) would not exceed `allowed_max_frames`:
- # `allowed_max_frames = int(max_frames * (1.0 + allowed_excess_duration_ratio))`,
- # where `max_frames = max_duration * 1000 // frame_shift_ms`.
- # We set allowed_excess_duration_ratio=0.1.
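-    # Worked example (hypothetical values, assuming --max-duration 300 and the
-    # default frame_shift_ms=10.0): max_frames = 300 * 1000 // 10 = 30000 frames,
-    # so allowed_max_frames = int(30000 * 1.1) = 33000 frames after padding.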
- if isinstance(model, DDP):
- # get underlying nn.Module
- model = model.module
- def _batch_tensors(tensors: List[Tensor], pad_value: Any) -> Tensor:
- padding_size = max(tensor.shape[0] for tensor in tensors)
- dims = len(tensors[0].shape)
- padded_tensors = []
- for tensor in tensors:
- padding = [0] * 2 * dims
- padding[-1] = padding_size - tensor.shape[0]
- padded_tensors.append(pad_tensor(tensor, padding, "constant", pad_value))
- return torch.stack([tensor for tensor in padded_tensors], dim=0)
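-    # e.g. _batch_tensors([tensor([1, 2, 3]), tensor([4, 5])], pad_value=0)
-    # returns tensor([[1, 2, 3], [4, 5, 0]]) of shape (2, 3): each sequence is
-    # right-padded with `pad_value` to the longest length and then stacked.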
-
- max_frames = params.max_duration * 1000 // params.frame_shift_ms
- allowed_max_frames = int(max_frames * (1.0 + params.allowed_excess_duration_ratio))
- batch = filter_uneven_sized_batch(batch, allowed_max_frames)
-
- device = model.device if isinstance(model, DDP) else next(model.parameters()).device
- feature = batch["inputs"]
- # at entry, feature is (N, T, C)
- assert feature.ndim == 3
- feature = feature.to(device)
-
- supervisions = batch["supervisions"]
- feature_lens = supervisions["num_frames"].to(device)
-
- batch_idx_train = params.batch_idx_train
- warm_step = params.warm_step
-
- texts = batch["supervisions"]["text"]
- text_tokens_list = [text_tokenizer_encoder(text) for text in texts]
- prev_outputs_tokens = _batch_tensors(
- [tokens[:-1] for tokens in text_tokens_list], pad_value=params.pad_idx
- )
- target_tokens = _batch_tensors(
- [tokens[1:] for tokens in text_tokens_list], pad_value=params.pad_idx
- )
- target_lengths = torch.LongTensor(
- [tokens.shape[0] - 1 for tokens in text_tokens_list]
- )
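-    # Standard teacher forcing: if an utterance encodes to tokens [t0, t1, ..., tN],
-    # the decoder is fed [t0, ..., t(N-1)] and trained to predict [t1, ..., tN],
-    # with shorter sequences padded to the batch maximum using `pad_idx`.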
- decoder_criterion = LabelSmoothingLoss(ignore_index=params.pad_idx, label_smoothing=0.1, reduction="sum")
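-    # Roughly speaking, the criterion above keeps probability mass 1 - 0.1 = 0.9
-    # on the reference token and spreads the remaining 0.1 over the rest of the
-    # vocabulary; positions whose target equals `pad_idx` do not contribute.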
-    ignore_prefix_size = 1  # ignore the lang code prediction
-
- with torch.set_grad_enabled(is_training):
- speech_encoder_out, speech_encoder_padding_mask = model.encode_speech(
- seqs=feature,
- seq_lens=feature_lens,
- )
- #assert batch.speech_to_text.prev_output_tokens is not None
- text_decoder_out, text_decoder_padding_mask = model.decode(
- seqs=prev_outputs_tokens.to(device),
- seq_lens=target_lengths.to(device),
- encoder_output=speech_encoder_out,
- encoder_padding_mask=speech_encoder_padding_mask,
- )
- text_logits = model.final_proj(text_decoder_out)
- text_logits = text_logits[:, ignore_prefix_size:, :]
- target_tokens = target_tokens[:, ignore_prefix_size:]
- loss = decoder_criterion(text_logits, target_tokens.to(device))
-
- assert loss.requires_grad == is_training
-
- info = MetricsTracker()
- with warnings.catch_warnings():
- warnings.simplefilter("ignore")
- info["frames"] = (feature_lens // params.subsampling_factor).sum().item()
-
- # Note: We use reduction=sum while computing the loss.
- info["loss"] = loss.detach().cpu().item()
-
- return loss, info
-
-
-def compute_validation_loss(
- params: AttributeDict,
- model: Union[nn.Module, DDP],
- text_tokenizer_encoder: SentencePieceEncoder,
- valid_dl: torch.utils.data.DataLoader,
- world_size: int = 1,
-) -> MetricsTracker:
- """Run the validation process."""
- model.eval()
-
- tot_loss = MetricsTracker()
-
- for batch_idx, batch in enumerate(valid_dl):
- loss, loss_info = compute_loss(
- params=params,
- model=model,
- text_tokenizer_encoder=text_tokenizer_encoder,
- batch=batch,
- is_training=False,
- )
- assert loss.requires_grad is False
- tot_loss = tot_loss + loss_info
-
- if world_size > 1:
- tot_loss.reduce(loss.device)
-
- loss_value = tot_loss["loss"] / tot_loss["frames"]
- if loss_value < params.best_valid_loss:
- params.best_valid_epoch = params.cur_epoch
- params.best_valid_loss = loss_value
-
- return tot_loss
-
-
-def train_one_epoch(
- params: AttributeDict,
- model: Union[nn.Module, DDP],
- optimizer: torch.optim.Optimizer,
- scheduler: LRSchedulerType,
- text_tokenizer_encoder: SentencePieceEncoder,
- train_dl: torch.utils.data.DataLoader,
- valid_dl: torch.utils.data.DataLoader,
- scaler: GradScaler,
- model_avg: Optional[nn.Module] = None,
- tb_writer: Optional[SummaryWriter] = None,
- world_size: int = 1,
- rank: int = 0,
-) -> None:
- """Train the model for one epoch.
-
- The training loss from the mean of all frames is saved in
- `params.train_loss`. It runs the validation process every
- `params.valid_interval` batches.
-
- Args:
- params:
- It is returned by :func:`get_params`.
- model:
- The model for training.
- optimizer:
- The optimizer we are using.
- scheduler:
- The learning rate scheduler, we call step() every step.
- train_dl:
- Dataloader for the training dataset.
- valid_dl:
- Dataloader for the validation dataset.
- scaler:
-        The scaler used for mixed precision training.
- model_avg:
- The stored model averaged from the start of training.
- tb_writer:
- Writer to write log messages to tensorboard.
- world_size:
- Number of nodes in DDP training. If it is 1, DDP is disabled.
- rank:
- The rank of the node in DDP training. If no DDP is used, it should
- be set to 0.
- """
- model.train()
-
- tot_loss = MetricsTracker()
-
- for batch_idx, batch in enumerate(train_dl):
- params.batch_idx_train += 1
- batch_size = len(batch["supervisions"]["text"])
-
- try:
- with torch.cuda.amp.autocast(enabled=params.use_fp16):
- loss, loss_info = compute_loss(
- params=params,
- model=model,
- text_tokenizer_encoder=text_tokenizer_encoder,
- batch=batch,
- is_training=True,
- )
- # summary stats
- tot_loss = (tot_loss * (1 - 1 / params.reset_interval)) + loss_info
-
- # NOTE: We use reduction==sum and loss is computed over utterances
- # in the batch and there is no normalization to it so far.
- scaler.scale(loss).backward()
- set_batch_count(model, params.batch_idx_train)
- scheduler.step_batch(params.batch_idx_train)
-
- scaler.step(optimizer)
- scaler.update()
- optimizer.zero_grad()
- except: # noqa
- display_and_save_batch(batch, params=params)
- raise
-
- if params.print_diagnostics and batch_idx == 5:
- return
-
- if (
- rank == 0
- and params.batch_idx_train > 0
- and params.batch_idx_train % params.average_period == 0
- ):
- update_averaged_model(
- params=params,
- model_cur=model,
- model_avg=model_avg,
- )
-
- if (
- params.batch_idx_train > 0
- and params.batch_idx_train % params.save_every_n == 0
- ):
- save_checkpoint_with_global_batch_idx(
- out_dir=params.exp_dir,
- global_batch_idx=params.batch_idx_train,
- model=model,
- model_avg=model_avg,
- params=params,
- optimizer=optimizer,
- scheduler=scheduler,
- sampler=train_dl.sampler,
- scaler=scaler,
- rank=rank,
- )
- remove_checkpoints(
- out_dir=params.exp_dir,
- topk=params.keep_last_k,
- rank=rank,
- )
-
- if batch_idx % 100 == 0 and params.use_fp16:
- # If the grad scale was less than 1, try increasing it. The _growth_interval
- # of the grad scaler is configurable, but we can't configure it to have different
- # behavior depending on the current grad scale.
- cur_grad_scale = scaler._scale.item()
- if cur_grad_scale < 1.0 or (cur_grad_scale < 8.0 and batch_idx % 400 == 0):
- scaler.update(cur_grad_scale * 2.0)
- if cur_grad_scale < 0.01:
- logging.warning(f"Grad scale is small: {cur_grad_scale}")
- if cur_grad_scale < 1.0e-05:
- raise RuntimeError(
- f"grad_scale is too small, exiting: {cur_grad_scale}"
- )
- if batch_idx % params.log_interval == 0:
- cur_lr = scheduler.get_last_lr()[0]
- cur_grad_scale = scaler._scale.item() if params.use_fp16 else 1.0
-
- logging.info(
- f"Epoch {params.cur_epoch}, "
- f"batch {batch_idx}, loss[{loss_info}], "
- f"tot_loss[{tot_loss}], batch size: {batch_size}, "
- f"lr: {cur_lr:.2e}, "
- + (f"grad_scale: {scaler._scale.item()}" if params.use_fp16 else "")
- )
-
- if tb_writer is not None:
- tb_writer.add_scalar(
- "train/learning_rate", cur_lr, params.batch_idx_train
- )
-
- loss_info.write_summary(
- tb_writer, "train/current_", params.batch_idx_train
- )
- tot_loss.write_summary(tb_writer, "train/tot_", params.batch_idx_train)
- if params.use_fp16:
- tb_writer.add_scalar(
- "train/grad_scale",
- cur_grad_scale,
- params.batch_idx_train,
- )
-
- if batch_idx % params.valid_interval == 0 and not params.print_diagnostics:
- logging.info("Computing validation loss")
- valid_info = compute_validation_loss(
- params=params,
- model=model,
- text_tokenizer_encoder=text_tokenizer_encoder,
- valid_dl=valid_dl,
- world_size=world_size,
- )
- model.train()
- logging.info(f"Epoch {params.cur_epoch}, validation: {valid_info}")
- logging.info(
- f"Maximum memory allocated so far is {torch.cuda.max_memory_allocated()//1000000}MB"
- )
- if tb_writer is not None:
- valid_info.write_summary(
- tb_writer, "train/valid_", params.batch_idx_train
- )
-
- loss_value = tot_loss["loss"] / tot_loss["frames"]
- params.train_loss = loss_value
- if params.train_loss < params.best_train_loss:
- params.best_train_epoch = params.cur_epoch
- params.best_train_loss = params.train_loss
-
-
-def run(rank, world_size, args):
- """
- Args:
- rank:
- It is a value between 0 and `world_size-1`, which is
- read from the environment (e.g., when launched with torchrun)
- in :func:`main`. The process with rank 0 is responsible for
- saving checkpoints.
- world_size:
- Number of GPUs for DDP training.
- args:
- The return value of get_parser().parse_args()
- """
- params = get_params()
- params.update(vars(args))
-
- fix_random_seed(params.seed)
- # rank = get_rank()
- # world_size = get_world_size()
- # setup_dist(rank, world_size, use_ddp_launch=True)
- setup_dist(use_ddp_launch=True)
-
- setup_logger(f"{params.exp_dir}/log/log-train")
- logging.info("Training started")
-
- if args.tensorboard and rank == 0:
- tb_writer = SummaryWriter(log_dir=f"{params.exp_dir}/tensorboard")
- else:
- tb_writer = None
-
- device = torch.device("cpu")
- if torch.cuda.is_available():
- device = torch.device("cuda", rank)
- logging.info(f"Device: {device}")
-
- logging.info("About to create model")
- model_name_or_card = "seamlessM4T_medium"
- lang = "cmn"
- model = load_unity_model(model_name_or_card, device="cpu", dtype=torch.float32)
- del model.t2u_model
- del model.text_encoder
- del model.text_encoder_frontend
- # print(vars(model))
- # exit(0)
- text_tokenizer = load_unity_text_tokenizer(model_name_or_card)
- text_tokenizer_encoder = SentencePieceEncoder(
- text_tokenizer.model,
- prefix_tokens=["", f"__{lang}__"],
- suffix_tokens=[""],
- )
- #params.eos_idx = text_tokenizer.model.eos_idx
- params.pad_idx = text_tokenizer.model.pad_idx
- logging.info(params)
-
- num_param = sum([p.numel() for p in model.parameters()])
- logging.info(f"Number of model parameters: {num_param}")
-
- assert params.save_every_n >= params.average_period
- model_avg: Optional[nn.Module] = None
- if rank == 0:
- # model_avg is only used with rank 0
- model_avg = copy.deepcopy(model).to(torch.float64)
-
- assert params.start_epoch > 0, params.start_epoch
- checkpoints = load_checkpoint_if_available(
- params=params, model=model, model_avg=model_avg
- )
-
- model.to(device)
- if world_size > 1:
- logging.info("Using DDP")
- model = DDP(model, device_ids=[rank], find_unused_parameters=True)
-
- #parameters_names = []
- #parameters_names.append(
- # [name_param_pair[0] for name_param_pair in model.named_parameters()]
- #)
- # optimizer = ScaledAdam(
- # model.parameters(),
- # lr=params.base_lr,
- # clipping_scale=2.0,
- # parameters_names=parameters_names,
- # )
- optimizer = ScaledAdam(
- model.parameters(),
- lr=params.base_lr,
- clipping_scale=2.0,
- )
- scheduler = Eden(optimizer, params.lr_batches, params.lr_epochs)
-
- if checkpoints and "optimizer" in checkpoints:
- logging.info("Loading optimizer state dict")
- optimizer.load_state_dict(checkpoints["optimizer"])
-
- if (
- checkpoints
- and "scheduler" in checkpoints
- and checkpoints["scheduler"] is not None
- ):
- logging.info("Loading scheduler state dict")
- scheduler.load_state_dict(checkpoints["scheduler"])
-
- if params.print_diagnostics:
- opts = diagnostics.TensorDiagnosticOptions(
- 2**22
- ) # allow 4 megabytes per sub-module
- diagnostic = diagnostics.attach_diagnostics(model, opts)
-
- if params.inf_check:
- register_inf_check_hooks(model)
-
- def remove_short_and_long_utt(c: Cut):
- # Keep only utterances with duration between 1 second and 12 seconds
- # (the check below uses 12.0 as the upper bound).
- #
- # You should use ../local/display_manifest_statistics.py to get
- # an utterance duration distribution for your dataset to select
- # the threshold.
- if c.duration < 1.0 or c.duration > 12.0:
- logging.warning(
- f"Exclude cut with ID {c.id} from training. Duration: {c.duration}"
- )
- return False
-
- # In pruned RNN-T, we require that T >= S
- # where T is the number of feature frames after subsampling
- # and S is the number of tokens in the utterance
-
- # In ./zipformer.py, the conv module uses the following expression
- # for subsampling
- # T = ((c.num_frames - 7) // 2 + 1) // 2
- # tokens = sp.encode(c.supervisions[0].text, out_type=str)
-
- # if T < len(tokens):
- # logging.warning(
- # f"Exclude cut with ID {c.id} from training. "
- # f"Number of frames (before subsampling): {c.num_frames}. "
- # f"Number of frames (after subsampling): {T}. "
- # f"Text: {c.supervisions[0].text}. "
- # f"Tokens: {tokens}. "
- # f"Number of tokens: {len(tokens)}"
- # )
- # return False
-
- return True
-
- #aishell = AIShell(manifest_dir=args.manifest_dir)
- #train_cuts = aishell.train_cuts()
- #asr_datamodule = AishellAsrDataModule(args)
-
- aishell = AishellAsrDataModule(args)
- # train_cuts = asr_datamodule.train_cuts()
- # train_cuts = train_cuts.filter(remove_short_and_long_utt)
-
- # if args.enable_musan:
- # cuts_musan = load_manifest(Path(args.manifest_dir) / "musan_cuts.jsonl.gz")
- # else:
- # cuts_musan = None
-
- if params.start_batch > 0 and checkpoints and "sampler" in checkpoints:
- # We only load the sampler's state dict when it loads a checkpoint
- # saved in the middle of an epoch
- sampler_state_dict = checkpoints["sampler"]
- else:
- sampler_state_dict = None
-
- # train_dl = asr_datamodule.train_dataloaders(
- # train_cuts,
- # on_the_fly_feats=False,
- # cuts_musan=cuts_musan,
- # sampler_state_dict=sampler_state_dict,
- # )
-
- # valid_cuts = aishell.valid_cuts()
- # valid_dl = asr_datamodule.valid_dataloaders(valid_cuts)
- train_dl = aishell.train_dataloaders(aishell.train_cuts())
- valid_dl = aishell.valid_dataloaders(aishell.valid_cuts())
- # if not params.print_diagnostics:
- # scan_pessimistic_batches_for_oom(
- # model=model,
- # train_dl=train_dl,
- # optimizer=optimizer,
- # graph_compiler=graph_compiler,
- # params=params,
- # )
-
- scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0)
- if checkpoints and "grad_scaler" in checkpoints:
- logging.info("Loading grad scaler state dict")
- scaler.load_state_dict(checkpoints["grad_scaler"])
-
- logging.info(f"start training from epoch {params.start_epoch}")
- for epoch in range(params.start_epoch, params.num_epochs + 1):
- scheduler.step_epoch(epoch - 1)
- fix_random_seed(params.seed + epoch - 1)
- train_dl.sampler.set_epoch(epoch - 1)
-
- if tb_writer is not None:
- tb_writer.add_scalar("train/epoch", epoch, params.batch_idx_train)
-
- params.cur_epoch = epoch
-
- train_one_epoch(
- params=params,
- model=model,
- model_avg=model_avg,
- optimizer=optimizer,
- scheduler=scheduler,
- text_tokenizer_encoder=text_tokenizer_encoder,
- train_dl=train_dl,
- valid_dl=valid_dl,
- scaler=scaler,
- tb_writer=tb_writer,
- world_size=world_size,
- rank=rank,
- )
-
- if params.print_diagnostics:
- diagnostic.print_diagnostics()
- break
-
- save_checkpoint(
- params=params,
- model=model,
- model_avg=model_avg,
- optimizer=optimizer,
- scheduler=scheduler,
- sampler=train_dl.sampler,
- scaler=scaler,
- rank=rank,
- )
-
- logging.info("Done!")
-
- if world_size > 1:
- torch.distributed.barrier()
- cleanup_dist()
-
-
-def display_and_save_batch(
- batch: dict,
- params: AttributeDict,
-) -> None:
- """Display the batch statistics and save the batch into disk.
-
- Args:
- batch:
- A batch of data. See `lhotse.dataset.K2SpeechRecognitionDataset()`
- for the content in it.
- params:
- Parameters for training. See :func:`get_params`.
- """
- from lhotse.utils import uuid4
-
- filename = f"{params.exp_dir}/batch-{uuid4()}.pt"
- logging.info(f"Saving batch to {filename}")
- torch.save(batch, filename)
-
- supervisions = batch["supervisions"]
- features = batch["inputs"]
-
- logging.info(f"features shape: {features.shape}")
-
- # y = graph_compiler.texts_to_ids(supervisions["text"])
- # num_tokens = sum(len(i) for i in y)
- # logging.info(f"num tokens: {num_tokens}")
-
-
-def scan_pessimistic_batches_for_oom(
- model: Union[nn.Module, DDP],
- train_dl: torch.utils.data.DataLoader,
- optimizer: torch.optim.Optimizer,
- params: AttributeDict,
- text_tokenizer_encoder: SentencePieceEncoder,
-):
- from lhotse.dataset import find_pessimistic_batches
-
- logging.info(
- "Sanity check -- see if any of the batches in epoch 1 would cause OOM."
- )
- batches, crit_values = find_pessimistic_batches(train_dl.sampler)
- for criterion, cuts in batches.items():
- batch = train_dl.dataset[cuts]
- try:
- with torch.cuda.amp.autocast(enabled=params.use_fp16):
- loss, _ = compute_loss(
- params=params,
- model=model,
- text_tokenizer_encoder=text_tokenizer_encoder,
- batch=batch,
- is_training=True,
- )
- loss.backward()
- optimizer.zero_grad()
- except Exception as e:
- if "CUDA out of memory" in str(e):
- logging.error(
- "Your GPU ran out of memory with the current "
- "max_duration setting. We recommend decreasing "
- "max_duration and trying again.\n"
- f"Failing criterion: {criterion} "
- f"(={crit_values[criterion]}) ..."
- )
- display_and_save_batch(batch, params=params)
- raise
- logging.info(
- f"Maximum memory allocated so far is {torch.cuda.max_memory_allocated()//1000000}MB"
- )
-
-
-def main():
- parser = get_parser()
- AishellAsrDataModule.add_arguments(parser)
- args = parser.parse_args()
- args.exp_dir = Path(args.exp_dir)
-
- world_size = get_world_size()
- rank = get_rank()
- assert world_size >= 1
-
- run(rank=rank, world_size=world_size, args=args)
-
-
-torch.set_num_threads(1)
-torch.set_num_interop_threads(1)
-
-if __name__ == "__main__":
- main()
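
The deleted training loop above nudges the AMP gradient scale by hand when it stays small (the `batch_idx % 100 == 0` block). Below is a minimal, self-contained sketch of that watchdog for reuse elsewhere; the helper name `nudge_grad_scale` is made up here, and, like the script, it reads GradScaler's private `_scale` attribute, so treat it as illustrative rather than a supported API.

import logging

from torch.cuda.amp import GradScaler


def nudge_grad_scale(scaler: GradScaler, batch_idx: int) -> None:
    """Double a stubbornly small grad scale and abort if it has collapsed."""
    scale_t = getattr(scaler, "_scale", None)  # private attribute, set lazily
    if batch_idx % 100 != 0 or scale_t is None:
        return
    cur = scale_t.item()
    # GradScaler only grows the scale after `growth_interval` clean steps, so a
    # scale stuck below 1.0 (or below 8.0, checked every 400 batches) is nudged
    # upward manually, mirroring the training loop above.
    if cur < 1.0 or (cur < 8.0 and batch_idx % 400 == 0):
        scaler.update(cur * 2.0)
    if cur < 0.01:
        logging.warning(f"Grad scale is small: {cur}")
    if cur < 1.0e-05:
        raise RuntimeError(f"grad_scale is too small, exiting: {cur}")
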
diff --git a/egs/aishell/ASR/seamlessm4t/train2.py b/egs/aishell/ASR/seamlessm4t/train2.py
deleted file mode 100644
index 9d5cf4ab9..000000000
--- a/egs/aishell/ASR/seamlessm4t/train2.py
+++ /dev/null
@@ -1,1277 +0,0 @@
-#!/usr/bin/env python3
-# Copyright 2023 Xiaomi Corp. (authors: Xiaoyu Yang)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Usage:
-
-./prepare.sh
-
-export CUDA_VISIBLE_DEVICES="0,1,2,3"
-
-torchrun --nproc-per-node 4 seamlessm4t/train2.py \
- --num-epochs 30 \
- --start-epoch 1 \
- --use-fp16 1 \
- --exp-dir seamlessm4t/exp \
- --max-duration 550
-"""
-
-
-import argparse
-import copy
-import logging
-import random
-import warnings
-from pathlib import Path
-from shutil import copyfile
-from typing import Any, Dict, Optional, Tuple, Union
-
-import k2
-import optim
-import torch
-import torch.multiprocessing as mp
-import torch.nn as nn
-from typing import List
-#from aishell import AIShell
-#from asr_datamodule import AsrDataModule
-from asr_datamodule import AishellAsrDataModule
-#from decoder import Decoder
-#from joiner import Joiner
-from lhotse import CutSet, load_manifest
-from lhotse.cut import Cut
-from lhotse.dataset.sampling.base import CutSampler
-from lhotse.utils import fix_random_seed
-#from model import Transducer
-from optim import Eden, ScaledAdam
-from torch import Tensor
-from torch.cuda.amp import GradScaler
-from torch.nn.parallel import DistributedDataParallel as DDP
-from torch.nn.functional import pad as pad_tensor
-from torch.utils.tensorboard import SummaryWriter
-#from zipformer import Zipformer
-
-from icefall import diagnostics
-#from icefall.char_graph_compiler import CharCtcTrainingGraphCompiler
-from icefall.checkpoint import load_checkpoint, remove_checkpoints
-from icefall.checkpoint import save_checkpoint as save_checkpoint_impl
-from icefall.checkpoint import (
- save_checkpoint_with_global_batch_idx,
- update_averaged_model,
-)
-from icefall.dist import cleanup_dist, setup_dist, get_world_size, get_rank, get_local_rank
-from icefall.env import get_env_info
-from icefall.hooks import register_inf_check_hooks
-from icefall.lexicon import Lexicon
-from icefall.utils import (
- AttributeDict,
- MetricsTracker,
- filter_uneven_sized_batch,
- setup_logger,
- str2bool,
-)
-
-from seamless_communication.models.unity import (
- UnitTokenizer,
- UnitYModel,
- load_unity_model,
- load_unity_text_tokenizer,
- load_unity_unit_tokenizer,
-)
-from fairseq2.generation import (
- Seq2SeqGenerator,
- SequenceGeneratorOptions,
- SequenceGeneratorOutput,
- SequenceToTextGenerator,
- SequenceToTextOutput,
-)
-from fairseq2.data.text import (
- SentencePieceDecoder,
- SentencePieceEncoder,
- SentencePieceModel,
- TextTokenDecoder,
- TextTokenEncoder,
- TextTokenizer,
- vocabulary_from_sentencepiece,
-)
-from tokenizer import CharTokenizer
-from label_smoothing import LabelSmoothingLoss
-from fairseq2.nn.embedding import Embedding
-from fairseq2.nn.projection import TiedProjection
-
-LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
-
-
-def set_batch_count(model: Union[nn.Module, DDP], batch_count: float) -> None:
- if isinstance(model, DDP):
- # get underlying nn.Module
- model = model.module
- for module in model.modules():
- if hasattr(module, "batch_count"):
- module.batch_count = batch_count
-
-
-def add_model_arguments(parser: argparse.ArgumentParser):
- parser.add_argument(
- "--num-encoder-layers",
- type=str,
- default="2,4,3,2,4",
- help="Number of zipformer encoder layers, comma separated.",
- )
-
- parser.add_argument(
- "--feedforward-dims",
- type=str,
- default="1024,1024,2048,2048,1024",
- help="Feedforward dimension of the zipformer encoder layers, comma separated.",
- )
-
- parser.add_argument(
- "--nhead",
- type=str,
- default="8,8,8,8,8",
- help="Number of attention heads in the zipformer encoder layers.",
- )
-
- parser.add_argument(
- "--encoder-dims",
- type=str,
- default="384,384,384,384,384",
- help="Embedding dimension in the 2 blocks of zipformer encoder layers, comma separated",
- )
-
- parser.add_argument(
- "--attention-dims",
- type=str,
- default="192,192,192,192,192",
- help="""Attention dimension in the 2 blocks of zipformer encoder layers, comma separated;
- not the same as embedding dimension.""",
- )
-
- parser.add_argument(
- "--encoder-unmasked-dims",
- type=str,
- default="256,256,256,256,256",
- help="Unmasked dimensions in the encoders, relates to augmentation during training. "
- "Must be <= each of encoder_dims. Empirically, less than 256 seems to make performance "
- " worse.",
- )
-
- parser.add_argument(
- "--zipformer-downsampling-factors",
- type=str,
- default="1,2,4,8,2",
- help="Downsampling factor for each stack of encoder layers.",
- )
-
- parser.add_argument(
- "--cnn-module-kernels",
- type=str,
- default="31,31,31,31,31",
- help="Sizes of kernels in convolution modules",
- )
-
- parser.add_argument(
- "--decoder-dim",
- type=int,
- default=512,
- help="Embedding dimension in the decoder model.",
- )
-
- parser.add_argument(
- "--joiner-dim",
- type=int,
- default=512,
- help="""Dimension used in the joiner model.
- Outputs from the encoder and decoder model are projected
- to this dimension before adding.
- """,
- )
-
-
-def get_parser():
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter
- )
-
- parser.add_argument(
- "--master-port",
- type=int,
- default=12354,
- help="Master port to use for DDP training.",
- )
-
- parser.add_argument(
- "--tensorboard",
- type=str2bool,
- default=True,
- help="Should various information be logged in tensorboard.",
- )
-
- parser.add_argument(
- "--num-epochs",
- type=int,
- default=30,
- help="Number of epochs to train.",
- )
-
- parser.add_argument(
- "--start-epoch",
- type=int,
- default=1,
- help="""Resume training from this epoch. It should be positive.
- If larger than 1, it will load checkpoint from
- exp-dir/epoch-{start_epoch-1}.pt
- """,
- )
-
- parser.add_argument(
- "--start-batch",
- type=int,
- default=0,
- help="""If positive, --start-epoch is ignored and
- it loads the checkpoint from exp-dir/checkpoint-{start_batch}.pt
- """,
- )
-
- parser.add_argument(
- "--exp-dir",
- type=str,
- default="pruned_transducer_stateless7/exp",
- help="""The experiment dir.
- It specifies the directory where all training related
- files, e.g., checkpoints, log, etc, are saved
- """,
- )
-
- parser.add_argument(
- "--lang-dir",
- type=str,
- default="data/lang_char",
- help="""The lang dir
- It contains language related input files such as
- "lexicon.txt"
- """,
- )
-
- parser.add_argument(
- "--base-lr", type=float, default=0.05, help="The base learning rate."
- )
-
- parser.add_argument(
- "--lr-batches",
- type=float,
- default=5000,
- help="""Number of steps that affects how rapidly the learning rate
- decreases. We suggest not to change this.""",
- )
-
- parser.add_argument(
- "--lr-epochs",
- type=float,
- default=6,
- help="""Number of epochs that affects how rapidly the learning rate decreases.
- """,
- )
-
- parser.add_argument(
- "--context-size",
- type=int,
- default=1,
- help="The context size in the decoder. 1 means bigram; 2 means tri-gram",
- )
-
- parser.add_argument(
- "--prune-range",
- type=int,
- default=5,
- help="The prune range for rnnt loss, it means how many symbols(context)"
- "we are using to compute the loss",
- )
-
- parser.add_argument(
- "--lm-scale",
- type=float,
- default=0.25,
- help="The scale to smooth the loss with lm "
- "(output of prediction network) part.",
- )
-
- parser.add_argument(
- "--am-scale",
- type=float,
- default=0.0,
- help="The scale to smooth the loss with am (output of encoder network) part.",
- )
-
- parser.add_argument(
- "--simple-loss-scale",
- type=float,
- default=0.5,
- help="To get pruning ranges, we will calculate a simple version"
- "loss(joiner is just addition), this simple loss also uses for"
- "training (as a regularization item). We will scale the simple loss"
- "with this parameter before adding to the final loss.",
- )
-
- parser.add_argument(
- "--seed",
- type=int,
- default=42,
- help="The seed for random generators intended for reproducibility",
- )
-
- parser.add_argument(
- "--print-diagnostics",
- type=str2bool,
- default=False,
- help="Accumulate stats on activations, print them and exit.",
- )
-
- parser.add_argument(
- "--inf-check",
- type=str2bool,
- default=False,
- help="Add hooks to check for infinite module outputs and gradients.",
- )
-
- parser.add_argument(
- "--save-every-n",
- type=int,
- default=4000,
- help="""Save checkpoint after processing this number of batches"
- periodically. We save checkpoint to exp-dir/ whenever
- params.batch_idx_train % save_every_n == 0. The checkpoint filename
- has the form: f'exp-dir/checkpoint-{params.batch_idx_train}.pt'
- Note: It also saves checkpoint to `exp-dir/epoch-xxx.pt` at the
- end of each epoch where `xxx` is the epoch number counting from 0.
- """,
- )
-
- parser.add_argument(
- "--keep-last-k",
- type=int,
- default=30,
- help="""Only keep this number of checkpoints on disk.
- For instance, if it is 3, there are only 3 checkpoints
- in the exp-dir with filenames `checkpoint-xxx.pt`.
- It does not affect checkpoints with name `epoch-xxx.pt`.
- """,
- )
-
- parser.add_argument(
- "--average-period",
- type=int,
- default=200,
- help="""Update the averaged model, namely `model_avg`, after processing
- this number of batches. `model_avg` is a separate version of model,
- in which each floating-point parameter is the average of all the
- parameters from the start of training. Each time we take the average,
- we do: `model_avg = model * (average_period / batch_idx_train) +
- model_avg * ((batch_idx_train - average_period) / batch_idx_train)`.
- """,
- )
-
- parser.add_argument(
- "--use-fp16",
- type=str2bool,
- default=False,
- help="Whether to use half precision training.",
- )
-
- add_model_arguments(parser)
-
- return parser
-
-
-def get_params() -> AttributeDict:
- """Return a dict containing training parameters.
-
- All training related parameters that are not passed from the commandline
- are saved in the variable `params`.
-
- Commandline options are merged into `params` after they are parsed, so
- you can also access them via `params`.
-
- Explanation of options saved in `params`:
-
- - best_train_loss: Best training loss so far. It is used to select
- the model that has the lowest training loss. It is
- updated during the training.
-
- - best_valid_loss: Best validation loss so far. It is used to select
- the model that has the lowest validation loss. It is
- updated during the training.
-
- - best_train_epoch: It is the epoch that has the best training loss.
-
- - best_valid_epoch: It is the epoch that has the best validation loss.
-
- - batch_idx_train: Used to write statistics to tensorboard. It
- contains the number of batches trained so far across
- epochs.
-
- - log_interval: Print training loss if batch_idx % log_interval is 0
-
- - reset_interval: Reset statistics if batch_idx % reset_interval is 0
-
- - valid_interval: Run validation if batch_idx % valid_interval is 0
-
- - feature_dim: The model input dim. It has to match the one used
- in computing features.
-
- - subsampling_factor: The subsampling factor for the model.
-
- - encoder_dim: Hidden dim for multi-head attention model.
-
- - num_decoder_layers: Number of decoder layer of transformer decoder.
-
- - warm_step: The warmup period that dictates the decay of the
- scale on "simple" (un-pruned) loss.
- """
- params = AttributeDict(
- {
- "frame_shift_ms": 10.0,
- "allowed_excess_duration_ratio": 0.1,
- "best_train_loss": float("inf"),
- "best_valid_loss": float("inf"),
- "best_train_epoch": -1,
- "best_valid_epoch": -1,
- "batch_idx_train": 0,
- "log_interval": 50,
- "reset_interval": 200,
- "valid_interval": 3000, # For the 100h subset, use 800
- # parameters for zipformer
- "feature_dim": 80,
- "subsampling_factor": 4, # not passed in, this is fixed.
- "warm_step": 100,
- "env_info": get_env_info(),
- }
- )
-
- return params
-
-
-# def get_transducer_model(params: AttributeDict) -> nn.Module:
-# encoder = get_encoder_model(params)
-# decoder = get_decoder_model(params)
-# joiner = get_joiner_model(params)
-
-# model = Transducer(
-# encoder=encoder,
-# decoder=decoder,
-# joiner=joiner,
-# encoder_dim=int(params.encoder_dims.split(",")[-1]),
-# decoder_dim=params.decoder_dim,
-# joiner_dim=params.joiner_dim,
-# vocab_size=params.vocab_size,
-# )
-# return model
-
-
-def load_checkpoint_if_available(
- params: AttributeDict,
- model: nn.Module,
- model_avg: nn.Module = None,
- optimizer: Optional[torch.optim.Optimizer] = None,
- scheduler: Optional[LRSchedulerType] = None,
-) -> Optional[Dict[str, Any]]:
- """Load checkpoint from file.
-
- If params.start_batch is positive, it will load the checkpoint from
- `params.exp_dir/checkpoint-{params.start_batch}.pt`. Otherwise, if
- params.start_epoch is larger than 1, it will load the checkpoint from
- `params.start_epoch - 1`.
-
- Apart from loading state dict for `model` and `optimizer` it also updates
- `best_train_epoch`, `best_train_loss`, `best_valid_epoch`,
- and `best_valid_loss` in `params`.
-
- Args:
- params:
- The return value of :func:`get_params`.
- model:
- The training model.
- model_avg:
- The stored model averaged from the start of training.
- optimizer:
- The optimizer that we are using.
- scheduler:
- The scheduler that we are using.
- Returns:
- Return a dict containing previously saved training info.
- """
- if params.start_batch > 0:
- filename = params.exp_dir / f"checkpoint-{params.start_batch}.pt"
- elif params.start_epoch > 1:
- filename = params.exp_dir / f"epoch-{params.start_epoch-1}.pt"
- else:
- return None
-
- assert filename.is_file(), f"{filename} does not exist!"
-
- saved_params = load_checkpoint(
- filename,
- model=model,
- model_avg=model_avg,
- optimizer=optimizer,
- scheduler=scheduler,
- )
-
- keys = [
- "best_train_epoch",
- "best_valid_epoch",
- "batch_idx_train",
- "best_train_loss",
- "best_valid_loss",
- ]
- for k in keys:
- params[k] = saved_params[k]
-
- if params.start_batch > 0:
- if "cur_epoch" in saved_params:
- params["start_epoch"] = saved_params["cur_epoch"]
-
- return saved_params
-
-
-def save_checkpoint(
- params: AttributeDict,
- model: Union[nn.Module, DDP],
- model_avg: Optional[nn.Module] = None,
- optimizer: Optional[torch.optim.Optimizer] = None,
- scheduler: Optional[LRSchedulerType] = None,
- sampler: Optional[CutSampler] = None,
- scaler: Optional[GradScaler] = None,
- rank: int = 0,
-) -> None:
- """Save model, optimizer, scheduler and training stats to file.
-
- Args:
- params:
- It is returned by :func:`get_params`.
- model:
- The training model.
- model_avg:
- The stored model averaged from the start of training.
- optimizer:
- The optimizer used in the training.
- sampler:
- The sampler for the training dataset.
- scaler:
- The scaler used for mixed precision training.
- """
- if rank != 0:
- return
- filename = params.exp_dir / f"epoch-{params.cur_epoch}.pt"
- save_checkpoint_impl(
- filename=filename,
- model=model,
- model_avg=model_avg,
- params=params,
- optimizer=optimizer,
- scheduler=scheduler,
- sampler=sampler,
- scaler=scaler,
- rank=rank,
- )
-
- if params.best_train_epoch == params.cur_epoch:
- best_train_filename = params.exp_dir / "best-train-loss.pt"
- copyfile(src=filename, dst=best_train_filename)
-
- if params.best_valid_epoch == params.cur_epoch:
- best_valid_filename = params.exp_dir / "best-valid-loss.pt"
- copyfile(src=filename, dst=best_valid_filename)
-
-def compute_loss(
- params: AttributeDict,
- model: Union[nn.Module, DDP],
- text_tokenizer_encoder: CharTokenizer,
- batch: dict,
- is_training: bool,
-) -> Tuple[Tensor, MetricsTracker]:
- """
- Compute the label-smoothed cross-entropy loss of the text decoder
- given the model and its inputs.
-
- Args:
- params:
- Parameters for training. See :func:`get_params`.
- model:
- The model for training. In this script it is a seamlessM4T UnitY
- model (speech encoder + text decoder).
- batch:
- A batch of data. See `lhotse.dataset.K2SpeechRecognitionDataset()`
- for the content in it.
- is_training:
- True for training. False for validation. When it is True, this
- function enables autograd during computation; when it is False, it
- disables autograd.
- text_tokenizer_encoder:
- The tokenizer used to convert transcripts into token IDs.
- """
- # For the uneven-sized batch, the total duration after padding would possibly
- # cause OOM. Hence, for each batch, which is sorted descendingly by length,
- # we simply drop the last few shortest samples, so that the retained total frames
- # (after padding) would not exceed `allowed_max_frames`:
- # `allowed_max_frames = int(max_frames * (1.0 + allowed_excess_duration_ratio))`,
- # where `max_frames = max_duration * 1000 // frame_shift_ms`.
- # We set allowed_excess_duration_ratio=0.1.
- if isinstance(model, DDP):
- # get underlying nn.Module
- model = model.module
- def _batch_tensors(tensors: List[Tensor], pad_value: Any) -> Tensor:
- padding_size = max(tensor.shape[0] for tensor in tensors)
- dims = len(tensors[0].shape)
- padded_tensors = []
- for tensor in tensors:
- padding = [0] * 2 * dims
- padding[-1] = padding_size - tensor.shape[0]
- padded_tensors.append(pad_tensor(tensor, padding, "constant", pad_value))
- return torch.stack([tensor for tensor in padded_tensors], dim=0)
-
- max_frames = params.max_duration * 1000 // params.frame_shift_ms
- allowed_max_frames = int(max_frames * (1.0 + params.allowed_excess_duration_ratio))
- batch = filter_uneven_sized_batch(batch, allowed_max_frames)
-
- device = model.device if isinstance(model, DDP) else next(model.parameters()).device
- feature = batch["inputs"]
- # at entry, feature is (N, T, C)
- assert feature.ndim == 3
- feature = feature.to(device)
-
- supervisions = batch["supervisions"]
- feature_lens = supervisions["num_frames"].to(device)
-
- batch_idx_train = params.batch_idx_train
- warm_step = params.warm_step
-
- texts = batch["supervisions"]["text"]
- # remove spaces in the text
- texts = [text.replace(" ", "") for text in texts]
- text_tokens_list = [torch.tensor([params.eos_idx] + text_tokenizer_encoder.encode(text) + [params.eos_idx]) for text in texts]
- prev_outputs_tokens = _batch_tensors(
- [tokens[:-1] for tokens in text_tokens_list], pad_value=params.pad_idx
- )
- target_tokens = _batch_tensors(
- [tokens[1:] for tokens in text_tokens_list], pad_value=params.pad_idx
- )
- target_lengths = torch.LongTensor(
- [tokens.shape[0] - 1 for tokens in text_tokens_list]
- )
- decoder_criterion = LabelSmoothingLoss(ignore_index=params.pad_idx, label_smoothing=0.1, reduction="sum")
- ignore_prefix_size = 1 # ignore the prediction at the first (prefix) position
-
- with torch.set_grad_enabled(is_training):
- speech_encoder_out, speech_encoder_padding_mask = model.encode_speech(
- seqs=feature,
- seq_lens=feature_lens,
- )
- #assert batch.speech_to_text.prev_output_tokens is not None
- text_decoder_out, text_decoder_padding_mask = model.decode(
- seqs=prev_outputs_tokens.to(device),
- seq_lens=target_lengths.to(device),
- encoder_output=speech_encoder_out,
- encoder_padding_mask=speech_encoder_padding_mask,
- )
- text_logits = model.final_proj(text_decoder_out)
- text_logits = text_logits[:, ignore_prefix_size:, :]
- target_tokens = target_tokens[:, ignore_prefix_size:]
- loss = decoder_criterion(text_logits, target_tokens.to(device))
-
- assert loss.requires_grad == is_training
-
- info = MetricsTracker()
- with warnings.catch_warnings():
- warnings.simplefilter("ignore")
- info["frames"] = (feature_lens // params.subsampling_factor).sum().item()
-
- # Note: We use reduction=sum while computing the loss.
- info["loss"] = loss.detach().cpu().item()
-
- return loss, info
-
-
-def compute_validation_loss(
- params: AttributeDict,
- model: Union[nn.Module, DDP],
- text_tokenizer_encoder: CharTokenizer,
- valid_dl: torch.utils.data.DataLoader,
- world_size: int = 1,
-) -> MetricsTracker:
- """Run the validation process."""
- model.eval()
-
- tot_loss = MetricsTracker()
-
- for batch_idx, batch in enumerate(valid_dl):
- loss, loss_info = compute_loss(
- params=params,
- model=model,
- text_tokenizer_encoder=text_tokenizer_encoder,
- batch=batch,
- is_training=False,
- )
- assert loss.requires_grad is False
- tot_loss = tot_loss + loss_info
-
- if world_size > 1:
- tot_loss.reduce(loss.device)
-
- loss_value = tot_loss["loss"] / tot_loss["frames"]
- if loss_value < params.best_valid_loss:
- params.best_valid_epoch = params.cur_epoch
- params.best_valid_loss = loss_value
-
- return tot_loss
-
-
-def train_one_epoch(
- params: AttributeDict,
- model: Union[nn.Module, DDP],
- optimizer: torch.optim.Optimizer,
- scheduler: LRSchedulerType,
- text_tokenizer_encoder: CharTokenizer,
- train_dl: torch.utils.data.DataLoader,
- valid_dl: torch.utils.data.DataLoader,
- scaler: GradScaler,
- model_avg: Optional[nn.Module] = None,
- tb_writer: Optional[SummaryWriter] = None,
- world_size: int = 1,
- rank: int = 0,
-) -> None:
- """Train the model for one epoch.
-
- The training loss from the mean of all frames is saved in
- `params.train_loss`. It runs the validation process every
- `params.valid_interval` batches.
-
- Args:
- params:
- It is returned by :func:`get_params`.
- model:
- The model for training.
- optimizer:
- The optimizer we are using.
- scheduler:
- The learning rate scheduler; we call step_batch() after every batch.
- train_dl:
- Dataloader for the training dataset.
- valid_dl:
- Dataloader for the validation dataset.
- scaler:
- The scaler used for mixed precision training.
- model_avg:
- The stored model averaged from the start of training.
- tb_writer:
- Writer to write log messages to tensorboard.
- world_size:
- Number of processes participating in DDP training. If it is 1, DDP is disabled.
- rank:
- The rank of this process in DDP training. If no DDP is used, it should
- be set to 0.
- """
- model.train()
-
- tot_loss = MetricsTracker()
-
- for batch_idx, batch in enumerate(train_dl):
- params.batch_idx_train += 1
- batch_size = len(batch["supervisions"]["text"])
-
- try:
- with torch.cuda.amp.autocast(enabled=params.use_fp16):
- loss, loss_info = compute_loss(
- params=params,
- model=model,
- text_tokenizer_encoder=text_tokenizer_encoder,
- batch=batch,
- is_training=True,
- )
- # summary stats
- tot_loss = (tot_loss * (1 - 1 / params.reset_interval)) + loss_info
-
- # NOTE: We use reduction==sum and loss is computed over utterances
- # in the batch and there is no normalization to it so far.
- scaler.scale(loss).backward()
- set_batch_count(model, params.batch_idx_train)
- scheduler.step_batch(params.batch_idx_train)
-
- scaler.step(optimizer)
- scaler.update()
- optimizer.zero_grad()
- except: # noqa
- display_and_save_batch(batch, params=params)
- raise
-
- if params.print_diagnostics and batch_idx == 5:
- return
-
- if (
- rank == 0
- and params.batch_idx_train > 0
- and params.batch_idx_train % params.average_period == 0
- ):
- update_averaged_model(
- params=params,
- model_cur=model,
- model_avg=model_avg,
- )
-
- if (
- params.batch_idx_train > 0
- and params.batch_idx_train % params.save_every_n == 0
- ):
- save_checkpoint_with_global_batch_idx(
- out_dir=params.exp_dir,
- global_batch_idx=params.batch_idx_train,
- model=model,
- model_avg=model_avg,
- params=params,
- optimizer=optimizer,
- scheduler=scheduler,
- sampler=train_dl.sampler,
- scaler=scaler,
- rank=rank,
- )
- remove_checkpoints(
- out_dir=params.exp_dir,
- topk=params.keep_last_k,
- rank=rank,
- )
-
- if batch_idx % 100 == 0 and params.use_fp16:
- # If the grad scale was less than 1, try increasing it. The _growth_interval
- # of the grad scaler is configurable, but we can't configure it to have different
- # behavior depending on the current grad scale.
- cur_grad_scale = scaler._scale.item()
- if cur_grad_scale < 1.0 or (cur_grad_scale < 8.0 and batch_idx % 400 == 0):
- scaler.update(cur_grad_scale * 2.0)
- if cur_grad_scale < 0.01:
- logging.warning(f"Grad scale is small: {cur_grad_scale}")
- if cur_grad_scale < 1.0e-05:
- raise RuntimeError(
- f"grad_scale is too small, exiting: {cur_grad_scale}"
- )
- if batch_idx % params.log_interval == 0:
- cur_lr = scheduler.get_last_lr()[0]
- cur_grad_scale = scaler._scale.item() if params.use_fp16 else 1.0
-
- logging.info(
- f"Epoch {params.cur_epoch}, "
- f"batch {batch_idx}, loss[{loss_info}], "
- f"tot_loss[{tot_loss}], batch size: {batch_size}, "
- f"lr: {cur_lr:.2e}, "
- + (f"grad_scale: {scaler._scale.item()}" if params.use_fp16 else "")
- )
-
- if tb_writer is not None:
- tb_writer.add_scalar(
- "train/learning_rate", cur_lr, params.batch_idx_train
- )
-
- loss_info.write_summary(
- tb_writer, "train/current_", params.batch_idx_train
- )
- tot_loss.write_summary(tb_writer, "train/tot_", params.batch_idx_train)
- if params.use_fp16:
- tb_writer.add_scalar(
- "train/grad_scale",
- cur_grad_scale,
- params.batch_idx_train,
- )
-
- if batch_idx % params.valid_interval == 0 and not params.print_diagnostics:
- logging.info("Computing validation loss")
- valid_info = compute_validation_loss(
- params=params,
- model=model,
- text_tokenizer_encoder=text_tokenizer_encoder,
- valid_dl=valid_dl,
- world_size=world_size,
- )
- model.train()
- logging.info(f"Epoch {params.cur_epoch}, validation: {valid_info}")
- logging.info(
- f"Maximum memory allocated so far is {torch.cuda.max_memory_allocated()//1000000}MB"
- )
- if tb_writer is not None:
- valid_info.write_summary(
- tb_writer, "train/valid_", params.batch_idx_train
- )
-
- loss_value = tot_loss["loss"] / tot_loss["frames"]
- params.train_loss = loss_value
- if params.train_loss < params.best_train_loss:
- params.best_train_epoch = params.cur_epoch
- params.best_train_loss = params.train_loss
-
-def run(rank, world_size, args):
- """
- Args:
- rank:
- It is a value between 0 and `world_size-1`, which is
- read from the environment (e.g., when launched with torchrun)
- in :func:`main`. The process with rank 0 is responsible for
- saving checkpoints.
- world_size:
- Number of GPUs for DDP training.
- args:
- The return value of get_parser().parse_args()
- """
- params = get_params()
- params.update(vars(args))
-
- fix_random_seed(params.seed)
- # rank = get_rank()
- # world_size = get_world_size()
- # setup_dist(rank, world_size, use_ddp_launch=True)
- setup_dist(use_ddp_launch=True)
-
- setup_logger(f"{params.exp_dir}/log/log-train")
- logging.info("Training started")
-
- if args.tensorboard and rank == 0:
- tb_writer = SummaryWriter(log_dir=f"{params.exp_dir}/tensorboard")
- else:
- tb_writer = None
-
- device = torch.device("cpu")
- if torch.cuda.is_available():
- device = torch.device("cuda", rank)
- logging.info(f"Device: {device}")
-
- logging.info("About to create model")
- model_name_or_card = "seamlessM4T_medium"
- tokenizer_file = "./seamlessm4t/tokens.txt"
- lang = "cmn"
-
- # text_tokenizer = load_unity_text_tokenizer(model_name_or_card)
- # text_tokenizer_encoder = SentencePieceEncoder(
- # text_tokenizer.model,
- # prefix_tokens=["", f"__{lang}__"],
- # suffix_tokens=[""],
- # )
- # #params.eos_idx = text_tokenizer.model.eos_idx
- # params.pad_idx = text_tokenizer.model.pad_idx
- text_tokenizer_encoder = CharTokenizer(tokenizer_file)
- params.pad_idx, params.eos_idx = 0, 1
- logging.info(params)
-
- model = load_unity_model(model_name_or_card, device="cpu", dtype=torch.float32)
- del model.t2u_model
- del model.text_encoder
- del model.text_encoder_frontend
- model.text_decoder_frontend.embed = nn.Embedding(num_embeddings=text_tokenizer_encoder.vocab_size, embedding_dim=1024, padding_idx=0)
- #model.text_decoder_frontend.embed = Embedding(num_embeddings=text_tokenizer_encoder.vocab_size, embedding_dim=1024 ,pad_idx=0, scaled=True)
- #model.final_proj = TiedProjection(input_dim=1024, output_dim=text_tokenizer_encoder.vocab_size)
- model.final_proj = nn.Linear(1024, text_tokenizer_encoder.vocab_size, bias=False)
- # Optionally freeze everything except the new embedding and output
- # projection by uncommenting the requires_grad line below.
- for name, param in model.named_parameters():
- if name != 'text_decoder_frontend.embed.weight' and name != 'final_proj.weight':
- # param.requires_grad = False
- pass
- num_trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
- logging.info(f"Number of trainable model parameters: {num_trainable}")
-
- num_param = sum([p.numel() for p in model.parameters()])
- logging.info(f"Number of model parameters: {num_param}")
-
- assert params.save_every_n >= params.average_period
- model_avg: Optional[nn.Module] = None
- if rank == 0:
- # model_avg is only used with rank 0
- model_avg = copy.deepcopy(model).to(torch.float64)
-
- assert params.start_epoch > 0, params.start_epoch
- checkpoints = load_checkpoint_if_available(
- params=params, model=model, model_avg=model_avg
- )
-
- model.to(device)
- if world_size > 1:
- logging.info("Using DDP")
- model = DDP(model, device_ids=[rank], find_unused_parameters=True)
-
- #parameters_names = []
- #parameters_names.append(
- # [name_param_pair[0] for name_param_pair in model.named_parameters()]
- #)
- # optimizer = ScaledAdam(
- # model.parameters(),
- # lr=params.base_lr,
- # clipping_scale=2.0,
- # parameters_names=parameters_names,
- # )
- optimizer = ScaledAdam(
- model.parameters(),
- lr=params.base_lr,
- clipping_scale=2.0,
- )
- scheduler = Eden(optimizer, params.lr_batches, params.lr_epochs)
-
- if checkpoints and "optimizer" in checkpoints:
- logging.info("Loading optimizer state dict")
- optimizer.load_state_dict(checkpoints["optimizer"])
-
- if (
- checkpoints
- and "scheduler" in checkpoints
- and checkpoints["scheduler"] is not None
- ):
- logging.info("Loading scheduler state dict")
- scheduler.load_state_dict(checkpoints["scheduler"])
-
- if params.print_diagnostics:
- opts = diagnostics.TensorDiagnosticOptions(
- 2**22
- ) # allow 4 megabytes per sub-module
- diagnostic = diagnostics.attach_diagnostics(model, opts)
-
- if params.inf_check:
- register_inf_check_hooks(model)
-
- def remove_short_and_long_utt(c: Cut):
- # Keep only utterances with duration between 1 second and 12 seconds
- # (the check below uses 12.0 as the upper bound).
- #
- # You should use ../local/display_manifest_statistics.py to get
- # an utterance duration distribution for your dataset to select
- # the threshold.
- if c.duration < 1.0 or c.duration > 12.0:
- logging.warning(
- f"Exclude cut with ID {c.id} from training. Duration: {c.duration}"
- )
- return False
-
- # In pruned RNN-T, we require that T >= S
- # where T is the number of feature frames after subsampling
- # and S is the number of tokens in the utterance
-
- # In ./zipformer.py, the conv module uses the following expression
- # for subsampling
- # T = ((c.num_frames - 7) // 2 + 1) // 2
- # tokens = sp.encode(c.supervisions[0].text, out_type=str)
-
- # if T < len(tokens):
- # logging.warning(
- # f"Exclude cut with ID {c.id} from training. "
- # f"Number of frames (before subsampling): {c.num_frames}. "
- # f"Number of frames (after subsampling): {T}. "
- # f"Text: {c.supervisions[0].text}. "
- # f"Tokens: {tokens}. "
- # f"Number of tokens: {len(tokens)}"
- # )
- # return False
-
- return True
-
- #aishell = AIShell(manifest_dir=args.manifest_dir)
- #train_cuts = aishell.train_cuts()
- #asr_datamodule = AishellAsrDataModule(args)
-
- aishell = AishellAsrDataModule(args)
- # train_cuts = asr_datamodule.train_cuts()
- # train_cuts = train_cuts.filter(remove_short_and_long_utt)
-
- # if args.enable_musan:
- # cuts_musan = load_manifest(Path(args.manifest_dir) / "musan_cuts.jsonl.gz")
- # else:
- # cuts_musan = None
-
- if params.start_batch > 0 and checkpoints and "sampler" in checkpoints:
- # We only load the sampler's state dict when it loads a checkpoint
- # saved in the middle of an epoch
- sampler_state_dict = checkpoints["sampler"]
- else:
- sampler_state_dict = None
-
- # train_dl = asr_datamodule.train_dataloaders(
- # train_cuts,
- # on_the_fly_feats=False,
- # cuts_musan=cuts_musan,
- # sampler_state_dict=sampler_state_dict,
- # )
-
- # valid_cuts = aishell.valid_cuts()
- # valid_dl = asr_datamodule.valid_dataloaders(valid_cuts)
- train_dl = aishell.train_dataloaders(aishell.train_cuts())
- valid_dl = aishell.valid_dataloaders(aishell.valid_cuts())
- # if not params.print_diagnostics:
- # scan_pessimistic_batches_for_oom(
- # model=model,
- # train_dl=train_dl,
- # optimizer=optimizer,
- # graph_compiler=graph_compiler,
- # params=params,
- # )
-
- scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0)
- if checkpoints and "grad_scaler" in checkpoints:
- logging.info("Loading grad scaler state dict")
- scaler.load_state_dict(checkpoints["grad_scaler"])
-
- logging.info(f"start training from epoch {params.start_epoch}")
- for epoch in range(params.start_epoch, params.num_epochs + 1):
- scheduler.step_epoch(epoch - 1)
- fix_random_seed(params.seed + epoch - 1)
- train_dl.sampler.set_epoch(epoch - 1)
-
- if tb_writer is not None:
- tb_writer.add_scalar("train/epoch", epoch, params.batch_idx_train)
-
- params.cur_epoch = epoch
-
- train_one_epoch(
- params=params,
- model=model,
- model_avg=model_avg,
- optimizer=optimizer,
- scheduler=scheduler,
- text_tokenizer_encoder=text_tokenizer_encoder,
- train_dl=train_dl,
- valid_dl=valid_dl,
- scaler=scaler,
- tb_writer=tb_writer,
- world_size=world_size,
- rank=rank,
- )
-
- if params.print_diagnostics:
- diagnostic.print_diagnostics()
- break
-
- save_checkpoint(
- params=params,
- model=model,
- model_avg=model_avg,
- optimizer=optimizer,
- scheduler=scheduler,
- sampler=train_dl.sampler,
- scaler=scaler,
- rank=rank,
- )
-
- logging.info("Done!")
-
- if world_size > 1:
- torch.distributed.barrier()
- cleanup_dist()
-
-
-def display_and_save_batch(
- batch: dict,
- params: AttributeDict,
-) -> None:
- """Display the batch statistics and save the batch into disk.
-
- Args:
- batch:
- A batch of data. See `lhotse.dataset.K2SpeechRecognitionDataset()`
- for the content in it.
- params:
- Parameters for training. See :func:`get_params`.
- """
- from lhotse.utils import uuid4
-
- filename = f"{params.exp_dir}/batch-{uuid4()}.pt"
- logging.info(f"Saving batch to {filename}")
- torch.save(batch, filename)
-
- supervisions = batch["supervisions"]
- features = batch["inputs"]
-
- logging.info(f"features shape: {features.shape}")
-
- # y = graph_compiler.texts_to_ids(supervisions["text"])
- # num_tokens = sum(len(i) for i in y)
- # logging.info(f"num tokens: {num_tokens}")
-
-
-def scan_pessimistic_batches_for_oom(
- model: Union[nn.Module, DDP],
- train_dl: torch.utils.data.DataLoader,
- optimizer: torch.optim.Optimizer,
- params: AttributeDict,
- text_tokenizer_encoder: CharTokenizer,
-):
- from lhotse.dataset import find_pessimistic_batches
-
- logging.info(
- "Sanity check -- see if any of the batches in epoch 1 would cause OOM."
- )
- batches, crit_values = find_pessimistic_batches(train_dl.sampler)
- for criterion, cuts in batches.items():
- batch = train_dl.dataset[cuts]
- try:
- with torch.cuda.amp.autocast(enabled=params.use_fp16):
- loss, _ = compute_loss(
- params=params,
- model=model,
- text_tokenizer_encoder=text_tokenizer_encoder,
- batch=batch,
- is_training=True,
- )
- loss.backward()
- optimizer.zero_grad()
- except Exception as e:
- if "CUDA out of memory" in str(e):
- logging.error(
- "Your GPU ran out of memory with the current "
- "max_duration setting. We recommend decreasing "
- "max_duration and trying again.\n"
- f"Failing criterion: {criterion} "
- f"(={crit_values[criterion]}) ..."
- )
- display_and_save_batch(batch, params=params)
- raise
- logging.info(
- f"Maximum memory allocated so far is {torch.cuda.max_memory_allocated()//1000000}MB"
- )
-
-
-def main():
- parser = get_parser()
- AishellAsrDataModule.add_arguments(parser)
- args = parser.parse_args()
- args.exp_dir = Path(args.exp_dir)
-
- world_size = get_world_size()
- rank = get_rank()
- assert world_size >= 1
-
- run(rank=rank, world_size=world_size, args=args)
-
-
-torch.set_num_threads(1)
-torch.set_num_interop_threads(1)
-
-if __name__ == "__main__":
- main()
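
compute_loss() in the deleted scripts builds decoder inputs by wrapping each utterance in EOS markers, shifting by one position for teacher forcing, right-padding to the batch maximum, and applying a label-smoothed cross entropy that skips the first (prefix) position. A small self-contained sketch of that recipe follows; torch.nn.CrossEntropyLoss with label_smoothing stands in for icefall's LabelSmoothingLoss, and the helper names and token IDs are made up for illustration.

from typing import List

import torch
import torch.nn.functional as F

PAD_IDX, EOS_IDX = 0, 1  # matches params.pad_idx, params.eos_idx in train2.py


def pad_and_stack(tensors: List[torch.Tensor], pad_value: int) -> torch.Tensor:
    """Right-pad 1-D token tensors to a common length and stack them."""
    max_len = max(t.shape[0] for t in tensors)
    return torch.stack(
        [F.pad(t, (0, max_len - t.shape[0]), value=pad_value) for t in tensors]
    )


def make_decoder_targets(token_ids: List[List[int]]):
    """Wrap each utterance in EOS markers and shift by one for teacher forcing."""
    seqs = [torch.tensor([EOS_IDX] + ids + [EOS_IDX]) for ids in token_ids]
    prev_tokens = pad_and_stack([s[:-1] for s in seqs], PAD_IDX)  # decoder input
    targets = pad_and_stack([s[1:] for s in seqs], PAD_IDX)       # prediction target
    lengths = torch.tensor([s.shape[0] - 1 for s in seqs])
    return prev_tokens, targets, lengths


if __name__ == "__main__":
    prev_tokens, targets, lengths = make_decoder_targets([[5, 7, 9], [4, 6]])
    vocab_size = 10
    logits = torch.randn(targets.shape[0], targets.shape[1], vocab_size)
    criterion = torch.nn.CrossEntropyLoss(
        ignore_index=PAD_IDX, label_smoothing=0.1, reduction="sum"
    )
    # Skip the first position, mirroring ignore_prefix_size = 1 above.
    loss = criterion(logits[:, 1:, :].transpose(1, 2), targets[:, 1:])
    print(loss.item(), lengths.tolist())
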
diff --git a/egs/aishell/ASR/tdnn_lstm_ctc/asr_datamodule.py b/egs/aishell/ASR/tdnn_lstm_ctc/asr_datamodule.py
index df21a9508..efb32336a 100644
--- a/egs/aishell/ASR/tdnn_lstm_ctc/asr_datamodule.py
+++ b/egs/aishell/ASR/tdnn_lstm_ctc/asr_datamodule.py
@@ -30,7 +30,7 @@ from lhotse.dataset import (
DynamicBucketingSampler,
K2SpeechRecognitionDataset,
PrecomputedFeatures,
- SimpleCutSampler,
+ SingleCutSampler,
SpecAugment,
)
from lhotse.dataset.input_strategies import OnTheFlyFeatures
@@ -176,13 +176,13 @@ class AishellAsrDataModule:
group.add_argument(
"--enable-musan",
type=str2bool,
- default=False,
+ default=True,
help="When enabled, select noise from MUSAN and mix it"
"with training dataset. ",
)
def train_dataloaders(
- self, cuts_train: CutSet, sampler_state_dict: Optional[Dict[str, Any]] = None, rank = None, world_size = None
+ self, cuts_train: CutSet, sampler_state_dict: Optional[Dict[str, Any]] = None
) -> DataLoader:
"""
Args:
@@ -192,13 +192,13 @@ class AishellAsrDataModule:
The state dict for the training sampler.
"""
logging.info("About to get Musan cuts")
+ cuts_musan = load_manifest(self.args.manifest_dir / "musan_cuts.jsonl.gz")
transforms = []
if self.args.enable_musan:
logging.info("Enable MUSAN")
- cuts_musan = load_manifest(self.args.manifest_dir / "musan_cuts.jsonl.gz")
transforms.append(
- CutMix(cuts=cuts_musan, p=0.5, snr=(10, 20), preserve_id=True)
+ CutMix(cuts=cuts_musan, prob=0.5, snr=(10, 20), preserve_id=True)
)
else:
logging.info("Disable MUSAN")
@@ -276,12 +276,10 @@ class AishellAsrDataModule:
shuffle=self.args.shuffle,
num_buckets=self.args.num_buckets,
drop_last=self.args.drop_last,
- world_size=world_size,
- rank=rank,
)
else:
- logging.info("Using SimpleCutSampler.")
- train_sampler = SimpleCutSampler(
+ logging.info("Using SingleCutSampler.")
+ train_sampler = SingleCutSampler(
cuts_train,
max_duration=self.args.max_duration,
shuffle=self.args.shuffle,
@@ -302,7 +300,7 @@ class AishellAsrDataModule:
return train_dl
- def valid_dataloaders(self, cuts_valid: CutSet, rank = None, world_size = None) -> DataLoader:
+ def valid_dataloaders(self, cuts_valid: CutSet) -> DataLoader:
transforms = []
if self.args.concatenate_cuts:
transforms = [
@@ -327,8 +325,6 @@ class AishellAsrDataModule:
cuts_valid,
max_duration=self.args.max_duration,
shuffle=False,
- rank=rank,
- world_size=world_size,
)
logging.info("About to create dev dataloader")
valid_dl = DataLoader(
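
The hunk above pins asr_datamodule.py to the older lhotse API (SingleCutSampler and CutMix(..., prob=...)) instead of the newer names (SimpleCutSampler, p=...). If supporting both lhotse generations is preferable to pinning, a small compatibility shim along the following lines could be used; this is a sketch under the assumption that the installed lhotse exposes one of the two names, not part of the patch, and `CutSamplerCls`/`make_cutmix` are illustrative names.

import inspect

from lhotse.dataset import CutMix

try:
    # newer lhotse: the sampler is called SimpleCutSampler
    from lhotse.dataset import SimpleCutSampler as CutSamplerCls
except ImportError:
    # older lhotse: it is called SingleCutSampler
    from lhotse.dataset import SingleCutSampler as CutSamplerCls
# CutSamplerCls can then be used wherever Simple/SingleCutSampler was used.


def make_cutmix(cuts_musan, snr=(10, 20)):
    """Build CutMix with whichever probability kwarg this lhotse version expects."""
    kwargs = {"cuts": cuts_musan, "snr": snr, "preserve_id": True}
    if "p" in inspect.signature(CutMix.__init__).parameters:
        kwargs["p"] = 0.5
    else:
        kwargs["prob"] = 0.5
    return CutMix(**kwargs)
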
diff --git a/egs/aishell/ASR/whisper/decode.py b/egs/aishell/ASR/whisper/decode.py
index 34dae7a85..371350905 100644
--- a/egs/aishell/ASR/whisper/decode.py
+++ b/egs/aishell/ASR/whisper/decode.py
@@ -473,10 +473,11 @@ def main():
aishell = AishellAsrDataModule(args)
test_cuts = aishell.test_cuts()
test_dl = aishell.test_dataloaders(test_cuts)
-
- test_sets = ["test"]
- test_dls = [test_dl]
-
+ valid_dl = aishell.valid_dataloaders(aishell.valid_cuts())
+ #test_sets = ["test"]
+ #test_dls = [test_dl]
+ test_sets = ["valid"]
+ test_dls = [valid_dl]
for test_set, test_dl in zip(test_sets, test_dls):
results_dict = decode_dataset(
dl=test_dl,
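
The change above swaps the decoded partition from the test set to the dev set by editing the two lists in place. If both partitions should be decoded in one run, the lists can simply be extended; the sketch below shows the loop shape with placeholder dataloaders and a stub decode function, since the real `aishell` datamodule and `decode_dataset` live in whisper/decode.py.

def decode_dataset_stub(dl, partition):
    # Placeholder for decode_dataset(...) in whisper/decode.py.
    return {partition: f"decoded {len(dl)} batches"}


valid_dl = [0, 1, 2]  # placeholder for aishell.valid_dataloaders(aishell.valid_cuts())
test_dl = [0, 1]      # placeholder for aishell.test_dataloaders(test_cuts)

test_sets = ["valid", "test"]
test_dls = [valid_dl, test_dl]
for test_set, dl in zip(test_sets, test_dls):
    results_dict = decode_dataset_stub(dl, test_set)
    print(test_set, results_dict)
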
diff --git a/egs/aishell/ASR/whisper/ds_config_zero1.json b/egs/aishell/ASR/whisper/ds_config_zero1.json
index cd8cbac8e..b95b1cee4 100644
--- a/egs/aishell/ASR/whisper/ds_config_zero1.json
+++ b/egs/aishell/ASR/whisper/ds_config_zero1.json
@@ -27,7 +27,7 @@
"params": {
"warmup_min_lr": 0,
"warmup_max_lr": 1e-5,
- "warmup_num_steps": 1000
+ "warmup_num_steps": 100
}
},
"gradient_accumulation_steps": 1,
diff --git a/egs/aishell/ASR/whisper/train.py b/egs/aishell/ASR/whisper/train.py
index 932242ddb..6c76d3cff 100644
--- a/egs/aishell/ASR/whisper/train.py
+++ b/egs/aishell/ASR/whisper/train.py
@@ -126,7 +126,7 @@ def get_parser():
parser.add_argument(
"--num-epochs",
type=int,
- default=5,
+ default=10,
help="Number of epochs to train.",
)