remove seamless for next PR

Yuekai Zhang 2024-01-15 19:34:03 +08:00
parent ac53222054
commit e883bb60d4
20 changed files with 15 additions and 11738 deletions

View File

@@ -1,9 +0,0 @@
#export CUDA_VISIBLE_DEVICES="2,3"
#pip install -r seamlessm4t/requirements.txt
#pip install k2==1.24.3.dev20230524+cuda11.8.torch2.0.1 -f https://k2-fsa.github.io/k2/cuda.html
export PYTHONPATH=$PYTHONPATH:/lustre/fsw/sa/yuekaiz/asr/icefall
export PYTHONPATH=$PYTHONPATH:/lustre/fsw/sa/yuekaiz/asr/seamless_communication/src
export TORCH_HOME=/lustre/fsw/sa/yuekaiz/asr/hub
python3 seamlessm4t/decode.py --epoch 5 --exp-dir seamlessm4t/exp
python3 seamlessm4t/decode.py --epoch 5 --avg 2 --exp-dir seamlessm4t/exp

View File

@@ -1,8 +0,0 @@
#export CUDA_VISIBLE_DEVICES="1"
#pip install -r whisper/requirements.txt
#pip install k2==1.24.3.dev20230524+cuda11.8.torch2.0.1 -f https://k2-fsa.github.io/k2/cuda.html
export PYTHONPATH=$PYTHONPATH:/lustre/fsw/sa/yuekaiz/asr/icefall
#export PYTHONPATH=$PYTHONPATH:/mnt/samsung-t7/yuekai/asr/icefall/
python3 whisper/decode.py --exp-dir whisper/exp --max-duration 100

View File

@@ -1,8 +0,0 @@
#export CUDA_VISIBLE_DEVICES="2,3"
pip install -r seamlessm4t/requirements.txt
pip install k2==1.24.3.dev20230524+cuda11.8.torch2.0.1 -f https://k2-fsa.github.io/k2/cuda.html
export PYTHONPATH=$PYTHONPATH:/lustre/fsw/sa/yuekaiz/asr/icefall
export PYTHONPATH=$PYTHONPATH:/lustre/fsw/sa/yuekaiz/asr/seamless_communication/src
export TORCH_HOME=/lustre/fsw/sa/yuekaiz/asr/hub
torchrun --nproc-per-node 8 seamlessm4t/train.py --use-fp16 1 --max-duration 300 --base-lr 1e-5 --exp-dir seamlessm4t/exp_new_vocab --start-epoch 1

View File

@@ -1,9 +0,0 @@
pip install k2==1.24.3.dev20230524+cuda11.8.torch2.0.1 -f https://k2-fsa.github.io/k2/cuda.html
pip install -r whisper/requirements.txt
export PYTHONPATH=$PYTHONPATH:/workspace/icefall
#export PYTHONPATH=$PYTHONPATH:/lustre/fsw/sa/yuekaiz/asr/icefall
#export PYTHONPATH=$PYTHONPATH:/mnt/samsung-t7/yuekai/asr/icefall
torchrun --nproc-per-node 8 whisper/train.py --use-fp16 1 --max-duration 20 --base-lr 1e-5 --exp-dir whisper/exp_medimum --start-epoch 1

View File

@@ -1 +0,0 @@
../tdnn_lstm_ctc/asr_datamodule.py

View File

@@ -1,415 +0,0 @@
#!/usr/bin/env python3
# Copyright 2021 Xiaomi Corporation (Author: Liyong Guo,
# Fangjun Kuang,
# Wei Kang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import logging
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Optional, Tuple
import k2
import torch
import torch.nn as nn
from asr_datamodule import AishellAsrDataModule
#from conformer import Conformer
from icefall.char_graph_compiler import CharCtcTrainingGraphCompiler
from icefall.checkpoint import average_checkpoints, load_checkpoint, average_checkpoints_with_averaged_model
from icefall.decode import (
get_lattice,
nbest_decoding,
nbest_oracle,
one_best_decoding,
rescore_with_attention_decoder,
)
from icefall.env import get_env_info
from icefall.lexicon import Lexicon
from icefall.utils import (
AttributeDict,
get_texts,
setup_logger,
store_transcripts,
write_error_stats,
)
from seamless_communication.models.unity import (
UnitYModel,
load_unity_model,
load_unity_text_tokenizer,
)
from fairseq2.generation import (
SequenceGeneratorOptions,
SequenceToTextGenerator,
)
from seamless_communication.models.unity.model import UnitYX2TModel
def get_parser():
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument(
"--epoch",
type=int,
default=-1,
help="It specifies the checkpoint to use for decoding."
"Note: Epoch counts from 0.",
)
parser.add_argument(
"--avg",
type=int,
default=1,
help="Number of checkpoints to average. Automatically select "
"consecutive checkpoints before the checkpoint specified by "
"'--epoch'. ",
)
parser.add_argument(
"--method",
type=str,
default="beam-search",
help="""Decoding method.
Supported values are:
- (0) ctc-decoding. Use CTC decoding. It maps the token ids to
tokens using the token symbol table directly.
- (1) 1best. Extract the best path from the decoding lattice as the
decoding result.
- (2) nbest. Extract n paths from the decoding lattice; the path
with the highest score is the decoding result.
- (3) attention-decoder. Extract n paths from the lattice,
the path with the highest score is the decoding result.
- (4) nbest-oracle. Its WER is the lower bound of any n-best
rescoring method can achieve. Useful for debugging n-best
rescoring method.
""",
)
parser.add_argument(
"--exp-dir",
type=str,
default="seamlessm4t/exp",
help="The experiment dir",
)
return parser
def get_params() -> AttributeDict:
params = AttributeDict(
{
# parameters for conformer
"subsampling_factor": 4,
"feature_dim": 80,
"nhead": 4,
"attention_dim": 512,
"num_encoder_layers": 12,
"num_decoder_layers": 6,
"vgg_frontend": False,
"use_feat_batchnorm": True,
# parameters for decoder
"search_beam": 20,
"output_beam": 7,
"min_active_states": 30,
"max_active_states": 10000,
"use_double_scores": True,
"env_info": get_env_info(),
}
)
return params
def decode_one_batch(
params: AttributeDict,
s2t_generator: SequenceToTextGenerator,
batch: dict,
) -> Dict[str, List[List[int]]]:
"""Decode one batch and return the result in a dict. The dict has the
following format:
- key: It indicates the setting used for decoding. For example,
if decoding method is 1best, the key is the string `no_rescore`.
If attention rescoring is used, the key is the string
`ngram_lm_scale_xxx_attention_scale_xxx`, where `xxx` is the
value of `lm_scale` and `attention_scale`. An example key is
`ngram_lm_scale_0.7_attention_scale_0.5`
- value: It contains the decoding result. `len(value)` equals the
batch size. `value[i]` is the decoding result for the i-th
utterance in the given batch.
Args:
params:
It's the return value of :func:`get_params`.
s2t_generator:
The sequence-to-text generator used for beam-search decoding.
batch:
It is the return value from iterating
`lhotse.dataset.K2SpeechRecognitionDataset`. See its documentation
for the format of the `batch`.
Returns:
Return the decoding result. See above description for the format of
the returned dict.
"""
dtype = torch.float16
device = torch.device("cuda", 3)
feature = batch["inputs"]
assert feature.ndim == 3
feature = feature.to(device, dtype=dtype)
# at entry, feature is (N, T, C)
supervisions = batch["supervisions"]
feature_len = supervisions["num_frames"]
feature_len = feature_len.to(device, dtype=dtype)
text_output = s2t_generator.generate_ex(feature, feature_len)
sentences = text_output.sentences
hyps = [sentence.bytes().decode("utf-8").split() for sentence in sentences]
key = "beam-search"
return {key: hyps}
def decode_dataset(
dl: torch.utils.data.DataLoader,
params: AttributeDict,
s2t_generator: SequenceToTextGenerator,
) -> Dict[str, List[Tuple[str, List[str], List[str]]]]:
"""Decode dataset.
Args:
dl:
PyTorch's dataloader containing the dataset to decode.
params:
It is returned by :func:`get_params`.
s2t_generator:
The sequence-to-text generator used for beam-search decoding.
Returns:
Return a dict whose key is the decoding setting (here always "beam-search").
Its value is a list of tuples. Each tuple contains the cut id, the
reference transcript, and the predicted result.
"""
results = []
num_cuts = 0
try:
num_batches = len(dl)
except TypeError:
num_batches = "?"
results = defaultdict(list)
for batch_idx, batch in enumerate(dl):
texts = batch["supervisions"]["text"]
cut_ids = [cut.id for cut in batch["supervisions"]["cut"]]
hyps_dict = decode_one_batch(
params=params,
s2t_generator=s2t_generator,
batch=batch,
)
for lm_scale, hyps in hyps_dict.items():
this_batch = []
assert len(hyps) == len(texts)
for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts):
ref_words = ref_text.split()
this_batch.append((cut_id, ref_words, hyp_words))
results[lm_scale].extend(this_batch)
num_cuts += len(batch["supervisions"]["text"])
if batch_idx % 100 == 0:
batch_str = f"{batch_idx}/{num_batches}"
logging.info(f"batch {batch_str}, cuts processed until now is {num_cuts}")
return results
def save_results(
params: AttributeDict,
test_set_name: str,
results_dict: Dict[str, List[Tuple[str, List[str], List[str]]]],
):
enable_log = True
test_set_wers = dict()
for key, results in results_dict.items():
recog_path = params.exp_dir / f"recogs-{test_set_name}-{key}-{params.suffix}.txt"
results = sorted(results)
store_transcripts(filename=recog_path, texts=results)
if enable_log:
logging.info(f"The transcripts are stored in {recog_path}")
# The following prints out WERs, per-word error statistics and aligned
# ref/hyp pairs.
errs_filename = params.exp_dir / f"errs-{test_set_name}-{key}-{params.suffix}.txt"
# we compute CER for aishell dataset.
results_char = []
for res in results:
results_char.append((res[0], list("".join(res[1])), list("".join(res[2]))))
with open(errs_filename, "w") as f:
wer = write_error_stats(
f, f"{test_set_name}-{key}", results_char, enable_log=enable_log
)
test_set_wers[key] = wer
if enable_log:
logging.info("Wrote detailed error stats to {}".format(errs_filename))
test_set_wers = sorted(test_set_wers.items(), key=lambda x: x[1])
errs_info = params.exp_dir / f"cer-summary-{test_set_name}-{params.suffix}.txt"
with open(errs_info, "w") as f:
print("settings\tCER", file=f)
for key, val in test_set_wers:
print("{}\t{}".format(key, val), file=f)
s = "\nFor {}, CER of different settings are:\n".format(test_set_name)
note = "\tbest for {}".format(test_set_name)
for key, val in test_set_wers:
s += "{}\t{}{}\n".format(key, val, note)
note = ""
logging.info(s)
@torch.no_grad()
def main():
parser = get_parser()
AishellAsrDataModule.add_arguments(parser)
args = parser.parse_args()
args.exp_dir = Path(args.exp_dir)
params = get_params()
params.update(vars(args))
params.suffix = f"epoch-{params.epoch}-avg-{params.avg}"
setup_logger(f"{params.exp_dir}/log-{params.method}/log-decode-{params.suffix}")
logging.info("Decoding started")
logging.info(params)
device = torch.device("cpu")
if torch.cuda.is_available():
device = torch.device("cuda", 3)
logging.info(f"device: {device}")
dtype = torch.float16
model_name_or_card = "seamlessM4T_medium"
#model_name_or_card = "seamlessM4T_large"
model = load_unity_model(model_name_or_card, device=device, dtype=dtype)
del model.t2u_model
del model.text_encoder
del model.text_encoder_frontend
if params.epoch > 0:
if params.avg > 1:
start = params.epoch - params.avg
assert start >= 1, start
filename_start = f"{params.exp_dir}/epoch-{start}.pt"
filename_end = f"{params.exp_dir}/epoch-{params.epoch}.pt"
logging.info(
f"Calculating the averaged model over epoch range from "
f"{start} (excluded) to {params.epoch}"
)
model.to(device)
model.load_state_dict(
average_checkpoints_with_averaged_model(
filename_start=filename_start,
filename_end=filename_end,
device=device,
)
)
else:
load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
model.to(device)
model.eval()
num_param = sum([p.numel() for p in model.parameters()])
logging.info(f"Number of model parameters: {num_param}")
text_tokenizer = load_unity_text_tokenizer(model_name_or_card)
text_max_len_a = 1
text_max_len_b = 200
target_lang = "cmn"
text_opts = SequenceGeneratorOptions(
beam_size=5, soft_max_seq_len=(text_max_len_a, text_max_len_b)
)
s2t_model = UnitYX2TModel(
encoder_frontend=model.speech_encoder_frontend,
encoder=model.speech_encoder,
decoder_frontend=model.text_decoder_frontend,
decoder=model.text_decoder,
final_proj=model.final_proj,
pad_idx=model.pad_idx,
)
s2t_generator = SequenceToTextGenerator(
s2t_model, text_tokenizer, target_lang, text_opts
)
# we need cut ids to display recognition results.
args.return_cuts = True
aishell = AishellAsrDataModule(args)
test_cuts = aishell.test_cuts()
test_dl = aishell.test_dataloaders(test_cuts)
test_sets = ["test"]
test_dls = [test_dl]
for test_set, test_dl in zip(test_sets, test_dls):
results_dict = decode_dataset(
dl=test_dl,
params=params,
s2t_generator=s2t_generator,
)
save_results(params=params, test_set_name=test_set, results_dict=results_dict)
logging.info("Done!")
torch.set_num_threads(1)
torch.set_num_interop_threads(1)
if __name__ == "__main__":
main()
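For context, with the `--epoch 5 --avg 2` invocation from the removed run script at the top of this diff, the checkpoint-averaging branch in `main()` above resolves to the following files. A minimal sketch (plain Python, illustrative only, assuming the default `--exp-dir seamlessm4t/exp`):

epoch, avg = 5, 2                      # values from the removed run script
start = epoch - avg                    # 3; the code asserts start >= 1
filename_start = f"seamlessm4t/exp/epoch-{start}.pt"   # excluded endpoint
filename_end = f"seamlessm4t/exp/epoch-{epoch}.pt"
# average_checkpoints_with_averaged_model then averages over the range
# epoch 3 (excluded) to epoch 5, matching the log message above.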

View File

@@ -1,432 +0,0 @@
#!/usr/bin/env python3
# Copyright 2021 Xiaomi Corporation (Author: Liyong Guo,
# Fangjun Kuang,
# Wei Kang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import logging
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Optional, Tuple
import k2
import torch
import torch.nn as nn
from asr_datamodule import AishellAsrDataModule
#from conformer import Conformer
from tokenizer import CharTokenizer
from icefall.char_graph_compiler import CharCtcTrainingGraphCompiler
from icefall.checkpoint import average_checkpoints, load_checkpoint, average_checkpoints_with_averaged_model
from icefall.decode import (
get_lattice,
nbest_decoding,
nbest_oracle,
one_best_decoding,
rescore_with_attention_decoder,
)
from icefall.env import get_env_info
from icefall.lexicon import Lexicon
from icefall.utils import (
AttributeDict,
get_texts,
setup_logger,
store_transcripts,
write_error_stats,
)
from seamless_communication.models.unity import (
UnitYModel,
load_unity_model,
load_unity_text_tokenizer,
)
from fairseq2.generation import (
SequenceGeneratorOptions,
SequenceToTextGenerator,
)
from seamless_communication.models.unity.model import UnitYX2TModel
from fairseq2.nn.embedding import Embedding
def get_parser():
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument(
"--epoch",
type=int,
default=-1,
help="It specifies the checkpoint to use for decoding."
"Note: Epoch counts from 0.",
)
parser.add_argument(
"--avg",
type=int,
default=1,
help="Number of checkpoints to average. Automatically select "
"consecutive checkpoints before the checkpoint specified by "
"'--epoch'. ",
)
parser.add_argument(
"--method",
type=str,
default="beam-search",
help="""Decoding method.
Supported values are:
- (0) ctc-decoding. Use CTC decoding. It maps the token ids to
tokens using the token symbol table directly.
- (1) 1best. Extract the best path from the decoding lattice as the
decoding result.
- (2) nbest. Extract n paths from the decoding lattice; the path
with the highest score is the decoding result.
- (3) attention-decoder. Extract n paths from the lattice,
the path with the highest score is the decoding result.
- (4) nbest-oracle. Its WER is the lower bound of any n-best
rescoring method can achieve. Useful for debugging n-best
rescoring method.
""",
)
parser.add_argument(
"--exp-dir",
type=str,
default="seamlessm4t/exp",
help="The experiment dir",
)
return parser
def get_params() -> AttributeDict:
params = AttributeDict(
{
# parameters for conformer
"subsampling_factor": 4,
"feature_dim": 80,
"nhead": 4,
"attention_dim": 512,
"num_encoder_layers": 12,
"num_decoder_layers": 6,
"vgg_frontend": False,
"use_feat_batchnorm": True,
# parameters for decoder
"search_beam": 20,
"output_beam": 7,
"min_active_states": 30,
"max_active_states": 10000,
"use_double_scores": True,
"env_info": get_env_info(),
}
)
return params
def decode_one_batch(
params: AttributeDict,
s2t_generator: SequenceToTextGenerator,
batch: dict,
) -> Dict[str, List[List[int]]]:
"""Decode one batch and return the result in a dict. The dict has the
following format:
- key: It indicates the setting used for decoding. For example,
if decoding method is 1best, the key is the string `no_rescore`.
If attention rescoring is used, the key is the string
`ngram_lm_scale_xxx_attention_scale_xxx`, where `xxx` is the
value of `lm_scale` and `attention_scale`. An example key is
`ngram_lm_scale_0.7_attention_scale_0.5`
- value: It contains the decoding result. `len(value)` equals the
batch size. `value[i]` is the decoding result for the i-th
utterance in the given batch.
Args:
params:
It's the return value of :func:`get_params`. `params.tokenizer` is the
CharTokenizer used to map predicted token ids back to text.
s2t_generator:
The sequence-to-text generator used for beam-search decoding.
batch:
It is the return value from iterating
`lhotse.dataset.K2SpeechRecognitionDataset`. See its documentation
for the format of the `batch`.
Returns:
Return the decoding result. See above description for the format of
the returned dict.
"""
dtype = torch.float16
device = torch.device("cuda", 3)
feature = batch["inputs"]
assert feature.ndim == 3
feature = feature.to(device, dtype=dtype)
# at entry, feature is (N, T, C)
supervisions = batch["supervisions"]
feature_len = supervisions["num_frames"]
feature_len = feature_len.to(device, dtype=dtype)
text_output = s2t_generator.generate_ex(feature, feature_len)
#sentences = text_output.sentences
#hyps = [sentence.bytes().decode("utf-8").split() for sentence in sentences]
token_ids = text_output.generator_output.results
hyps_ids = [sentence[0].seq.cpu().tolist() for sentence in token_ids]
hyps = [params.tokenizer.decode(hyps_id).split() for hyps_id in hyps_ids]
key = "beam-search"
return {key: hyps}
def decode_dataset(
dl: torch.utils.data.DataLoader,
params: AttributeDict,
s2t_generator: SequenceToTextGenerator,
) -> Dict[str, List[Tuple[str, List[str], List[str]]]]:
"""Decode dataset.
Args:
dl:
PyTorch's dataloader containing the dataset to decode.
params:
It is returned by :func:`get_params`.
s2t_generator:
The sequence-to-text generator used for beam-search decoding.
Returns:
Return a dict whose key is the decoding setting (here always "beam-search").
Its value is a list of tuples. Each tuple contains the cut id, the
reference transcript, and the predicted result.
"""
results = []
num_cuts = 0
try:
num_batches = len(dl)
except TypeError:
num_batches = "?"
results = defaultdict(list)
for batch_idx, batch in enumerate(dl):
texts = batch["supervisions"]["text"]
cut_ids = [cut.id for cut in batch["supervisions"]["cut"]]
hyps_dict = decode_one_batch(
params=params,
s2t_generator=s2t_generator,
batch=batch,
)
for lm_scale, hyps in hyps_dict.items():
this_batch = []
assert len(hyps) == len(texts)
for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts):
ref_words = ref_text.split()
this_batch.append((cut_id, ref_words, hyp_words))
results[lm_scale].extend(this_batch)
num_cuts += len(batch["supervisions"]["text"])
if batch_idx % 100 == 0:
batch_str = f"{batch_idx}/{num_batches}"
logging.info(f"batch {batch_str}, cuts processed until now is {num_cuts}")
return results
def save_results(
params: AttributeDict,
test_set_name: str,
results_dict: Dict[str, List[Tuple[str, List[str], List[str]]]],
):
enable_log = True
test_set_wers = dict()
for key, results in results_dict.items():
recog_path = params.exp_dir / f"recogs-{test_set_name}-{key}-{params.suffix}.txt"
results = sorted(results)
store_transcripts(filename=recog_path, texts=results)
if enable_log:
logging.info(f"The transcripts are stored in {recog_path}")
# The following prints out WERs, per-word error statistics and aligned
# ref/hyp pairs.
errs_filename = params.exp_dir / f"errs-{test_set_name}-{key}-{params.suffix}.txt"
# we compute CER for aishell dataset.
results_char = []
for res in results:
results_char.append((res[0], list("".join(res[1])), list("".join(res[2]))))
with open(errs_filename, "w") as f:
wer = write_error_stats(
f, f"{test_set_name}-{key}", results_char, enable_log=enable_log
)
test_set_wers[key] = wer
if enable_log:
logging.info("Wrote detailed error stats to {}".format(errs_filename))
test_set_wers = sorted(test_set_wers.items(), key=lambda x: x[1])
errs_info = params.exp_dir / f"cer-summary-{test_set_name}-{params.suffix}.txt"
with open(errs_info, "w") as f:
print("settings\tCER", file=f)
for key, val in test_set_wers:
print("{}\t{}".format(key, val), file=f)
s = "\nFor {}, CER of different settings are:\n".format(test_set_name)
note = "\tbest for {}".format(test_set_name)
for key, val in test_set_wers:
s += "{}\t{}{}\n".format(key, val, note)
note = ""
logging.info(s)
@torch.no_grad()
def main():
parser = get_parser()
AishellAsrDataModule.add_arguments(parser)
args = parser.parse_args()
args.exp_dir = Path(args.exp_dir)
params = get_params()
params.tokenizer = CharTokenizer('./seamlessm4t/tokens.txt')
params.update(vars(args))
params.suffix = f"epoch-{params.epoch}-avg-{params.avg}"
setup_logger(f"{params.exp_dir}/log-{params.method}/log-decode-{params.suffix}")
logging.info("Decoding started")
logging.info(params)
device = torch.device("cpu")
if torch.cuda.is_available():
device = torch.device("cuda", 3)
logging.info(f"device: {device}")
dtype = torch.float16
model_name_or_card = "seamlessM4T_medium"
#model_name_or_card = "seamlessM4T_large"
model = load_unity_model(model_name_or_card, device=device, dtype=dtype)
del model.t2u_model
del model.text_encoder
del model.text_encoder_frontend
model.text_decoder_frontend.embed = nn.Embedding(num_embeddings=params.tokenizer.vocab_size, embedding_dim=1024 ,padding_idx=0)
#model.text_decoder_frontend.embed = Embedding(num_embeddings=params.tokenizer.vocab_size, embedding_dim=1024 ,pad_idx=0, scaled=True)
model.final_proj = nn.Linear(1024, params.tokenizer.vocab_size, bias=False)
#model.final_proj = nn.Linear(1024, params.tokenizer.vocab_size)
if params.epoch > 0:
if params.avg > 1:
start = params.epoch - params.avg
assert start >= 1, start
filename_start = f"{params.exp_dir}/epoch-{start}.pt"
filename_end = f"{params.exp_dir}/epoch-{params.epoch}.pt"
logging.info(
f"Calculating the averaged model over epoch range from "
f"{start} (excluded) to {params.epoch}"
)
model.to(device)
model.load_state_dict(
average_checkpoints_with_averaged_model(
filename_start=filename_start,
filename_end=filename_end,
device=device,
)
)
else:
load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
model.to(device)
model.eval()
model.half()
#for param in model.parameters():
# if param.dtype == torch.float16:
# pass
# else:
# param.data = param.data.to(torch.float16)
#print(param)
num_param = sum([p.numel() for p in model.parameters()])
logging.info(f"Number of model parameters: {num_param}")
text_tokenizer = load_unity_text_tokenizer(model_name_or_card)
text_max_len_a = 1
text_max_len_b = 200
target_lang = "cmn"
text_opts = SequenceGeneratorOptions(
beam_size=5, soft_max_seq_len=(text_max_len_a, text_max_len_b)
)
s2t_model = UnitYX2TModel(
encoder_frontend=model.speech_encoder_frontend,
encoder=model.speech_encoder,
decoder_frontend=model.text_decoder_frontend,
decoder=model.text_decoder,
final_proj=model.final_proj,
pad_idx=model.pad_idx,
)
s2t_generator = SequenceToTextGenerator(
s2t_model, text_tokenizer, target_lang, text_opts
)
# we need cut ids to display recognition results.
args.return_cuts = True
aishell = AishellAsrDataModule(args)
test_cuts = aishell.test_cuts()
test_dl = aishell.test_dataloaders(test_cuts)
test_sets = ["test"]
test_dls = [test_dl]
for test_set, test_dl in zip(test_sets, test_dls):
results_dict = decode_dataset(
dl=test_dl,
params=params,
s2t_generator=s2t_generator,
)
save_results(params=params, test_set_name=test_set, results_dict=results_dict)
logging.info("Done!")
torch.set_num_threads(1)
torch.set_num_interop_threads(1)
if __name__ == "__main__":
main()

View File

@@ -1 +0,0 @@
../../../librispeech/ASR/conformer_ctc/label_smoothing.py

View File

@@ -1,133 +0,0 @@
import torch
import torch.nn as nn
from fairseq2.nn.embedding import Embedding
from seamless_communication.models.inference import Translator
from seamless_communication.models.unity import (
UnitTokenizer,
UnitYModel,
load_unity_model,
load_unity_text_tokenizer,
load_unity_unit_tokenizer,
)
from fairseq2.generation import (
Seq2SeqGenerator,
SequenceGeneratorOptions,
SequenceGeneratorOutput,
SequenceToTextGenerator,
SequenceToTextOutput,
)
from seamless_communication.models.unity.model import UnitYModel, UnitYX2TModel
import torchaudio
import torchaudio.compliance.kaldi as ta_kaldi
audio_file="/mnt/samsung-t7/yuekai/asr/Triton-ASR-Client/datasets/mini_en/wav/1089-134686-0001.wav"
src_lang="cmn"
audio_file="/mnt/samsung-t7/yuekai/asr/Triton-ASR-Client/datasets/mini_zh/wav/long.wav"
src_lang="eng"
target_lang = "cmn"
audio_input = torchaudio.load(audio_file)[0]
feature = ta_kaldi.fbank(audio_input, num_mel_bins=80)
# feature shape is (T, F), convert it to (B, T, F), source_seq_lens tracks T
source_seqs = feature.unsqueeze(0)
source_seq_lens = torch.tensor([feature.shape[0]])
# Initialize a Translator object with a multitask model, vocoder on the GPU.
# translator = Translator("seamlessM4T_medium", vocoder_name_or_card="vocoder_36langs", device=torch.device("cuda:2"), dtype=torch.float16)
# transcribed_text, _, _ = translator.predict(audio_file, "asr", src_lang)
# print(transcribed_text)
model_name_or_card = "seamlessM4T_medium"
device = torch.device("cuda:3")
# cast source_seq_lens, source_seqs to device, dtype to torch.float16
source_seq_lens = source_seq_lens.to(device=device, dtype=torch.float16)
source_seqs = source_seqs.to(device=device, dtype=torch.float16)
dtype = torch.float16
model = load_unity_model(model_name_or_card, device=device, dtype=dtype)
model.eval()
model.text_decoder_frontend.embed = Embedding(num_embeddings=6257, embedding_dim=1024 ,pad_idx=0, scaled=True)
model.final_proj = nn.Linear(1024, 6257)
model.half()
print(model.text_decoder_frontend.embed, model.text_encoder_frontend.embed.weight.dtype, type(model.text_encoder_frontend.embed), type(model.text_encoder_frontend.embed.weight))
print(model.final_proj, model.final_proj.weight.dtype, type(model.final_proj), type(model.final_proj.weight))
#input()
exit(0)
text_tokenizer = load_unity_text_tokenizer(model_name_or_card)
#print(text_tokenizer.model.eos_idx, text_tokenizer.model.pad_idx)
#text_tokenizer_encoder = text_tokenizer.create_encoder(lang=target_lang, mode="target")
#text_tokenizer_decoder = text_tokenizer.create_decoder()
# print attritbut of text_tokenizer_encoder
#print(text_tokenizer.vocab_info)
#print(text_tokenizer_encoder("其中广州深圳甚至出现了多个日光盘"))
#print(text_tokenizer_decoder(torch.tensor([3,256200,137139,252603,250476,250590,1,84778,148897,249568,249352,249947,249050,250520,254508])))
# store all vocab in a file
# with open("vocab.txt", "w") as f:
# for i in range(256206):
# f.write(f"{i}: " + text_tokenizer_decoder(torch.tensor([i]))[0].bytes().decode("utf-8")+ "\n")
# f.close()
# exit(0)
# def decode(
# self,
# seqs: Tensor,
# seq_lens: Optional[Tensor],
# encoder_output: Tensor,
# encoder_padding_mask: Optional[Tensor],
# state_bag: Optional[IncrementalStateBag] = None,
# ) -> Tuple[Tensor, Optional[Tensor]]:
# seqs, padding_mask = self.text_decoder_frontend(seqs, seq_lens, state_bag)
# return self.text_decoder( # type: ignore[no-any-return]
# seqs, padding_mask, encoder_output, encoder_padding_mask, state_bag
# )
# def decoding(model, feature):
# seqs, padding_mask = model.speech_encoder_frontend(seqs, seq_lens)
# speech_encoder(seqs, padding_mask)
# decoder_output, decoder_padding_mask = self.decode(
# batch.target_seqs,
# batch.target_seq_lens,
# encoder_output,
# encoder_padding_mask,
# )
# text_logits = model.final_project(decoder_output, decoder_padding_mask)
text_max_len_a = 1
text_max_len_b = 200
text_opts = SequenceGeneratorOptions(
beam_size=5, soft_max_seq_len=(text_max_len_a, text_max_len_b)
)
s2t_model = UnitYX2TModel(
encoder_frontend=model.speech_encoder_frontend,
encoder=model.speech_encoder,
decoder_frontend=model.text_decoder_frontend,
decoder=model.text_decoder,
final_proj=model.final_proj,
pad_idx=model.pad_idx,
)
s2t_generator = SequenceToTextGenerator(
s2t_model, text_tokenizer, target_lang, text_opts
)
text_output = s2t_generator.generate_ex(source_seqs, source_seq_lens)
print(text_output.generator_output.results[0][0].seq.cpu().tolist())
# sentence = text_output.sentences[0]
# print(sentence, type(sentence))
# sentence = sentence.bytes().decode("utf-8")
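The token ids collected at the end of this scratch script are only meaningful relative to a tokenizer. With the char vocabulary used for fine-tuning elsewhere in this PR, they could be mapped back to text roughly as follows (a sketch, not from the original file; `ids` is a hypothetical output list and ./seamlessm4t/tokens.txt is the symbol table consumed by seamlessm4t/tokenizer.py, removed further down):

from tokenizer import CharTokenizer   # seamlessm4t/tokenizer.py, removed further down in this diff

tok = CharTokenizer("./seamlessm4t/tokens.txt")
ids = [250, 1154, 873, 1]     # hypothetical ids; 1 is the EOS id assumed in the removed decode code
print(tok.decode(ids[:-1]))   # drop the trailing EOS, then map each remaining id to its character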

File diff suppressed because it is too large.

View File

@@ -1,694 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
import math
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union, cast
import torch
from torch import Tensor
from torch.nn.functional import log_softmax
from fairseq2.data import Collater, SequenceData, VocabularyInfo
from fairseq2.generation.beam_search import BeamSearch, StandardBeamSearch
from fairseq2.generation.logits_processor import LogitsProcessor
from fairseq2.models.encoder_decoder import Seq2SeqDecoder
from fairseq2.nn.incremental_state import IncrementalStateBag
from fairseq2.typing import Device
@dataclass
class SequenceGeneratorOptions:
"""Holds the options to pass to a sequence generator."""
beam_size: int = 5
"""The beam size."""
min_seq_len: int = 1
"""The minimum length of generated sequences (including prefix sequence)."""
soft_max_seq_len: Optional[Tuple[int, int]] = (1, 200)
"""The terms ``a`` and ``b`` of ``ax + b`` where ``x`` is the source
sequence length. The generated sequences (including prefix sequence) will
have the maximum length of ``min(hard_max_seq_len, ax + b)``. See also
``hard_max_seq_len``."""
hard_max_seq_len: int = 1024
"""The hard limit on maximum length of generated sequences."""
len_penalty: float = 1.0
"""The length penalty, where values less than 1.0 favor shorter, values
greater than 1.0 favor longer sequences."""
unk_penalty: float = 0.0
"""The unknown symbol penalty, where values less than 0 produce more UNKs,
values greater than 0 produce fewer UNKs."""
normalize_scores: bool = True
"""If ``True``, normalizes scores by the length of generated sequences."""
search: Optional[BeamSearch] = None
"""The beam search algorithm to use."""
logits_processor: Optional[LogitsProcessor] = None
"""Logits processor called before applying beam search step."""
class Seq2SeqGenerator:
"""Represents a sequence-to-sequence generator."""
decoder: Seq2SeqDecoder
opts: SequenceGeneratorOptions
beam_size: int
eos_idx: int
pad_idx: Optional[int]
unk_idx: Optional[int]
prefix_seq: Union[int, Tensor]
prefix_seq_len: int
search: BeamSearch
logits_processor: Optional[LogitsProcessor]
collater: Collater
def __init__(
self,
decoder: Seq2SeqDecoder,
vocab_info: VocabularyInfo,
prefix_seq: Optional[Union[int, Tensor]],
opts: Optional[SequenceGeneratorOptions] = None,
) -> None:
"""
:param decoder:
The decoder to use.
:param vocab_info:
The vocabulary information to use.
:param prefix_seq:
The prefix sequence, typically one or more control symbols
indicating the beginning of a sequence. *Shape:* :math:`()` or
:math:`(S)`, where :math:`S` is the sequence length. If ``None``,
the EOS symbol will be used as prefix.
:param opts:
The generation options.
"""
self.decoder = decoder
self.opts = opts or SequenceGeneratorOptions()
# Set beam size.
if vocab_info.pad_idx is None:
self.beam_size = min(self.opts.beam_size, vocab_info.size)
else:
# -1 since we never select PAD.
self.beam_size = min(self.opts.beam_size, vocab_info.size - 1)
if vocab_info.eos_idx is None:
raise ValueError(
"`vocab_info` must have `eos_idx` set for sequence generation."
)
# Set vocab info.
self.eos_idx = 1
#self.eos_idx = vocab_info.eos_idx
self.unk_idx = 2
#self.unk_idx = vocab_info.unk_idx
self.pad_idx = 0
#self.pad_idx = vocab_info.pad_idx
# Set prefix sequence.
if 1:
#if prefix_seq is None:
# If `None`, we follow fairseq's convention, and use EOS as the
# prefix.
self.prefix_seq, self.prefix_seq_len = self.eos_idx, 1
else:
self.prefix_seq = prefix_seq
if isinstance(prefix_seq, Tensor):
num_dim = prefix_seq.dim()
if num_dim >= 2:
raise ValueError(
f"`prefix_seq` must be a scalar or a 1-dimensional tensor, but is {num_dim}-dimensional instead."
)
self.prefix_seq_len = 1 if num_dim == 0 else prefix_seq.size(0)
else:
self.prefix_seq_len = 1
# Set beam search.
self.search = self.opts.search or StandardBeamSearch()
self.logits_processor = self.opts.logits_processor
if vocab_info.pad_idx is None:
self.collater = Collater()
else:
self.collater = Collater(self.pad_idx, pad_to_multiple=2)
@torch.inference_mode()
def __call__(
self,
encoder_output: Tensor,
encoder_padding_mask: Optional[Tensor],
source_seq_len: Optional[int] = None,
) -> "SequenceGeneratorOutput":
opts = self.opts
num_searches = encoder_output.size(0)
beam_size = opts.beam_size
max_seq_len = self._determine_max_seq_len(source_seq_len)
device = encoder_output.device
encoder_output, encoder_padding_mask = self._fan_out_encoder_output(
encoder_output, encoder_padding_mask
)
# Each element contains the id of the search corresponding to a single
# source sequence and its hypotheses.
active_searches: List[Tuple[int, List[Hypothesis]]] = [
(search_idx, []) for search_idx in range(num_searches)
]
# Once a source sequence has `beam_size` hypotheses, its search is moved
# from `active_searches` to `finished_searches`.
finished_searches: List[List[Hypothesis]] = [[] for i in range(num_searches)]
num_remaining_searches = num_searches
# Initialize buffers.
# (N x B, S)
seqs = torch.zeros(
(num_searches * beam_size, max_seq_len), device=device, dtype=torch.int64
)
# (N x B, S)
scores = torch.zeros(
(num_searches * beam_size, max_seq_len), device=device, dtype=torch.float32
)
# A list that indicates beams that should be ignored in the next step.
ignored_beam_mask = torch.full(
(num_searches, beam_size), False, device=device, dtype=torch.bool
)
# An offset array for converting between batch-wide and search-local
# beam indices.
# (B)
search_offsets = torch.arange(num_searches, device=device) * beam_size
# (B) -> (B, 1)
search_offsets.unsqueeze_(-1)
cand_offsets = torch.arange(2 * beam_size, device=device)
state_bag = IncrementalStateBag()
# At this point, the state is fully initialized, kick off the search.
self._bootstrap_seqs_and_scores(
seqs, scores, encoder_output, encoder_padding_mask, state_bag
)
start_step = self.prefix_seq_len - 1
# Holds the indices of beams (a beam can occur more than once) that we
# should continue with in the next step.
beam_indices: Optional[Tensor] = None
# Holds the indices of searches that we should continue with in the next
# step. If not `None`, it means we finalized one or more searches in the
# last step.
search_indices: Optional[Tensor] = None
for step_nr in range(start_step, max_seq_len - 1):
if beam_indices is not None:
# If not `None`, it means in the last step we finalized one or
# more searches. We should ensure that we adjust `beam_indices`
# before reordering `decoder`'s incremental state.
if search_indices is not None:
num_searches = search_indices.numel()
# (N)
delta = search_indices - torch.arange(num_searches, device=device)
# (N) -> (N, 1)
delta.unsqueeze_(-1)
# Adjust indices to take into account removed searches.
beam_indices.view(num_searches, beam_size).add_(delta * beam_size)
state_bag.reorder(beam_indices)
decoder_output, decoder_padding_mask = self.decoder.decode(
seqs[:, step_nr : step_nr + 1],
None, # We never generate PAD.
encoder_output,
encoder_padding_mask,
state_bag,
)
state_bag.increment_step()
model_output = self.decoder.project(decoder_output, decoder_padding_mask)
# lprobs: (1, V)
# model_output: (N, 1, V)
lprobs = log_softmax(model_output.logits, dim=-1, dtype=torch.float32)
# Do not allow EOS before reaching the minimum sequence length.
if step_nr < self.opts.min_seq_len:
lprobs[:, :, self.eos_idx] = -torch.inf
# fmt: off
# If we have reached the maximum length, force the last step to be
# EOS.
if step_nr == max_seq_len - 2:
lprobs[:, :, : self.eos_idx] = -torch.inf
lprobs[:, :, self.eos_idx + 1 :] = -torch.inf
# fmt: on
# Never allow PAD.
if self.pad_idx is not None:
lprobs[:, :, self.pad_idx] = -torch.inf
# Apply UNK penalty.
if self.unk_idx is not None:
lprobs[:, :, self.unk_idx] -= self.opts.unk_penalty
# update scores in place using logits_processor
if self.logits_processor is not None:
self.logits_processor(
seqs.view(num_searches, beam_size, -1)[:, :, : step_nr + 1],
lprobs.view(num_searches, beam_size, -1),
)
# Determine candidates for the next step.
# (N, 2 x B)
cand_scores, cand_indices, cand_beam_indices = self.search.step(
step_nr,
step_nr == start_step,
lprobs.view(num_searches, beam_size, -1),
scores.view(num_searches, beam_size, -1)[:, :, : step_nr + 1],
)
# Convert search-local beam indices to batch-wide beam indices.
# (N, 2 x B) + (N) -> (N, 2 x B)
global_cand_beam_indices = cand_beam_indices + search_offsets
# Finalize beams that reached the minimum length and that end with
# an EOS.
# (N, 2 x B)
eos_mask = (cand_indices == self.eos_idx) & (cand_scores != -math.inf)
# Do not attempt to finalize beams that should be ignored.
eos_mask[:, :beam_size][ignored_beam_mask] = False
# Only consider EOS when it's among the top `beam_size` indices. Now
# we know what beam(s) to finalize.
# (N, B)
eos_beam_indices = torch.masked_select(
global_cand_beam_indices[:, :beam_size], mask=eos_mask[:, :beam_size]
)
if eos_beam_indices.numel() > 0:
# Select the scores of the finalized beams.
# (N, B)
eos_scores = torch.masked_select(
cand_scores[:, :beam_size], mask=eos_mask[:, :beam_size]
)
newly_finished_searches = self._finalize_hypothesis(
step_nr,
eos_beam_indices,
eos_scores,
seqs,
scores,
active_searches,
finished_searches,
)
num_remaining_searches -= len(newly_finished_searches)
if num_remaining_searches == 0:
break
else:
newly_finished_searches = None
# Remove finished searches (ones for which `beam_size` finalized
# beams have been generated) from the batch.
if newly_finished_searches:
new_num_searches = num_searches - len(newly_finished_searches)
# Construct `search_indices` which holds indices of searches
# to keep for the next step.
search_mask = torch.full((num_searches,), True, device=device)
search_mask[newly_finished_searches] = False
search_indices = torch.arange(num_searches, device=device)
search_indices = search_indices.masked_select(search_mask)
# fmt: off
# Filter out removed batches from state variables.
# (N, B) -> (N - F, B)
ignored_beam_mask = ignored_beam_mask[search_indices]
# (N, 2 x B) -> (N - F, 2 x B)
cand_scores = cand_scores [search_indices]
cand_indices = cand_indices [search_indices]
cand_beam_indices = cand_beam_indices[search_indices]
# (N) -> (N - F)
search_offsets.resize_(new_num_searches, 1)
# (N - F, 2 x B) + (N - F) -> (N - F, 2 x B)
global_cand_beam_indices = cand_beam_indices + search_offsets
# (N, 2 x B) -> (N - F, 2 x B)
eos_mask = eos_mask[search_indices]
# (N x B, S) -> (N, B, S)
seqs = seqs .view(num_searches, -1)
scores = scores.view(num_searches, -1)
# (N, B, S + 1) -> ((N - F) x B, S)
seqs = seqs [search_indices].view(new_num_searches * beam_size, -1)
scores = scores[search_indices].view(new_num_searches * beam_size, -1)
# (N x B, S_enc, M) -> (N, B, S_enc, M)
encoder_output = encoder_output.unflatten(0, (num_searches, -1))
# (N, B, S_enc, M) -> ((N - F) x B, S_enc, M)
encoder_output = encoder_output[search_indices].flatten(0, 1)
if encoder_padding_mask is not None:
# (N x B, S_enc, M) -> (N, B, S_enc, M)
padding_mask = encoder_padding_mask.unflatten(0, (num_searches, -1))
# (N, B, S_enc, M) -> ((N - F) x B, S_enc, M)
encoder_padding_mask = padding_mask[search_indices].flatten(0, 1)
# fmt: on
num_searches = new_num_searches
else:
search_indices = None
eos_mask[:, :beam_size][ignored_beam_mask] = True
# Set `beam_weights` so that values greater than or equal to 2 x
# `beam_size` indicate finished beams (i.e. end with EOS) and values
# less than 2 x `beam_size` indicate active beams.
# (N, 2 x B)
beam_weights = cand_offsets + (eos_mask * (2 * beam_size))
# Get the top `beam_size` active beams, which are the beams with the
# smallest weights in `active_beam_weights`.
# (N, B)
active_beam_weights, active_beams = torch.topk(
beam_weights, k=beam_size, dim=1, largest=False
)
# Update to ignore finalized beams in the next step.
# (N, B)
ignored_beam_mask = active_beam_weights >= 2 * beam_size
# We should always have at least one active beam in each search.
assert (~ignored_beam_mask).any(dim=1).all()
# Denotes which beams are continued for each new hypothesis (a beam
# can be selected more than once).
# (N, B)
beam_indices = torch.gather(
global_cand_beam_indices, dim=1, index=active_beams
)
# (N, B) -> (N x B)
beam_indices = beam_indices.view(-1)
# fmt: off
# Reorder beams in the `seq` and `score` buffers. The same beam can
# be selected more than once.
if step_nr > start_step:
seqs [:, : step_nr + 1] = torch.index_select(
seqs [:, : step_nr + 1], dim=0, index=beam_indices
)
scores[:, : step_nr + 1] = torch.index_select(
scores[:, : step_nr + 1], dim=0, index=beam_indices
)
# (N x B, S) -> (N, B, S)
seqs_view = seqs .view(num_searches, beam_size, -1)
scores_view = scores.view(num_searches, beam_size, -1)
seqs_view [:, :, step_nr + 1] = torch.gather(cand_indices, dim=1, index=active_beams)
scores_view[:, :, step_nr + 1] = torch.gather(cand_scores, dim=1, index=active_beams)
# fmt: on
# Ensure that hypotheses are sorted by their scores before returning.
for batch in finished_searches:
batch.sort(key=lambda b: b.score, reverse=True) # type: ignore[arg-type, return-value]
return SequenceGeneratorOutput(
results=finished_searches, device=device, collater=self.collater
)
def _determine_max_seq_len(self, source_seq_len: Optional[int]) -> int:
opts = self.opts
if source_seq_len is None or opts.soft_max_seq_len is None:
max_seq_len = opts.hard_max_seq_len
else:
at, bt = opts.soft_max_seq_len
max_seq_len = min(opts.hard_max_seq_len, int(at * source_seq_len + bt))
if opts.min_seq_len > max_seq_len:
raise ValueError(
f"The effective maximum sequence length must be greater than or equal to `min_seq_len` ({opts.min_seq_len}), but is {max_seq_len} instead. Adjust your soft and hard maximum sequence length limits."
)
if self.prefix_seq_len >= max_seq_len:
raise ValueError(
f"The effective maximum sequence length must be greater than `prefix_seq_len` ({self.prefix_seq_len}), but is {max_seq_len} instead."
)
return max_seq_len
def _fan_out_encoder_output(
self, encoder_output: Tensor, encoder_padding_mask: Optional[Tensor]
) -> Tuple[Tensor, Optional[Tensor]]:
num_searches = encoder_output.size(0) # i.e. batch size
# Fan out `encoder_output` to `num_searches` x `beam_size`.
# (N)
fan_out_indices = torch.arange(num_searches, device=encoder_output.device)
# (N) -> (N x B)
fan_out_indices = fan_out_indices.repeat_interleave(self.beam_size)
# (N, S_enc, M) -> (N x B, S_enc, M)
encoder_output = encoder_output.index_select(dim=0, index=fan_out_indices)
# (N, S_enc, M) -> (N x B, S_enc, M)
if encoder_padding_mask is not None:
encoder_padding_mask = encoder_padding_mask.index_select(
dim=0, index=fan_out_indices
)
return encoder_output, encoder_padding_mask
def _bootstrap_seqs_and_scores(
self,
seqs: Tensor,
scores: Tensor,
encoder_output: Tensor,
encoder_padding_mask: Optional[Tensor],
state_bag: IncrementalStateBag,
) -> None:
assert self.prefix_seq_len > 0
seqs[:, : self.prefix_seq_len] = self.prefix_seq
if self.prefix_seq_len == 1:
return
assert isinstance(self.prefix_seq, Tensor)
# We have to bootstrap the model with the already fanned-out encoder
# output to correctly initialize its incremental state. This causes some
# redundancy as we have to expand `decoder_input` to match the shape of
# `encoder_output`.
# (S_pfx) -> (N x B, S_pfx - 1)
decoder_input = self.prefix_seq[:-1].expand(encoder_output.size(0), -1)
# Bootstrap the model state with prefix sequence.
decoder_output, decoder_padding_mask = self.decoder.decode(
decoder_input,
None,
encoder_output,
encoder_padding_mask,
state_bag,
)
state_bag.increment_step(self.prefix_seq_len - 1)
model_output = self.decoder.project(decoder_output, decoder_padding_mask)
# lprobs: (S_pfx - 1, V)
# model_output: (N, S_pfx - 1, V) -> (S_pfx - 1, V)
lprobs = log_softmax(model_output.logits[0], dim=-1, dtype=torch.float32)
# Fetch scores of next steps.
# (S_pfx - 1, 1)
prefix_scores = torch.take_along_dim(
lprobs, indices=self.prefix_seq[1:].unsqueeze(1), dim=-1
)
# (S_pfx - 1, 1) -> (S_pfx - 1)
prefix_scores.squeeze_(1).cumsum_(dim=0)
# First step (e.g. EOS)'s score is always 0.
scores[:, 1 : self.prefix_seq_len] = prefix_scores
def _finalize_hypothesis(
self,
step_nr: int,
eos_beam_indices: Tensor,
eos_scores: Tensor,
seqs: Tensor,
scores: Tensor,
active_searches: List[Tuple[int, List["Hypothesis"]]],
finished_searches: List[List["Hypothesis"]],
) -> List[int]:
# fmt: off
finalized_seqs = seqs .index_select(dim=0, index=eos_beam_indices)
finalized_scores = scores.index_select(dim=0, index=eos_beam_indices)
finalized_seqs = finalized_seqs [:, : step_nr + 2]
finalized_scores = finalized_scores[:, : step_nr + 2]
# Finalize beams.
finalized_seqs [:, -1] = self.eos_idx
finalized_scores[:, -1] = eos_scores
# fmt: on
# Convert from cumulative to per-step scores.
finalized_scores[:, 1:] = finalized_scores[:, 1:] - finalized_scores[:, :-1]
# Skip first EOS since it is always 0 and skews normalization.
if self.opts.normalize_scores:
eos_scores /= (step_nr + 1) ** self.opts.len_penalty
# Holds the ids of finished searches.
newly_finished: List[int] = []
active_search_indices = (eos_beam_indices // self.beam_size).tolist()
for beam_idx, search_idx in enumerate(active_search_indices):
search_id, hypotheses = active_searches[search_idx]
# We might have more than one beam finalized in one step that would
# potentially exceed `beam_size` hypotheses.
if len(hypotheses) == self.beam_size:
continue
hypotheses.append(
Hypothesis(
seq=finalized_seqs[beam_idx],
score=eos_scores[beam_idx],
step_scores=finalized_scores[beam_idx],
)
)
if len(hypotheses) == self.beam_size:
# We have `beam_size` hypotheses for this particular search, so
# we finish it now.
newly_finished.append(search_idx)
finished_searches[search_id] = hypotheses
newly_finished.sort()
# Remove finished searches from the active list.
for idx in reversed(newly_finished):
del active_searches[idx]
return newly_finished
@dataclass
class SequenceGeneratorOutput:
"""Holds the output of a sequence generator."""
results: List[List["Hypothesis"]]
"""The list of hypothesis generated per search, ordered by score."""
device: Device
"""The device on which generated sequences reside."""
collater: Optional[Collater] = None
"""The collater to use in :meth:`collate`."""
def collate(
self, hypo_idx: int = 0, skip_batch: bool = False
) -> Tuple[Tensor, Optional[Tensor]]:
"""Collate the generated sequences at index ``hypo_idx`` in each search
result into a single tensor.
:param hypo_idx:
The index of hypothesis to extract from each search result.
:param skip_batch:
If ``True``, if a search result has no hypothesis at index `hypo_idx`,
it will be skipped instead of raising an error.
:returns:
- The collated sequences. *Shape:* :math:`(N,S)`, where :math:`N` is
the number of search results and :math:`S` is the sequence length.
- An array where each element represents the length of the sequence at
the same index in the first returned value. *Shape:* :math:`(N)`,
where :math:`N` is the number of search results.
"""
if self.collater is None:
raise RuntimeError("The output has no associated `Collater` instance.")
if not self.results and not skip_batch:
raise ValueError("The output must contain at least one search result.")
seqs = []
for search_idx, result in enumerate(self.results):
if hypo_idx >= len(result):
if not skip_batch:
raise ValueError(
f"Each search result must have at least {hypo_idx + 1} hypotheses, but search {search_idx} has only {len(result)}."
)
continue
seqs.append(result[hypo_idx].seq)
if not seqs:
# Return a zero-dimensional (not scalar!) tensor.
return torch.empty((0,), device=self.device, dtype=torch.int64), None
output = cast(SequenceData, self.collater(seqs))
return output["seqs"], output["seq_lens"] if output["is_ragged"] else None
@dataclass
class Hypothesis:
"""Represents a hypothesis produced by a sequence generator."""
seq: Tensor
"""The generated sequence."""
score: Tensor
"""The score of the hypothesis."""
step_scores: Tensor
"""The score of each individual sequence step."""

View File

@@ -1,6 +0,0 @@
#k2
kaldialign
lhotse
sentencepiece
tensorboard
fairseq2

View File

@@ -1,43 +0,0 @@
#import sentencepiece as spm
class CharTokenizer(object):
def __init__(self, tokenizer_file):
self.id2symbol = {}
self.symbol2id = {}
with open(tokenizer_file, 'r') as f:
for line in f:
line = line.strip()
if line:
symbol, id = line.split()
id = int(id)
self.id2symbol[id] = symbol
self.symbol2id[symbol] = id
self.vocab_size = len(self.id2symbol)
def encode(self, text):
# if symbol not in self.symbol2id, using <unk>'s id
return [self.symbol2id.get(symbol, 2) for symbol in text]
def decode(self, ids):
return ''.join([self.id2symbol[id] for id in ids])
if __name__ == '__main__':
# config_file = './config.yaml'
# config = read_yaml(config_file)
# converter = TokenIDConverter(config['token_list'])
# ids = converter.tokens2ids(['<s>', '你', '好', '吗', '</s>', 'microsoft', 'world'])
# print(ids)
# print(converter.ids2tokens(ids))
tokenizer = CharTokenizer('./tokens.txt')
ids = tokenizer.encode('今天 天气不错')
print(ids)
print(tokenizer.decode(ids+[1]))
# sp = spm.SentencePieceProcessor()
# sp.Load('../../../librispeech/ASR/k2fsa-zipformer-chinese-english-mixed/data/lang_char_bpe/bpe.model')
# texts = ['MICROSOFT WORLD']
# y = sp.encode(texts, out_type=int)
# x = sp.decode(y)
# print(y, x)
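The tokenizer above expects `tokens.txt` to be a plain-text symbol table with one `symbol id` pair per line. The actual file is not part of this diff; based on the ids hard-coded in the removed decode code (pad 0, eos 1, unk 2), a file of roughly the following shape is assumed (illustrative entries only):

<pad> 0
<sos/eos> 1
<unk> 2
今 3
天 4
气 5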

File diff suppressed because it is too large.

File diff suppressed because it is too large.

File diff suppressed because it is too large.

View File

@@ -30,7 +30,7 @@ from lhotse.dataset import (
DynamicBucketingSampler,
K2SpeechRecognitionDataset,
PrecomputedFeatures,
SimpleCutSampler,
SingleCutSampler,
SpecAugment,
)
from lhotse.dataset.input_strategies import OnTheFlyFeatures
@@ -176,13 +176,13 @@ class AishellAsrDataModule:
group.add_argument(
"--enable-musan",
type=str2bool,
default=False,
default=True,
help="When enabled, select noise from MUSAN and mix it"
"with training dataset. ",
)
def train_dataloaders(
self, cuts_train: CutSet, sampler_state_dict: Optional[Dict[str, Any]] = None, rank = None, world_size = None
self, cuts_train: CutSet, sampler_state_dict: Optional[Dict[str, Any]] = None
) -> DataLoader:
"""
Args:
@@ -192,13 +192,13 @@ class AishellAsrDataModule:
The state dict for the training sampler.
"""
logging.info("About to get Musan cuts")
cuts_musan = load_manifest(self.args.manifest_dir / "musan_cuts.jsonl.gz")
transforms = []
if self.args.enable_musan:
logging.info("Enable MUSAN")
cuts_musan = load_manifest(self.args.manifest_dir / "musan_cuts.jsonl.gz")
transforms.append(
CutMix(cuts=cuts_musan, p=0.5, snr=(10, 20), preserve_id=True)
CutMix(cuts=cuts_musan, prob=0.5, snr=(10, 20), preserve_id=True)
)
else:
logging.info("Disable MUSAN")
@@ -276,12 +276,10 @@ class AishellAsrDataModule:
shuffle=self.args.shuffle,
num_buckets=self.args.num_buckets,
drop_last=self.args.drop_last,
world_size=world_size,
rank=rank,
)
else:
logging.info("Using SimpleCutSampler.")
train_sampler = SimpleCutSampler(
logging.info("Using SingleCutSampler.")
train_sampler = SingleCutSampler(
cuts_train,
max_duration=self.args.max_duration,
shuffle=self.args.shuffle,
@@ -302,7 +300,7 @@ class AishellAsrDataModule:
return train_dl
def valid_dataloaders(self, cuts_valid: CutSet, rank = None, world_size = None) -> DataLoader:
def valid_dataloaders(self, cuts_valid: CutSet) -> DataLoader:
transforms = []
if self.args.concatenate_cuts:
transforms = [
@@ -327,8 +325,6 @@ class AishellAsrDataModule:
cuts_valid,
max_duration=self.args.max_duration,
shuffle=False,
rank=rank,
world_size=world_size,
)
logging.info("About to create dev dataloader")
valid_dl = DataLoader(

View File

@@ -473,10 +473,11 @@
aishell = AishellAsrDataModule(args)
test_cuts = aishell.test_cuts()
test_dl = aishell.test_dataloaders(test_cuts)
test_sets = ["test"]
test_dls = [test_dl]
valid_dl = aishell.valid_dataloaders(aishell.valid_cuts())
#test_sets = ["test"]
#test_dls = [test_dl]
test_sets = ["valid"]
test_dls = [valid_dl]
for test_set, test_dl in zip(test_sets, test_dls):
results_dict = decode_dataset(
dl=test_dl,

View File

@@ -27,7 +27,7 @@
"params": {
"warmup_min_lr": 0,
"warmup_max_lr": 1e-5,
"warmup_num_steps": 1000
"warmup_num_steps": 100
}
},
"gradient_accumulation_steps": 1,

View File

@@ -126,7 +126,7 @@ def get_parser():
parser.add_argument(
"--num-epochs",
type=int,
default=5,
default=10,
help="Number of epochs to train.",
)