diff --git a/egs/aishell/ASR/decode.sh b/egs/aishell/ASR/decode.sh
deleted file mode 100644
index 31fe95ecb..000000000
--- a/egs/aishell/ASR/decode.sh
+++ /dev/null
@@ -1,9 +0,0 @@
-
-#export CUDA_VISIBLE_DEVICES="2,3"
-#pip install -r seamlessm4t/requirements.txt
-#pip install k2==1.24.3.dev20230524+cuda11.8.torch2.0.1 -f https://k2-fsa.github.io/k2/cuda.html
-export PYTHONPATH=$PYTHONPATH:/lustre/fsw/sa/yuekaiz/asr/icefall
-export PYTHONPATH=$PYTHONPATH:/lustre/fsw/sa/yuekaiz/asr/seamless_communication/src
-export TORCH_HOME=/lustre/fsw/sa/yuekaiz/asr/hub
-python3 seamlessm4t/decode.py --epoch 5 --exp-dir seamlessm4t/exp
-python3 seamlessm4t/decode.py --epoch 5 --avg 2 --exp-dir seamlessm4t/exp
diff --git a/egs/aishell/ASR/decode_whisper.sh b/egs/aishell/ASR/decode_whisper.sh
deleted file mode 100644
index 852359b69..000000000
--- a/egs/aishell/ASR/decode_whisper.sh
+++ /dev/null
@@ -1,8 +0,0 @@
-
-#export CUDA_VISIBLE_DEVICES="1"
-#pip install -r whisper/requirements.txt
-#pip install k2==1.24.3.dev20230524+cuda11.8.torch2.0.1 -f https://k2-fsa.github.io/k2/cuda.html
-export PYTHONPATH=$PYTHONPATH:/lustre/fsw/sa/yuekaiz/asr/icefall
-#export PYTHONPATH=$PYTHONPATH:/mnt/samsung-t7/yuekai/asr/icefall/
-
-python3 whisper/decode.py --exp-dir whisper/exp --max-duration 100
diff --git a/egs/aishell/ASR/run.sh b/egs/aishell/ASR/run.sh
deleted file mode 100644
index 7ab85dc84..000000000
--- a/egs/aishell/ASR/run.sh
+++ /dev/null
@@ -1,8 +0,0 @@
-
-#export CUDA_VISIBLE_DEVICES="2,3"
-pip install -r seamlessm4t/requirements.txt
-pip install k2==1.24.3.dev20230524+cuda11.8.torch2.0.1 -f https://k2-fsa.github.io/k2/cuda.html
-export PYTHONPATH=$PYTHONPATH:/lustre/fsw/sa/yuekaiz/asr/icefall
-export PYTHONPATH=$PYTHONPATH:/lustre/fsw/sa/yuekaiz/asr/seamless_communication/src
-export TORCH_HOME=/lustre/fsw/sa/yuekaiz/asr/hub
-torchrun --nproc-per-node 8 seamlessm4t/train.py --use-fp16 1 --max-duration 300 --base-lr 1e-5 --exp-dir seamlessm4t/exp_new_vocab --start-epoch 1
diff --git a/egs/aishell/ASR/run_whisper.sh b/egs/aishell/ASR/run_whisper.sh
deleted file mode 100644
index f97e44af2..000000000
--- a/egs/aishell/ASR/run_whisper.sh
+++ /dev/null
@@ -1,9 +0,0 @@
-
-
-pip install k2==1.24.3.dev20230524+cuda11.8.torch2.0.1 -f https://k2-fsa.github.io/k2/cuda.html
-pip install -r whisper/requirements.txt
-export PYTHONPATH=$PYTHONPATH:/workspace/icefall
-#export PYTHONPATH=$PYTHONPATH:/lustre/fsw/sa/yuekaiz/asr/icefall
-#export PYTHONPATH=$PYTHONPATH:/mnt/samsung-t7/yuekai/asr/icefall
-
-torchrun --nproc-per-node 8 whisper/train.py --use-fp16 1 --max-duration 20 --base-lr 1e-5 --exp-dir whisper/exp_medimum --start-epoch 1
diff --git a/egs/aishell/ASR/seamlessm4t/asr_datamodule.py b/egs/aishell/ASR/seamlessm4t/asr_datamodule.py
deleted file mode 120000
index fa1b8cca3..000000000
--- a/egs/aishell/ASR/seamlessm4t/asr_datamodule.py
+++ /dev/null
@@ -1 +0,0 @@
-../tdnn_lstm_ctc/asr_datamodule.py
\ No newline at end of file
diff --git a/egs/aishell/ASR/seamlessm4t/decode.py b/egs/aishell/ASR/seamlessm4t/decode.py
deleted file mode 100755
index 43e5b9b7b..000000000
--- a/egs/aishell/ASR/seamlessm4t/decode.py
+++ /dev/null
@@ -1,415 +0,0 @@
-#!/usr/bin/env python3
-# Copyright 2021 Xiaomi Corporation (Author: Liyong Guo,
-# Fangjun Kuang,
-# Wei Kang)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import argparse
-import logging
-from collections import defaultdict
-from pathlib import Path
-from typing import Dict, List, Optional, Tuple
-
-import k2
-import torch
-import torch.nn as nn
-from asr_datamodule import AishellAsrDataModule
-#from conformer import Conformer
-
-from icefall.char_graph_compiler import CharCtcTrainingGraphCompiler
-from icefall.checkpoint import average_checkpoints, load_checkpoint, average_checkpoints_with_averaged_model
-from icefall.decode import (
- get_lattice,
- nbest_decoding,
- nbest_oracle,
- one_best_decoding,
- rescore_with_attention_decoder,
-)
-from icefall.env import get_env_info
-from icefall.lexicon import Lexicon
-from icefall.utils import (
- AttributeDict,
- get_texts,
- setup_logger,
- store_transcripts,
- write_error_stats,
-)
-
-from seamless_communication.models.unity import (
- UnitYModel,
- load_unity_model,
- load_unity_text_tokenizer,
-)
-from fairseq2.generation import (
- SequenceGeneratorOptions,
- SequenceToTextGenerator,
-)
-from seamless_communication.models.unity.model import UnitYX2TModel
-
-def get_parser():
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter
- )
-
- parser.add_argument(
- "--epoch",
- type=int,
- default=-1,
- help="It specifies the checkpoint to use for decoding."
- "Note: Epoch counts from 0.",
- )
- parser.add_argument(
- "--avg",
- type=int,
- default=1,
- help="Number of checkpoints to average. Automatically select "
- "consecutive checkpoints before the checkpoint specified by "
- "'--epoch'. ",
- )
-
- parser.add_argument(
- "--method",
- type=str,
- default="beam-search",
- help="""Decoding method.
- Supported values are:
-                     - (0) ctc-decoding. Use CTC decoding. It maps the token IDs to
-                       tokens using the token symbol table directly.
- - (1) 1best. Extract the best path from the decoding lattice as the
- decoding result.
- - (2) nbest. Extract n paths from the decoding lattice; the path
- with the highest score is the decoding result.
- - (3) attention-decoder. Extract n paths from the lattice,
- the path with the highest score is the decoding result.
- - (4) nbest-oracle. Its WER is the lower bound of any n-best
- rescoring method can achieve. Useful for debugging n-best
- rescoring method.
- """,
- )
-
- parser.add_argument(
- "--exp-dir",
- type=str,
- default="seamlessm4t/exp",
- help="The experiment dir",
- )
-
- return parser
-
-
-def get_params() -> AttributeDict:
- params = AttributeDict(
- {
- # parameters for conformer
- "subsampling_factor": 4,
- "feature_dim": 80,
- "nhead": 4,
- "attention_dim": 512,
- "num_encoder_layers": 12,
- "num_decoder_layers": 6,
- "vgg_frontend": False,
- "use_feat_batchnorm": True,
- # parameters for decoder
- "search_beam": 20,
- "output_beam": 7,
- "min_active_states": 30,
- "max_active_states": 10000,
- "use_double_scores": True,
- "env_info": get_env_info(),
- }
- )
- return params
-
-
-def decode_one_batch(
- params: AttributeDict,
- s2t_generator: SequenceToTextGenerator,
- batch: dict,
-) -> Dict[str, List[List[int]]]:
- """Decode one batch and return the result in a dict. The dict has the
- following format:
-
- - key: It indicates the setting used for decoding. For example,
- if decoding method is 1best, the key is the string `no_rescore`.
- If attention rescoring is used, the key is the string
- `ngram_lm_scale_xxx_attention_scale_xxx`, where `xxx` is the
- value of `lm_scale` and `attention_scale`. An example key is
- `ngram_lm_scale_0.7_attention_scale_0.5`
-      - value: It contains the decoding result. `len(value)` equals the
- batch size. `value[i]` is the decoding result for the i-th
- utterance in the given batch.
- Args:
- params:
- It's the return value of :func:`get_params`.
-
- - params.method is "1best", it uses 1best decoding without LM rescoring.
- - params.method is "nbest", it uses nbest decoding without LM rescoring.
- - params.method is "attention-decoder", it uses attention rescoring.
-
-      s2t_generator:
-        The SequenceToTextGenerator used to run beam-search decoding with the
-        speech-to-text model.
-      batch:
-        It is the return value from iterating
-        `lhotse.dataset.K2SpeechRecognitionDataset`. See its documentation
-        for the format of the `batch`.
- Returns:
- Return the decoding result. See above description for the format of
- the returned dict.
- """
- dtype = torch.float16
- device = torch.device("cuda", 3)
-
- feature = batch["inputs"]
- assert feature.ndim == 3
- feature = feature.to(device, dtype=dtype)
- # at entry, feature is (N, T, C)
-
- supervisions = batch["supervisions"]
- feature_len = supervisions["num_frames"]
- feature_len = feature_len.to(device, dtype=dtype)
-
- text_output = s2t_generator.generate_ex(feature, feature_len)
- sentences = text_output.sentences
- hyps = [sentence.bytes().decode("utf-8").split() for sentence in sentences]
- key = "beam-search"
-
- return {key: hyps}
-
-
-def decode_dataset(
- dl: torch.utils.data.DataLoader,
- params: AttributeDict,
- s2t_generator: SequenceToTextGenerator,
-) -> Dict[str, List[Tuple[str, List[str], List[str]]]]:
- """Decode dataset.
-
- Args:
- dl:
- PyTorch's dataloader containing the dataset to decode.
- params:
- It is returned by :func:`get_params`.
-      s2t_generator:
-        The SequenceToTextGenerator used to run beam-search decoding with the
-        speech-to-text model.
- Returns:
- Return a dict, whose key may be "no-rescore" if the decoding method is
- 1best or it may be "ngram_lm_scale_0.7_attention_scale_0.5" if attention
-      rescoring is used. Its value is a list of tuples. Each tuple contains three
-      elements: the cut ID, the reference transcript, and the predicted result.
- """
- results = []
-
- num_cuts = 0
-
- try:
- num_batches = len(dl)
- except TypeError:
- num_batches = "?"
-
- results = defaultdict(list)
- for batch_idx, batch in enumerate(dl):
- texts = batch["supervisions"]["text"]
- cut_ids = [cut.id for cut in batch["supervisions"]["cut"]]
-
- hyps_dict = decode_one_batch(
- params=params,
- s2t_generator=s2t_generator,
- batch=batch,
- )
-
- for lm_scale, hyps in hyps_dict.items():
- this_batch = []
- assert len(hyps) == len(texts)
- for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts):
- ref_words = ref_text.split()
- this_batch.append((cut_id, ref_words, hyp_words))
-
- results[lm_scale].extend(this_batch)
-
- num_cuts += len(batch["supervisions"]["text"])
-
- if batch_idx % 100 == 0:
- batch_str = f"{batch_idx}/{num_batches}"
-
- logging.info(f"batch {batch_str}, cuts processed until now is {num_cuts}")
- return results
-
-
-def save_results(
- params: AttributeDict,
- test_set_name: str,
- results_dict: Dict[str, List[Tuple[str, List[str], List[str]]]],
-):
-
- enable_log = True
- test_set_wers = dict()
- for key, results in results_dict.items():
- recog_path = params.exp_dir / f"recogs-{test_set_name}-{key}-{params.suffix}.txt"
- results = sorted(results)
- store_transcripts(filename=recog_path, texts=results)
- if enable_log:
- logging.info(f"The transcripts are stored in {recog_path}")
-
- # The following prints out WERs, per-word error statistics and aligned
- # ref/hyp pairs.
- errs_filename = params.exp_dir / f"errs-{test_set_name}-{key}-{params.suffix}.txt"
- # we compute CER for aishell dataset.
- results_char = []
- for res in results:
- results_char.append((res[0], list("".join(res[1])), list("".join(res[2]))))
- with open(errs_filename, "w") as f:
- wer = write_error_stats(
- f, f"{test_set_name}-{key}", results_char, enable_log=enable_log
- )
- test_set_wers[key] = wer
-
- if enable_log:
- logging.info("Wrote detailed error stats to {}".format(errs_filename))
-
- test_set_wers = sorted(test_set_wers.items(), key=lambda x: x[1])
- errs_info = params.exp_dir / f"cer-summary-{test_set_name}-{params.suffix}.txt"
- with open(errs_info, "w") as f:
- print("settings\tCER", file=f)
- for key, val in test_set_wers:
- print("{}\t{}".format(key, val), file=f)
-
- s = "\nFor {}, CER of different settings are:\n".format(test_set_name)
- note = "\tbest for {}".format(test_set_name)
- for key, val in test_set_wers:
- s += "{}\t{}{}\n".format(key, val, note)
- note = ""
- logging.info(s)
-
-
-@torch.no_grad()
-def main():
- parser = get_parser()
- AishellAsrDataModule.add_arguments(parser)
- args = parser.parse_args()
- args.exp_dir = Path(args.exp_dir)
-
- params = get_params()
- params.update(vars(args))
- params.suffix = f"epoch-{params.epoch}-avg-{params.avg}"
- setup_logger(f"{params.exp_dir}/log-{params.method}/log-decode-{params.suffix}")
- logging.info("Decoding started")
- logging.info(params)
-
- device = torch.device("cpu")
- if torch.cuda.is_available():
- device = torch.device("cuda", 3)
-
- logging.info(f"device: {device}")
- dtype = torch.float16
-
- model_name_or_card = "seamlessM4T_medium"
- #model_name_or_card = "seamlessM4T_large"
- model = load_unity_model(model_name_or_card, device=device, dtype=dtype)
- del model.t2u_model
- del model.text_encoder
- del model.text_encoder_frontend
- if params.epoch > 0:
- if params.avg > 1:
- start = params.epoch - params.avg
- assert start >= 1, start
- filename_start = f"{params.exp_dir}/epoch-{start}.pt"
- filename_end = f"{params.exp_dir}/epoch-{params.epoch}.pt"
- logging.info(
- f"Calculating the averaged model over epoch range from "
- f"{start} (excluded) to {params.epoch}"
- )
- model.to(device)
- model.load_state_dict(
- average_checkpoints_with_averaged_model(
- filename_start=filename_start,
- filename_end=filename_end,
- device=device,
- )
- )
- else:
- load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
- model.to(device)
- model.eval()
- num_param = sum([p.numel() for p in model.parameters()])
- logging.info(f"Number of model parameters: {num_param}")
-
- text_tokenizer = load_unity_text_tokenizer(model_name_or_card)
-
- text_max_len_a = 1
- text_max_len_b = 200
- target_lang = "cmn"
-
- text_opts = SequenceGeneratorOptions(
- beam_size=5, soft_max_seq_len=(text_max_len_a, text_max_len_b)
- )
-
- s2t_model = UnitYX2TModel(
- encoder_frontend=model.speech_encoder_frontend,
- encoder=model.speech_encoder,
- decoder_frontend=model.text_decoder_frontend,
- decoder=model.text_decoder,
- final_proj=model.final_proj,
- pad_idx=model.pad_idx,
- )
- s2t_generator = SequenceToTextGenerator(
- s2t_model, text_tokenizer, target_lang, text_opts
- )
- # we need cut ids to display recognition results.
- args.return_cuts = True
- aishell = AishellAsrDataModule(args)
- test_cuts = aishell.test_cuts()
- test_dl = aishell.test_dataloaders(test_cuts)
-
- test_sets = ["test"]
- test_dls = [test_dl]
-
- for test_set, test_dl in zip(test_sets, test_dls):
- results_dict = decode_dataset(
- dl=test_dl,
- params=params,
- s2t_generator=s2t_generator,
- )
-
- save_results(params=params, test_set_name=test_set, results_dict=results_dict)
-
- logging.info("Done!")
-
-
-torch.set_num_threads(1)
-torch.set_num_interop_threads(1)
-
-if __name__ == "__main__":
- main()
diff --git a/egs/aishell/ASR/seamlessm4t/decode2.py b/egs/aishell/ASR/seamlessm4t/decode2.py
deleted file mode 100644
index 4607bae07..000000000
--- a/egs/aishell/ASR/seamlessm4t/decode2.py
+++ /dev/null
@@ -1,432 +0,0 @@
-#!/usr/bin/env python3
-# Copyright 2021 Xiaomi Corporation (Author: Liyong Guo,
-# Fangjun Kuang,
-# Wei Kang)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import argparse
-import logging
-from collections import defaultdict
-from pathlib import Path
-from typing import Dict, List, Optional, Tuple
-
-import k2
-import torch
-import torch.nn as nn
-from asr_datamodule import AishellAsrDataModule
-#from conformer import Conformer
-from tokenizer import CharTokenizer
-from icefall.char_graph_compiler import CharCtcTrainingGraphCompiler
-from icefall.checkpoint import average_checkpoints, load_checkpoint, average_checkpoints_with_averaged_model
-from icefall.decode import (
- get_lattice,
- nbest_decoding,
- nbest_oracle,
- one_best_decoding,
- rescore_with_attention_decoder,
-)
-from icefall.env import get_env_info
-from icefall.lexicon import Lexicon
-from icefall.utils import (
- AttributeDict,
- get_texts,
- setup_logger,
- store_transcripts,
- write_error_stats,
-)
-
-from seamless_communication.models.unity import (
- UnitYModel,
- load_unity_model,
- load_unity_text_tokenizer,
-)
-from fairseq2.generation import (
- SequenceGeneratorOptions,
- SequenceToTextGenerator,
-)
-from seamless_communication.models.unity.model import UnitYX2TModel
-from fairseq2.nn.embedding import Embedding
-def get_parser():
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter
- )
-
- parser.add_argument(
- "--epoch",
- type=int,
- default=-1,
- help="It specifies the checkpoint to use for decoding."
- "Note: Epoch counts from 0.",
- )
- parser.add_argument(
- "--avg",
- type=int,
- default=1,
- help="Number of checkpoints to average. Automatically select "
- "consecutive checkpoints before the checkpoint specified by "
- "'--epoch'. ",
- )
-
- parser.add_argument(
- "--method",
- type=str,
- default="beam-search",
- help="""Decoding method.
- Supported values are:
-                     - (0) ctc-decoding. Use CTC decoding. It maps the token IDs to
-                       tokens using the token symbol table directly.
- - (1) 1best. Extract the best path from the decoding lattice as the
- decoding result.
- - (2) nbest. Extract n paths from the decoding lattice; the path
- with the highest score is the decoding result.
- - (3) attention-decoder. Extract n paths from the lattice,
- the path with the highest score is the decoding result.
- - (4) nbest-oracle. Its WER is the lower bound of any n-best
- rescoring method can achieve. Useful for debugging n-best
- rescoring method.
- """,
- )
-
- parser.add_argument(
- "--exp-dir",
- type=str,
- default="seamlessm4t/exp",
- help="The experiment dir",
- )
-
- return parser
-
-
-def get_params() -> AttributeDict:
- params = AttributeDict(
- {
- # parameters for conformer
- "subsampling_factor": 4,
- "feature_dim": 80,
- "nhead": 4,
- "attention_dim": 512,
- "num_encoder_layers": 12,
- "num_decoder_layers": 6,
- "vgg_frontend": False,
- "use_feat_batchnorm": True,
- # parameters for decoder
- "search_beam": 20,
- "output_beam": 7,
- "min_active_states": 30,
- "max_active_states": 10000,
- "use_double_scores": True,
- "env_info": get_env_info(),
- }
- )
- return params
-
-
-def decode_one_batch(
- params: AttributeDict,
- s2t_generator: SequenceToTextGenerator,
- batch: dict,
-) -> Dict[str, List[List[int]]]:
- """Decode one batch and return the result in a dict. The dict has the
- following format:
-
- - key: It indicates the setting used for decoding. For example,
- if decoding method is 1best, the key is the string `no_rescore`.
- If attention rescoring is used, the key is the string
- `ngram_lm_scale_xxx_attention_scale_xxx`, where `xxx` is the
- value of `lm_scale` and `attention_scale`. An example key is
- `ngram_lm_scale_0.7_attention_scale_0.5`
-      - value: It contains the decoding result. `len(value)` equals the
- batch size. `value[i]` is the decoding result for the i-th
- utterance in the given batch.
- Args:
- params:
- It's the return value of :func:`get_params`.
-
- - params.method is "1best", it uses 1best decoding without LM rescoring.
- - params.method is "nbest", it uses nbest decoding without LM rescoring.
- - params.method is "attention-decoder", it uses attention rescoring.
-
-      s2t_generator:
-        The SequenceToTextGenerator used to run beam-search decoding with the
-        speech-to-text model.
-      batch:
-        It is the return value from iterating
-        `lhotse.dataset.K2SpeechRecognitionDataset`. See its documentation
-        for the format of the `batch`.
- Returns:
- Return the decoding result. See above description for the format of
- the returned dict.
- """
- dtype = torch.float16
- device = torch.device("cuda", 3)
-
- feature = batch["inputs"]
- assert feature.ndim == 3
- feature = feature.to(device, dtype=dtype)
- # at entry, feature is (N, T, C)
-
- supervisions = batch["supervisions"]
- feature_len = supervisions["num_frames"]
- feature_len = feature_len.to(device, dtype=dtype)
-
- text_output = s2t_generator.generate_ex(feature, feature_len)
- #sentences = text_output.sentences
- #hyps = [sentence.bytes().decode("utf-8").split() for sentence in sentences]
-
- token_ids = text_output.generator_output.results
- hyps_ids = [sentence[0].seq.cpu().tolist() for sentence in token_ids]
- hyps = [params.tokenizer.decode(hyps_id).split() for hyps_id in hyps_ids]
-
- key = "beam-search"
-
- return {key: hyps}
-
-
-def decode_dataset(
- dl: torch.utils.data.DataLoader,
- params: AttributeDict,
- s2t_generator: SequenceToTextGenerator,
-) -> Dict[str, List[Tuple[str, List[str], List[str]]]]:
- """Decode dataset.
-
- Args:
- dl:
- PyTorch's dataloader containing the dataset to decode.
- params:
- It is returned by :func:`get_params`.
-      s2t_generator:
-        The SequenceToTextGenerator used to run beam-search decoding with the
-        speech-to-text model.
- Returns:
- Return a dict, whose key may be "no-rescore" if the decoding method is
- 1best or it may be "ngram_lm_scale_0.7_attention_scale_0.5" if attention
-      rescoring is used. Its value is a list of tuples. Each tuple contains three
-      elements: the cut ID, the reference transcript, and the predicted result.
- """
- results = []
-
- num_cuts = 0
-
- try:
- num_batches = len(dl)
- except TypeError:
- num_batches = "?"
-
- results = defaultdict(list)
- for batch_idx, batch in enumerate(dl):
- texts = batch["supervisions"]["text"]
- cut_ids = [cut.id for cut in batch["supervisions"]["cut"]]
-
- hyps_dict = decode_one_batch(
- params=params,
- s2t_generator=s2t_generator,
- batch=batch,
- )
-
- for lm_scale, hyps in hyps_dict.items():
- this_batch = []
- assert len(hyps) == len(texts)
- for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts):
- ref_words = ref_text.split()
- this_batch.append((cut_id, ref_words, hyp_words))
-
- results[lm_scale].extend(this_batch)
-
- num_cuts += len(batch["supervisions"]["text"])
-
- if batch_idx % 100 == 0:
- batch_str = f"{batch_idx}/{num_batches}"
-
- logging.info(f"batch {batch_str}, cuts processed until now is {num_cuts}")
- return results
-
-
-def save_results(
- params: AttributeDict,
- test_set_name: str,
- results_dict: Dict[str, List[Tuple[str, List[str], List[str]]]],
-):
-
- enable_log = True
- test_set_wers = dict()
- for key, results in results_dict.items():
- recog_path = params.exp_dir / f"recogs-{test_set_name}-{key}-{params.suffix}.txt"
- results = sorted(results)
- store_transcripts(filename=recog_path, texts=results)
- if enable_log:
- logging.info(f"The transcripts are stored in {recog_path}")
-
- # The following prints out WERs, per-word error statistics and aligned
- # ref/hyp pairs.
- errs_filename = params.exp_dir / f"errs-{test_set_name}-{key}-{params.suffix}.txt"
- # we compute CER for aishell dataset.
- results_char = []
- for res in results:
- results_char.append((res[0], list("".join(res[1])), list("".join(res[2]))))
- with open(errs_filename, "w") as f:
- wer = write_error_stats(
- f, f"{test_set_name}-{key}", results_char, enable_log=enable_log
- )
- test_set_wers[key] = wer
-
- if enable_log:
- logging.info("Wrote detailed error stats to {}".format(errs_filename))
-
- test_set_wers = sorted(test_set_wers.items(), key=lambda x: x[1])
- errs_info = params.exp_dir / f"cer-summary-{test_set_name}-{params.suffix}.txt"
- with open(errs_info, "w") as f:
- print("settings\tCER", file=f)
- for key, val in test_set_wers:
- print("{}\t{}".format(key, val), file=f)
-
- s = "\nFor {}, CER of different settings are:\n".format(test_set_name)
- note = "\tbest for {}".format(test_set_name)
- for key, val in test_set_wers:
- s += "{}\t{}{}\n".format(key, val, note)
- note = ""
- logging.info(s)
-
-
-@torch.no_grad()
-def main():
- parser = get_parser()
- AishellAsrDataModule.add_arguments(parser)
- args = parser.parse_args()
- args.exp_dir = Path(args.exp_dir)
-
- params = get_params()
- params.tokenizer = CharTokenizer('./seamlessm4t/tokens.txt')
- params.update(vars(args))
- params.suffix = f"epoch-{params.epoch}-avg-{params.avg}"
- setup_logger(f"{params.exp_dir}/log-{params.method}/log-decode-{params.suffix}")
- logging.info("Decoding started")
- logging.info(params)
-
- device = torch.device("cpu")
- if torch.cuda.is_available():
- device = torch.device("cuda", 3)
-
- logging.info(f"device: {device}")
- dtype = torch.float16
-
- model_name_or_card = "seamlessM4T_medium"
- #model_name_or_card = "seamlessM4T_large"
- model = load_unity_model(model_name_or_card, device=device, dtype=dtype)
- del model.t2u_model
- del model.text_encoder
- del model.text_encoder_frontend
- model.text_decoder_frontend.embed = nn.Embedding(num_embeddings=params.tokenizer.vocab_size, embedding_dim=1024 ,padding_idx=0)
- #model.text_decoder_frontend.embed = Embedding(num_embeddings=params.tokenizer.vocab_size, embedding_dim=1024 ,pad_idx=0, scaled=True)
- model.final_proj = nn.Linear(1024, params.tokenizer.vocab_size, bias=False)
- #model.final_proj = nn.Linear(1024, params.tokenizer.vocab_size)
- if params.epoch > 0:
- if params.avg > 1:
- start = params.epoch - params.avg
- assert start >= 1, start
- filename_start = f"{params.exp_dir}/epoch-{start}.pt"
- filename_end = f"{params.exp_dir}/epoch-{params.epoch}.pt"
- logging.info(
- f"Calculating the averaged model over epoch range from "
- f"{start} (excluded) to {params.epoch}"
- )
- model.to(device)
- model.load_state_dict(
- average_checkpoints_with_averaged_model(
- filename_start=filename_start,
- filename_end=filename_end,
- device=device,
- )
- )
- else:
- load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
- model.to(device)
- model.eval()
- model.half()
- #for param in model.parameters():
- # if param.dtype == torch.float16:
- # pass
- # else:
- # param.data = param.data.to(torch.float16)
- #print(param)
- num_param = sum([p.numel() for p in model.parameters()])
- logging.info(f"Number of model parameters: {num_param}")
-
- text_tokenizer = load_unity_text_tokenizer(model_name_or_card)
-
- text_max_len_a = 1
- text_max_len_b = 200
- target_lang = "cmn"
-
- text_opts = SequenceGeneratorOptions(
- beam_size=5, soft_max_seq_len=(text_max_len_a, text_max_len_b)
- )
-
- s2t_model = UnitYX2TModel(
- encoder_frontend=model.speech_encoder_frontend,
- encoder=model.speech_encoder,
- decoder_frontend=model.text_decoder_frontend,
- decoder=model.text_decoder,
- final_proj=model.final_proj,
- pad_idx=model.pad_idx,
- )
- s2t_generator = SequenceToTextGenerator(
- s2t_model, text_tokenizer, target_lang, text_opts
- )
- # we need cut ids to display recognition results.
- args.return_cuts = True
- aishell = AishellAsrDataModule(args)
- test_cuts = aishell.test_cuts()
- test_dl = aishell.test_dataloaders(test_cuts)
-
- test_sets = ["test"]
- test_dls = [test_dl]
-
- for test_set, test_dl in zip(test_sets, test_dls):
- results_dict = decode_dataset(
- dl=test_dl,
- params=params,
- s2t_generator=s2t_generator,
- )
-
- save_results(params=params, test_set_name=test_set, results_dict=results_dict)
-
- logging.info("Done!")
-
-
-torch.set_num_threads(1)
-torch.set_num_interop_threads(1)
-
-if __name__ == "__main__":
- main()
diff --git a/egs/aishell/ASR/seamlessm4t/label_smoothing.py b/egs/aishell/ASR/seamlessm4t/label_smoothing.py
deleted file mode 120000
index e9d239fff..000000000
--- a/egs/aishell/ASR/seamlessm4t/label_smoothing.py
+++ /dev/null
@@ -1 +0,0 @@
-../../../librispeech/ASR/conformer_ctc/label_smoothing.py
\ No newline at end of file
diff --git a/egs/aishell/ASR/seamlessm4t/model.py b/egs/aishell/ASR/seamlessm4t/model.py
deleted file mode 100644
index efe18d5ff..000000000
--- a/egs/aishell/ASR/seamlessm4t/model.py
+++ /dev/null
@@ -1,133 +0,0 @@
-import torch
-import torch.nn as nn
-from fairseq2.nn.embedding import Embedding
-from seamless_communication.models.inference import Translator
-from seamless_communication.models.unity import (
- UnitTokenizer,
- UnitYModel,
- load_unity_model,
- load_unity_text_tokenizer,
- load_unity_unit_tokenizer,
-)
-from fairseq2.generation import (
- Seq2SeqGenerator,
- SequenceGeneratorOptions,
- SequenceGeneratorOutput,
- SequenceToTextGenerator,
- SequenceToTextOutput,
-)
-from seamless_communication.models.unity.model import UnitYModel, UnitYX2TModel
-
-import torchaudio
-import torchaudio.compliance.kaldi as ta_kaldi
-audio_file="/mnt/samsung-t7/yuekai/asr/Triton-ASR-Client/datasets/mini_en/wav/1089-134686-0001.wav"
-src_lang="cmn"
-
-audio_file="/mnt/samsung-t7/yuekai/asr/Triton-ASR-Client/datasets/mini_zh/wav/long.wav"
-src_lang="eng"
-target_lang = "cmn"
-
-audio_input = torchaudio.load(audio_file)[0]
-feature = ta_kaldi.fbank(audio_input, num_mel_bins=80)
-# feature shape is (T, F), convert it to (B, T, F), source_seq_lens tracks T
-source_seqs = feature.unsqueeze(0)
-source_seq_lens = torch.tensor([feature.shape[0]])
-
-# Initialize a Translator object with a multitask model, vocoder on the GPU.
-
-
-# translator = Translator("seamlessM4T_medium", vocoder_name_or_card="vocoder_36langs", device=torch.device("cuda:2"), dtype=torch.float16)
-
-# transcribed_text, _, _ = translator.predict(audio_file, "asr", src_lang)
-
-# print(transcribed_text)
-
-
-model_name_or_card = "seamlessM4T_medium"
-device = torch.device("cuda:3")
-
-# cast source_seq_lens, source_seqs to device, dtype to torch.float16
-source_seq_lens = source_seq_lens.to(device=device, dtype=torch.float16)
-source_seqs = source_seqs.to(device=device, dtype=torch.float16)
-
-
-
-dtype = torch.float16
-model = load_unity_model(model_name_or_card, device=device, dtype=dtype)
-model.eval()
-model.text_decoder_frontend.embed = Embedding(num_embeddings=6257, embedding_dim=1024 ,pad_idx=0, scaled=True)
-model.final_proj = nn.Linear(1024, 6257)
-model.half()
-print(model.text_decoder_frontend.embed, model.text_encoder_frontend.embed.weight.dtype, type(model.text_encoder_frontend.embed), type(model.text_encoder_frontend.embed.weight))
-print(model.final_proj, model.final_proj.weight.dtype, type(model.final_proj), type(model.final_proj.weight))
-#input()
-exit(0)
-text_tokenizer = load_unity_text_tokenizer(model_name_or_card)
-#print(text_tokenizer.model.eos_idx, text_tokenizer.model.pad_idx)
-#text_tokenizer_encoder = text_tokenizer.create_encoder(lang=target_lang, mode="target")
-#text_tokenizer_decoder = text_tokenizer.create_decoder()
-# print attributes of text_tokenizer_encoder
-#print(text_tokenizer.vocab_info)
-#print(text_tokenizer_encoder("其中广州深圳甚至出现了多个日光盘"))
-#print(text_tokenizer_decoder(torch.tensor([3,256200,137139,252603,250476,250590,1,84778,148897,249568,249352,249947,249050,250520,254508])))
-
-# store all vocab in a file
-# with open("vocab.txt", "w") as f:
-# for i in range(256206):
-# f.write(f"{i}: " + text_tokenizer_decoder(torch.tensor([i]))[0].bytes().decode("utf-8")+ "\n")
-# f.close()
-# exit(0)
-
-
-
-# def decode(
-# self,
-# seqs: Tensor,
-# seq_lens: Optional[Tensor],
-# encoder_output: Tensor,
-# encoder_padding_mask: Optional[Tensor],
-# state_bag: Optional[IncrementalStateBag] = None,
-# ) -> Tuple[Tensor, Optional[Tensor]]:
-# seqs, padding_mask = self.text_decoder_frontend(seqs, seq_lens, state_bag)
-
-# return self.text_decoder( # type: ignore[no-any-return]
-# seqs, padding_mask, encoder_output, encoder_padding_mask, state_bag
-# )
-
-# def decoding(model, feature):
-# seqs, padding_mask = model.speech_encoder_frontend(seqs, seq_lens)
-# speech_encoder(seqs, padding_mask)
-
-# decoder_output, decoder_padding_mask = self.decode(
-# batch.target_seqs,
-# batch.target_seq_lens,
-# encoder_output,
-# encoder_padding_mask,
-# )
-
-# text_logits = model.final_project(decoder_output, decoder_padding_mask)
-
-text_max_len_a = 1
-text_max_len_b = 200
-
-text_opts = SequenceGeneratorOptions(
- beam_size=5, soft_max_seq_len=(text_max_len_a, text_max_len_b)
-)
-
-s2t_model = UnitYX2TModel(
- encoder_frontend=model.speech_encoder_frontend,
- encoder=model.speech_encoder,
- decoder_frontend=model.text_decoder_frontend,
- decoder=model.text_decoder,
- final_proj=model.final_proj,
- pad_idx=model.pad_idx,
-)
-s2t_generator = SequenceToTextGenerator(
- s2t_model, text_tokenizer, target_lang, text_opts
-)
-
-text_output = s2t_generator.generate_ex(source_seqs, source_seq_lens)
-print(text_output.generator_output.results[0][0].seq.cpu().tolist())
-# sentence = text_output.sentences[0]
-# print(sentence, type(sentence))
-# sentence = sentence.bytes().decode("utf-8")
diff --git a/egs/aishell/ASR/seamlessm4t/optim.py b/egs/aishell/ASR/seamlessm4t/optim.py
deleted file mode 100644
index abfb2092c..000000000
--- a/egs/aishell/ASR/seamlessm4t/optim.py
+++ /dev/null
@@ -1,1173 +0,0 @@
-# Copyright 2022 Xiaomi Corp. (authors: Daniel Povey)
-#
-# See ../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import contextlib
-import logging
-import random
-from collections import defaultdict
-from typing import Dict, List, Optional, Tuple, Union
-
-import torch
-from lhotse.utils import fix_random_seed
-from torch import Tensor
-from torch.optim import Optimizer
-
-
-class BatchedOptimizer(Optimizer):
- """
- This class adds to class Optimizer the capability to optimize parameters in batches:
- it will stack the parameters and their grads for you so the optimizer can work
- on tensors with an extra leading dimension. This is intended for speed with GPUs,
- as it reduces the number of kernels launched in the optimizer.
-
- Args:
-      params:
-        The parameters or parameter groups to optimize, as in torch.optim.Optimizer.
-      defaults:
-        A dict of default hyper-parameter values, as in torch.optim.Optimizer.
-    """
-
- def __init__(self, params, defaults):
- super(BatchedOptimizer, self).__init__(params, defaults)
-
- @contextlib.contextmanager
- def batched_params(self, param_group, group_params_names):
- """
-        This function returns (technically, yields) a list of
-        tuples (p, state, p_names), where
- p is a `fake` parameter that is stacked (over axis 0) from real parameters
- that share the same shape, and its gradient is also stacked;
- `state` is the state corresponding to this batch of parameters
- (it will be physically located in the "state" for one of the real
- parameters, the last one that has any particular shape and dtype).
-
- This function is decorated as a context manager so that it can
- write parameters back to their "real" locations.
-
- The idea is, instead of doing:
-
- for p in group["params"]:
- state = self.state[p]
- ...
-
- you can do:
-
-          with self.batched_params(group["params"], group_params_names) as batches:
- for p, state, p_names in batches:
- ...
-
-
- Args:
-          param_group: a parameter group, which is a list of parameters; should be
-                the `params` list of one of self.param_groups.
- group_params_names: name for each parameter in group,
- which is List[str].
- """
- batches = defaultdict(
- list
- ) # `batches` maps from tuple (dtype_as_str,*shape) to list of nn.Parameter
- batches_names = defaultdict(
- list
-        )  # `batches_names` maps from tuple (dtype_as_str,*shape) to list of str
-
- assert len(param_group) == len(group_params_names)
- for p, named_p in zip(param_group, group_params_names):
- key = (str(p.dtype), *p.shape)
- batches[key].append(p)
- batches_names[key].append(named_p)
-
- batches_names_keys = list(batches_names.keys())
- sorted_idx = sorted(
- range(len(batches_names)), key=lambda i: batches_names_keys[i]
- )
- batches_names = [batches_names[batches_names_keys[idx]] for idx in sorted_idx]
- batches = [batches[batches_names_keys[idx]] for idx in sorted_idx]
-
- stacked_params_dict = dict()
-
- # turn batches into a list, in deterministic order.
- # tuples will contain tuples of (stacked_param, state, stacked_params_names),
- # one for each batch in `batches`.
- tuples = []
-
- for batch, batch_names in zip(batches, batches_names):
- p = batch[0]
- # we arbitrarily store the state in the
- # state corresponding to the 1st parameter in the
- # group. class Optimizer will take care of saving/loading state.
- state = self.state[p]
- p_stacked = torch.stack(batch)
- grad = torch.stack(
- [torch.zeros_like(p) if p.grad is None else p.grad for p in batch]
- )
- p_stacked.grad = grad
-            stacked_params_dict[(str(p.dtype), *p.shape)] = p_stacked  # key of this dtype/shape batch
- tuples.append((p_stacked, state, batch_names))
-
- yield tuples # <-- calling code will do the actual optimization here!
-
- for ((stacked_params, _state, _names), batch) in zip(tuples, batches):
- for i, p in enumerate(batch): # batch is list of Parameter
- p.copy_(stacked_params[i])
-
-
-class ScaledAdam(BatchedOptimizer):
- """
- Implements 'Scaled Adam', a variant of Adam where we scale each parameter's update
- proportional to the norm of that parameter; and also learn the scale of the parameter,
- in log space, subject to upper and lower limits (as if we had factored each parameter as
- param = underlying_param * log_scale.exp())
-
-
- Args:
- params: The parameters or param_groups to optimize (like other Optimizer subclasses)
-        Unlike common optimizers, which accept model.parameters() or groups of parameters,
-        this optimizer can also accept model.named_parameters() or groups of named parameters.
- See comments of function _get_names_of_parameters for its 4 possible cases.
- lr: The learning rate. We will typically use a learning rate schedule that starts
- at 0.03 and decreases over time, i.e. much higher than other common
- optimizers.
- clipping_scale: (e.g. 2.0)
- A scale for gradient-clipping: if specified, the normalized gradients
- over the whole model will be clipped to have 2-norm equal to
- `clipping_scale` times the median 2-norm over the most recent period
- of `clipping_update_period` minibatches. By "normalized gradients",
- we mean after multiplying by the rms parameter value for this tensor
- [for non-scalars]; this is appropriate because our update is scaled
- by this quantity.
- betas: beta1,beta2 are momentum constants for regular momentum, and moving sum-sq grad.
-           Must satisfy 0 < beta1 <= beta2 < 1.
-    scalar_lr_scale: A scaling factor on the learning rate, that we use to update the
-           scale of each parameter tensor and scalar parameters of the model.
-           If each parameter were decomposed
-           as p * p_scale.exp(), where (p**2).mean().sqrt() == 1.0, scalar_lr_scale
-           would be the scaling factor on the learning rate of p_scale.
- eps: A general-purpose epsilon to prevent division by zero
- param_min_rms: Minimum root-mean-square value of parameter tensor, for purposes of
- learning the scale on the parameters (we'll constrain the rms of each non-scalar
- parameter tensor to be >= this value)
- param_max_rms: Maximum root-mean-square value of parameter tensor, for purposes of
- learning the scale on the parameters (we'll constrain the rms of each non-scalar
- parameter tensor to be <= this value)
- scalar_max: Maximum absolute value for scalar parameters (applicable if your
- model has any parameters with numel() == 1).
- size_update_period: The periodicity, in steps, with which we update the size (scale)
- of the parameter tensor. This is provided to save a little time
- in the update.
-    clipping_update_period: if clipping_scale is specified, this is the period,
-           in minibatches, over which gradient norms are tracked to estimate the
-           clipping threshold.
- """
-
- def __init__(
- self,
- params,
- lr=3e-02,
- clipping_scale=None,
- betas=(0.9, 0.98),
- scalar_lr_scale=0.1,
- eps=1.0e-08,
- param_min_rms=1.0e-05,
- param_max_rms=3.0,
- scalar_max=10.0,
- size_update_period=4,
- clipping_update_period=100,
- ):
-
- defaults = dict(
- lr=lr,
- clipping_scale=clipping_scale,
- betas=betas,
- scalar_lr_scale=scalar_lr_scale,
- eps=eps,
- param_min_rms=param_min_rms,
- param_max_rms=param_max_rms,
- scalar_max=scalar_max,
- size_update_period=size_update_period,
- clipping_update_period=clipping_update_period,
- )
-
-        # If params only contains parameters or groups of parameters,
-        # i.e. when parameter names are not given,
-        # this flag will be set to False in function _get_names_of_parameters.
- self.show_dominant_parameters = True
- param_groups, parameters_names = self._get_names_of_parameters(params)
- super(ScaledAdam, self).__init__(param_groups, defaults)
- assert len(self.param_groups) == len(parameters_names)
- self.parameters_names = parameters_names
-
- def _get_names_of_parameters(
- self, params_or_named_params
- ) -> Tuple[List[Dict], List[List[str]]]:
- """
- Args:
- params_or_named_params: according to the way ScaledAdam is initialized in train.py,
- this argument could be one of following 4 cases,
- case 1, a generator of parameter, e.g.:
- optimizer = ScaledAdam(model.parameters(), lr=params.base_lr, clipping_scale=3.0)
-
- case 2, a list of parameter groups with different config, e.g.:
- model_param_groups = [
- {'params': model.encoder.parameters(), 'lr': 0.05},
- {'params': model.decoder.parameters(), 'lr': 0.01},
- {'params': model.joiner.parameters(), 'lr': 0.03},
- ]
- optimizer = ScaledAdam(model_param_groups, lr=params.base_lr, clipping_scale=3.0)
-
- case 3, a generator of named_parameter, e.g.:
- optimizer = ScaledAdam(model.named_parameters(), lr=params.base_lr, clipping_scale=3.0)
-
- case 4, a list of named_parameter groups with different config, e.g.:
- model_named_param_groups = [
- {'named_params': model.encoder.named_parameters(), 'lr': 0.05},
- {'named_params': model.decoder.named_parameters(), 'lr': 0.01},
- {'named_params': model.joiner.named_parameters(), 'lr': 0.03},
- ]
- optimizer = ScaledAdam(model_named_param_groups, lr=params.base_lr, clipping_scale=3.0)
-
- For case 1 and case 2, input params is used to initialize the underlying torch.optimizer.
- For case 3 and case 4, firstly, names and params are extracted from input named_params,
- then, these extracted params are used to initialize the underlying torch.optimizer,
- and these extracted names are mainly used by function
- `_show_gradient_dominating_parameter`
-
- Returns:
- Returns a tuple containing 2 elements:
- - `param_groups` with type List[Dict], each Dict element is a parameter group.
- An example of `param_groups` could be:
- [
- {'params': `one iterable of Parameter`, 'lr': 0.05},
- {'params': `another iterable of Parameter`, 'lr': 0.08},
- {'params': `a third iterable of Parameter`, 'lr': 0.1},
- ]
-              - `param_groups_names` with type List[List[str]],
-                each `List[str]` is for a group['params'] in param_groups,
-                and each `str` is the name of a parameter.
-                A dummy name "foo" is assigned to each parameter
-                if the input params have no names, i.e. case 1 or case 2.
- """
- # variable naming convention in this function:
- # p is short for param.
- # np is short for named_param.
- # p_or_np is short for param_or_named_param.
- # cur is short for current.
- # group is a dict, e.g. {'params': iterable of parameter, 'lr': 0.05, other fields}.
- # groups is a List[group]
-
- iterable_or_groups = list(params_or_named_params)
- if len(iterable_or_groups) == 0:
- raise ValueError("optimizer got an empty parameter list")
-
- # The first value of returned tuple. A list of dicts containing at
- # least 'params' as a key.
- param_groups = []
-
- # The second value of returned tuple,
- # a List[List[str]], each sub-List is for a group.
- param_groups_names = []
-
- if not isinstance(iterable_or_groups[0], dict):
- # case 1 or case 3,
- # the input is an iterable of parameter or named parameter.
- param_iterable_cur_group = []
- param_names_cur_group = []
- for p_or_np in iterable_or_groups:
- if isinstance(p_or_np, tuple):
- # case 3
- name, param = p_or_np
- else:
- # case 1
- assert isinstance(p_or_np, torch.Tensor)
- param = p_or_np
- # Assign a dummy name as a placeholder
- name = "foo"
- self.show_dominant_parameters = False
- param_iterable_cur_group.append(param)
- param_names_cur_group.append(name)
- param_groups.append({"params": param_iterable_cur_group})
- param_groups_names.append(param_names_cur_group)
- else:
- # case 2 or case 4
- # the input is groups of parameter or named parameter.
- for cur_group in iterable_or_groups:
-                if "named_params" in cur_group:
-                    # case 4: a group of named parameters.
-                    name_list = [x[0] for x in cur_group["named_params"]]
-                    p_list = [x[1] for x in cur_group["named_params"]]
-                    del cur_group["named_params"]
-                    cur_group["params"] = p_list
-                else:
-                    # case 2: a group of plain parameters, so assign dummy names.
-                    cur_group["params"] = list(cur_group["params"])
-                    name_list = ["foo"] * len(cur_group["params"])
-                    self.show_dominant_parameters = False
- param_groups.append(cur_group)
- param_groups_names.append(name_list)
-
- return param_groups, param_groups_names
-
- def __setstate__(self, state):
- super(ScaledAdam, self).__setstate__(state)
-
- @torch.no_grad()
- def step(self, closure=None):
- """Performs a single optimization step.
-
- Arguments:
- closure (callable, optional): A closure that reevaluates the model
- and returns the loss.
- """
- loss = None
- if closure is not None:
- with torch.enable_grad():
- loss = closure()
-
- batch = True
-
- for group, group_params_names in zip(self.param_groups, self.parameters_names):
-
- with self.batched_params(group["params"], group_params_names) as batches:
-
- # batches is list of pairs (stacked_param, state). stacked_param is like
- # a regular parameter, and will have a .grad, but the 1st dim corresponds to
- # a stacking dim, it is not a real dim.
-
- if (
- len(batches[0][1]) == 0
- ): # if len(first state) == 0: not yet initialized
- clipping_scale = 1
- else:
- clipping_scale = self._get_clipping_scale(group, batches)
-
- for p, state, _ in batches:
- # Perform optimization step.
- # grad is not going to be None, we handled that when creating the batches.
- grad = p.grad
- if grad.is_sparse:
- raise RuntimeError(
- "ScaledAdam optimizer does not support sparse gradients"
- )
- # State initialization
- if len(state) == 0:
- self._init_state(group, p, state)
-
- self._step_one_batch(group, p, state, clipping_scale)
-
- return loss
-
- def _init_state(self, group: dict, p: Tensor, state: dict):
- """
- Initializes state dict for parameter 'p'. Assumes that dim 0 of tensor p
- is actually the batch dimension, corresponding to batched-together
- parameters of a given shape.
-
-
- Args:
- group: Dict to look up configuration values.
- p: The parameter that we are initializing the state for
- state: Dict from string to whatever state we are initializing
- """
- size_update_period = group["size_update_period"]
-
- state["step"] = 0
-
- kwargs = {"device": p.device, "dtype": p.dtype}
-
- # 'delta' implements conventional momentum. There are
- # several different kinds of update going on, so rather than
- # compute "exp_avg" like in Adam, we store and decay a
- # parameter-change "delta", which combines all forms of
- # update. this is equivalent to how it's done in Adam,
- # except for the first few steps.
- state["delta"] = torch.zeros_like(p, memory_format=torch.preserve_format)
-
- batch_size = p.shape[0]
- numel = p.numel() // batch_size
-
- if numel > 1:
- # "param_rms" just periodically records the scalar root-mean-square value of
- # the parameter tensor.
- # it has a shape like (batch_size, 1, 1, 1, 1)
- param_rms = (p**2).mean(dim=list(range(1, p.ndim)), keepdim=True).sqrt()
- state["param_rms"] = param_rms
-
- state["scale_exp_avg_sq"] = torch.zeros_like(param_rms)
- state["scale_grads"] = torch.zeros(
- size_update_period, *param_rms.shape, **kwargs
- )
-
- # exp_avg_sq is the weighted sum of scaled gradients. as in Adam.
- state["exp_avg_sq"] = torch.zeros_like(p, memory_format=torch.preserve_format)
-
- def _get_clipping_scale(
- self, group: dict, tuples: List[Tuple[Tensor, dict, List[str]]]
- ) -> float:
- """
- Returns a scalar factor <= 1.0 that dictates gradient clipping, i.e. we will scale the gradients
- by this amount before applying the rest of the update.
-
- Args:
- group: the parameter group, an item in self.param_groups
- tuples: a list of tuples of (param, state, param_names)
- where param is a batched set of parameters,
- with a .grad (1st dim is batch dim)
- and state is the state-dict where optimization parameters are kept.
- param_names is a List[str] while each str is name for a parameter
- in batched set of parameters "param".
- """
- assert len(tuples) >= 1
- clipping_scale = group["clipping_scale"]
- (first_p, first_state, _) = tuples[0]
- step = first_state["step"]
- if clipping_scale is None or step == 0:
- # no clipping. return early on step == 0 because the other
- # parameters' state won't have been initialized yet.
- return 1.0
- clipping_update_period = group["clipping_update_period"]
-
- tot_sumsq = torch.tensor(0.0, device=first_p.device)
- for (p, state, param_names) in tuples:
- grad = p.grad
- if grad.is_sparse:
- raise RuntimeError(
- "ScaledAdam optimizer does not support sparse gradients"
- )
- if p.numel() == p.shape[0]: # a batch of scalars
- tot_sumsq += (grad**2).sum() # sum() to change shape [1] to []
- else:
- tot_sumsq += ((grad * state["param_rms"]) ** 2).sum()
-
- tot_norm = tot_sumsq.sqrt()
- if "model_norms" not in first_state:
- first_state["model_norms"] = torch.zeros(
- clipping_update_period, device=p.device
- )
- first_state["model_norms"][step % clipping_update_period] = tot_norm
-
- if step % clipping_update_period == 0:
- # Print some stats.
- # We don't reach here if step == 0 because we would have returned
- # above.
- sorted_norms = first_state["model_norms"].sort()[0].to("cpu")
- quartiles = []
- for n in range(0, 5):
- index = min(
- clipping_update_period - 1, (clipping_update_period // 4) * n
- )
- quartiles.append(sorted_norms[index].item())
-
- median = quartiles[2]
- threshold = clipping_scale * median
- first_state["model_norm_threshold"] = threshold
- percent_clipped = (
- first_state["num_clipped"] * 100.0 / clipping_update_period
- if "num_clipped" in first_state
- else 0.0
- )
- first_state["num_clipped"] = 0
- quartiles = " ".join(["%.3e" % x for x in quartiles])
- logging.info(
- f"Clipping_scale={clipping_scale}, grad-norm quartiles {quartiles}, "
- f"threshold={threshold:.3e}, percent-clipped={percent_clipped:.1f}"
- )
-
- if step < clipping_update_period:
- return 1.0 # We have not yet estimated a norm to clip to.
- else:
- try:
- model_norm_threshold = first_state["model_norm_threshold"]
- except KeyError:
- logging.info(
- "Warning: model_norm_threshold not in state: possibly "
- "you changed config when restarting, adding clipping_scale option?"
- )
- return 1.0
- ans = min(1.0, (model_norm_threshold / (tot_norm + 1.0e-20)).item())
- if ans < 1.0:
- first_state["num_clipped"] += 1
- if ans < 0.1:
- logging.warn(
- f"Scaling gradients by {ans}, model_norm_threshold={model_norm_threshold}"
- )
- if self.show_dominant_parameters:
- assert p.shape[0] == len(param_names)
- self._show_gradient_dominating_parameter(tuples, tot_sumsq)
- return ans
-
- def _show_gradient_dominating_parameter(
- self, tuples: List[Tuple[Tensor, dict, List[str]]], tot_sumsq: Tensor
- ):
- """
- Show information of parameter which dominates tot_sumsq.
-
- Args:
- tuples: a list of tuples of (param, state, param_names)
- where param is a batched set of parameters,
- with a .grad (1st dim is batch dim)
- and state is the state-dict where optimization parameters are kept.
- param_names is a List[str] while each str is name for a parameter
- in batched set of parameters "param".
-          tot_sumsq: sumsq of all parameters. Though it could be calculated
-                from tuples, we pass it in to save some time.
- """
- all_sumsq_orig = {}
- for (p, state, batch_param_names) in tuples:
- # p is a stacked batch parameters.
- batch_grad = p.grad
- if p.numel() == p.shape[0]: # a batch of scalars
- batch_sumsq_orig = batch_grad**2
- # Dummy values used by following `zip` statement.
- batch_rms_orig = torch.ones(p.shape[0])
- else:
- batch_rms_orig = state["param_rms"]
- batch_sumsq_orig = ((batch_grad * batch_rms_orig) ** 2).sum(
- dim=list(range(1, batch_grad.ndim))
- )
-
- for name, sumsq_orig, rms, grad in zip(
- batch_param_names, batch_sumsq_orig, batch_rms_orig, batch_grad
- ):
-
- proportion_orig = sumsq_orig / tot_sumsq
- all_sumsq_orig[name] = (proportion_orig, sumsq_orig, rms, grad)
-
- assert torch.isclose(
- sum([value[0] for value in all_sumsq_orig.values()]).cpu(),
- torch.tensor(1.0),
- )
- sorted_by_proportion = {
- k: v
- for k, v in sorted(
- all_sumsq_orig.items(), key=lambda item: item[1][0], reverse=True
- )
- }
- dominant_param_name = next(iter(sorted_by_proportion))
- (
- dominant_proportion,
- dominant_sumsq,
- dominant_rms,
- dominant_grad,
- ) = sorted_by_proportion[dominant_param_name]
- logging.info(
- f"Parameter dominating tot_sumsq {dominant_param_name}"
- f" with proportion {dominant_proportion:.2f},"
- f" where dominant_sumsq=(grad_sumsq*orig_rms_sq)"
- f"={dominant_sumsq:.3e},"
- f" grad_sumsq={(dominant_grad**2).sum():.3e},"
- f" orig_rms_sq={(dominant_rms**2).item():.3e}"
- )
-
- def _step_one_batch(
- self, group: dict, p: Tensor, state: dict, clipping_scale: float
- ):
- """
- Do the step for one parameter, which is actually going to be a batch of
- `real` parameters, with dim 0 as the batch dim.
- Args:
- group: dict to look up configuration values
- p: parameter to update (actually multiple parameters stacked together
- as a batch)
-          state: state-dict for p, to look up the optimizer state
-          clipping_scale: a scale factor (<= 1.0) to multiply the gradients by
-             before applying the rest of the update, as returned by
-             _get_clipping_scale()
- """
- lr = group["lr"]
- size_update_period = group["size_update_period"]
- beta1 = group["betas"][0]
-
- grad = p.grad
- if clipping_scale != 1.0:
- grad = grad * clipping_scale
- step = state["step"]
- delta = state["delta"]
-
- delta.mul_(beta1)
- batch_size = p.shape[0]
- numel = p.numel() // batch_size
- if numel > 1:
- # Update the size/scale of p, and set param_rms
- scale_grads = state["scale_grads"]
- scale_grads[step % size_update_period] = (p * grad).sum(
- dim=list(range(1, p.ndim)), keepdim=True
- )
- if step % size_update_period == size_update_period - 1:
- param_rms = state["param_rms"] # shape: (batch_size, 1, 1, ..)
- param_rms.copy_(
- (p**2).mean(dim=list(range(1, p.ndim)), keepdim=True).sqrt()
- )
- if step > 0:
- # self._size_update() learns the overall scale on the
- # parameter, by shrinking or expanding it.
- self._size_update(group, scale_grads, p, state)
-
- if numel == 1:
- # For parameters with 1 element we just use regular Adam.
- # Updates delta.
- self._step_scalar(group, p, state)
- else:
- self._step(group, p, state)
-
- state["step"] = step + 1
-
- def _size_update(
- self, group: dict, scale_grads: Tensor, p: Tensor, state: dict
- ) -> None:
- """
- Called only where p.numel() > 1, this updates the scale of the parameter.
- If we imagine: p = underlying_param * scale.exp(), and we are doing
- gradient descent on underlying param and on scale, this function does the update
- on `scale`.
-
- Args:
- group: dict to look up configuration values
- scale_grads: a tensor of shape (size_update_period, batch_size, 1, 1,...) containing
- grads w.r.t. the scales.
- p: The parameter to update
- state: The state-dict of p
- """
-
- param_rms = state["param_rms"]
- beta1, beta2 = group["betas"]
- size_lr = group["lr"] * group["scalar_lr_scale"]
- param_min_rms = group["param_min_rms"]
- param_max_rms = group["param_max_rms"]
- eps = group["eps"]
- step = state["step"]
- batch_size = p.shape[0]
-
- size_update_period = scale_grads.shape[0]
- # correct beta2 for the size update period: we will have
- # faster decay at this level.
- beta2_corr = beta2**size_update_period
-
- scale_exp_avg_sq = state["scale_exp_avg_sq"] # shape: (batch_size, 1, 1, ..)
- scale_exp_avg_sq.mul_(beta2_corr).add_(
- (scale_grads**2).mean(dim=0), # mean over dim `size_update_period`
- alpha=1 - beta2_corr,
- ) # shape is (batch_size, 1, 1, ...)
-
- # The 1st time we reach here is when size_step == 1.
- size_step = (step + 1) // size_update_period
- bias_correction2 = 1 - beta2_corr**size_step
- # we don't bother with bias_correction1; this will help prevent divergence
- # at the start of training.
-
- denom = scale_exp_avg_sq.sqrt() + eps
-
- scale_step = (
- -size_lr * (bias_correction2**0.5) * scale_grads.sum(dim=0) / denom
- )
-
- is_too_small = param_rms < param_min_rms
-
- # when the param gets too small, just don't shrink it any further.
- scale_step.masked_fill_(is_too_small, 0.0)
-
- # and ensure the parameter rms after update never exceeds param_max_rms.
- # We have to look at the trained model for parameters at or around the
- # param_max_rms, because sometimes they can indicate a problem with the
- # topology or settings.
-        scale_step = torch.minimum(
-            scale_step, (param_max_rms - param_rms) / param_rms
-        )
-
- delta = state["delta"]
- # the factor of (1-beta1) relates to momentum.
- delta.add_(p * scale_step, alpha=(1 - beta1))
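
Why `(p * grad).sum()` (accumulated into `scale_grads` by `_step_one_batch`) is the right gradient for the scale: writing p = underlying_param * scale.exp(), the chain rule gives d(loss)/d(scale) = sum_i d(loss)/dp_i * p_i. A minimal standalone check of that identity, with made-up values and not part of the deleted file:

import torch

u = torch.randn(8)                       # the "underlying" parameter
s = torch.zeros((), requires_grad=True)  # the log-scale
p = u * s.exp()
p.retain_grad()
loss = (p ** 3).sum()                    # any differentiable loss will do
loss.backward()
# d(loss)/d(log-scale) equals (p * d(loss)/dp).sum(), the quantity kept in scale_grads.
assert torch.allclose(s.grad, (p.detach() * p.grad).sum())
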
-
- def _step(self, group: dict, p: Tensor, state: dict):
- """
- This function does the core update of self.step(), in the case where the members of
- the batch have more than 1 element.
-
- Args:
- group: A dict which will be used to look up configuration values
-            p: The parameter to be updated (its gradient is read from ``p.grad``)
-            state: The state-dict corresponding to parameter p
-
- This function modifies p.
- """
- grad = p.grad
- lr = group["lr"]
- beta1, beta2 = group["betas"]
- eps = group["eps"]
- param_min_rms = group["param_min_rms"]
- step = state["step"]
-
- exp_avg_sq = state["exp_avg_sq"]
- exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=(1 - beta2))
-
- this_step = state["step"] - (state["zero_step"] if "zero_step" in state else 0)
- bias_correction2 = 1 - beta2 ** (this_step + 1)
- if bias_correction2 < 0.99:
- # note: not in-place.
- exp_avg_sq = exp_avg_sq * (1.0 / bias_correction2)
-
- denom = exp_avg_sq.sqrt()
- denom += eps
- grad = grad / denom
-
- alpha = -lr * (1 - beta1) * state["param_rms"].clamp(min=param_min_rms)
-
- delta = state["delta"]
- delta.add_(grad * alpha)
- p.add_(delta)
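
Read as plain tensor algebra, `_step_one_batch` plus `_step` amount to the update sketched below for a non-scalar parameter. This is a simplified, standalone restatement (bias correction, the periodic size update and gradient clipping are left out, and the whole tensor is treated as a single parameter rather than a batch):

import torch

lr, beta1, beta2, eps, param_min_rms = 0.03, 0.9, 0.98, 1e-8, 1e-5
p = torch.randn(4, 3)
delta = torch.zeros_like(p)       # momentum buffer, cf. state["delta"]
exp_avg_sq = torch.zeros_like(p)  # cf. state["exp_avg_sq"]

for _ in range(3):                # a few fake steps
    g = torch.randn_like(p)       # stand-in for p.grad
    exp_avg_sq = beta2 * exp_avg_sq + (1 - beta2) * g * g
    param_rms = (p ** 2).mean().sqrt().clamp(min=param_min_rms)
    delta = beta1 * delta - (1 - beta1) * lr * param_rms * g / (exp_avg_sq.sqrt() + eps)
    p = p + delta

So the step is Adam-normalized, but its size is tied to the parameter's own RMS rather than to an absolute learning rate, which is what makes the optimizer "scaled".
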
-
- def _step_scalar(self, group: dict, p: Tensor, state: dict):
- """
- A simplified form of the core update for scalar tensors, where we cannot get a good
- estimate of the parameter rms.
- """
- beta1, beta2 = group["betas"]
- scalar_max = group["scalar_max"]
- eps = group["eps"]
- lr = group["lr"] * group["scalar_lr_scale"]
- grad = p.grad
-
- exp_avg_sq = state["exp_avg_sq"] # shape: (batch_size,)
- exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
-
- # bias_correction2 is like in Adam. Don't bother with bias_correction1;
- # slower update at the start will help stability anyway.
- bias_correction2 = 1 - beta2 ** (state["step"] + 1)
- denom = (exp_avg_sq / bias_correction2).sqrt() + eps
-
- delta = state["delta"]
- delta.add_(grad / denom, alpha=-lr * (1 - beta1))
- p.clamp_(min=-scalar_max, max=scalar_max)
- p.add_(delta)
-
-
-class LRScheduler(object):
- """
- Base-class for learning rate schedulers where the learning-rate depends on both the
- batch and the epoch.
- """
-
- def __init__(self, optimizer: Optimizer, verbose: bool = False):
- # Attach optimizer
- if not isinstance(optimizer, Optimizer):
- raise TypeError("{} is not an Optimizer".format(type(optimizer).__name__))
- self.optimizer = optimizer
- self.verbose = verbose
-
- for group in optimizer.param_groups:
- group.setdefault("base_lr", group["lr"])
-
- self.base_lrs = [group["base_lr"] for group in optimizer.param_groups]
-
- self.epoch = 0
- self.batch = 0
-
- def state_dict(self):
- """Returns the state of the scheduler as a :class:`dict`.
-
- It contains an entry for every variable in self.__dict__ which
- is not the optimizer.
- """
- return {
- "base_lrs": self.base_lrs,
- "epoch": self.epoch,
- "batch": self.batch,
- }
-
- def load_state_dict(self, state_dict):
- """Loads the schedulers state.
-
- Args:
- state_dict (dict): scheduler state. Should be an object returned
- from a call to :meth:`state_dict`.
- """
- self.__dict__.update(state_dict)
-
- def get_last_lr(self) -> List[float]:
- """Return last computed learning rate by current scheduler. Will be a list of float."""
- return self._last_lr
-
- def get_lr(self):
- # Compute list of learning rates from self.epoch and self.batch and
- # self.base_lrs; this must be overloaded by the user.
- # e.g. return [some_formula(self.batch, self.epoch, base_lr) for base_lr in self.base_lrs ]
- raise NotImplementedError
-
- def step_batch(self, batch: Optional[int] = None) -> None:
- # Step the batch index, or just set it. If `batch` is specified, it
- # must be the batch index from the start of training, i.e. summed over
- # all epochs.
- # You can call this in any order; if you don't provide 'batch', it should
- # of course be called once per batch.
- if batch is not None:
- self.batch = batch
- else:
- self.batch = self.batch + 1
- self._set_lrs()
-
- def step_epoch(self, epoch: Optional[int] = None):
- # Step the epoch index, or just set it. If you provide the 'epoch' arg,
- # you should call this at the start of the epoch; if you don't provide the 'epoch'
- # arg, you should call it at the end of the epoch.
- if epoch is not None:
- self.epoch = epoch
- else:
- self.epoch = self.epoch + 1
- self._set_lrs()
-
- def _set_lrs(self):
- values = self.get_lr()
- assert len(values) == len(self.optimizer.param_groups)
-
- for i, data in enumerate(zip(self.optimizer.param_groups, values)):
- param_group, lr = data
- param_group["lr"] = lr
- self.print_lr(self.verbose, i, lr)
- self._last_lr = [group["lr"] for group in self.optimizer.param_groups]
-
- def print_lr(self, is_verbose, group, lr):
- """Display the current learning rate."""
- if is_verbose:
- logging.info(
- f"Epoch={self.epoch}, batch={self.batch}: adjusting learning rate"
- f" of group {group} to {lr:.4e}."
- )
-
-
-class Eden(LRScheduler):
- """
- Eden scheduler.
- The basic formula (before warmup) is:
- lr = base_lr * (((batch**2 + lr_batches**2) / lr_batches**2) ** -0.25 *
- (((epoch**2 + lr_epochs**2) / lr_epochs**2) ** -0.25)) * warmup
-    where `warmup` increases linearly from `warmup_start` (default 0.5) to 1 over
-    `warmup_batches` batches and then stays constant at 1.
-
-
-    E.g. a base_lr of 0.04 (passed to the optimizer) is suggested when used with
-    ScaledAdam.
-
- Args:
- optimizer: the optimizer to change the learning rates on
- lr_batches: the number of batches after which we start significantly
- decreasing the learning rate, suggest 5000.
- lr_epochs: the number of epochs after which we start significantly
- decreasing the learning rate, suggest 6 if you plan to do e.g.
-           20 to 40 epochs, but you may need a smaller number if the dataset
-           is huge and you will only do a few epochs.
-        warmup_batches: the number of batches over which `warmup` increases
-           linearly from `warmup_start` to 1 (default 500).
-        warmup_start: the initial value of the warmup factor (default 0.5).
- """
-
- def __init__(
- self,
- optimizer: Optimizer,
- lr_batches: Union[int, float],
- lr_epochs: Union[int, float],
- warmup_batches: Union[int, float] = 500.0,
- warmup_start: float = 0.5,
- verbose: bool = False,
- ):
- super(Eden, self).__init__(optimizer, verbose)
- self.lr_batches = lr_batches
- self.lr_epochs = lr_epochs
- self.warmup_batches = warmup_batches
-
- assert 0.0 <= warmup_start <= 1.0, warmup_start
- self.warmup_start = warmup_start
-
- def get_lr(self):
- factor = (
- (self.batch**2 + self.lr_batches**2) / self.lr_batches**2
- ) ** -0.25 * (
- ((self.epoch**2 + self.lr_epochs**2) / self.lr_epochs**2) ** -0.25
- )
- warmup_factor = (
- 1.0
- if self.batch >= self.warmup_batches
- else self.warmup_start + (1.0 - self.warmup_start) * (self.batch / self.warmup_batches)
- # else 0.5 + 0.5 * (self.batch / self.warmup_batches)
- )
-
- return [x * factor * warmup_factor for x in self.base_lrs]
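
As a quick check on the formula above (a standalone sketch; the helper name eden_factor is made up), the decay factor equals 1 at the very start and drops to 2 ** -0.5, i.e. about 0.707 of base_lr, once batch == lr_batches and epoch == lr_epochs, ignoring warmup:

def eden_factor(batch, epoch, lr_batches=5000.0, lr_epochs=6.0):
    # The decay part of Eden.get_lr(), with the warmup term left out.
    batch_term = ((batch ** 2 + lr_batches ** 2) / lr_batches ** 2) ** -0.25
    epoch_term = ((epoch ** 2 + lr_epochs ** 2) / lr_epochs ** 2) ** -0.25
    return batch_term * epoch_term

assert abs(eden_factor(0, 0) - 1.0) < 1e-9
assert abs(eden_factor(5000, 6) - 2 ** -0.5) < 1e-9
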
-
-
-def _test_eden():
- m = torch.nn.Linear(100, 100)
- optim = ScaledAdam(m.parameters(), lr=0.03)
-
- scheduler = Eden(optim, lr_batches=100, lr_epochs=2, verbose=True)
-
- for epoch in range(10):
- scheduler.step_epoch(epoch) # sets epoch to `epoch`
-
- for step in range(20):
- x = torch.randn(200, 100).detach()
- x.requires_grad = True
- y = m(x)
- dy = torch.randn(200, 100).detach()
- f = (y * dy).sum()
- f.backward()
-
- optim.step()
- scheduler.step_batch()
- optim.zero_grad()
-
- logging.info(f"last lr = {scheduler.get_last_lr()}")
- logging.info(f"state dict = {scheduler.state_dict()}")
-
-
-# This is included mostly as a baseline for ScaledAdam.
-class Eve(Optimizer):
- """
- Implements Eve algorithm. This is a modified version of AdamW with a special
- way of setting the weight-decay / shrinkage-factor, which is designed to make the
- rms of the parameters approach a particular target_rms (default: 0.1). This is
- for use with networks with 'scaled' versions of modules (see scaling.py), which
- will be close to invariant to the absolute scale on the parameter matrix.
-
- The original Adam algorithm was proposed in `Adam: A Method for Stochastic Optimization`_.
- The AdamW variant was proposed in `Decoupled Weight Decay Regularization`_.
- Eve is unpublished so far.
-
- Arguments:
- params (iterable): iterable of parameters to optimize or dicts defining
- parameter groups
- lr (float, optional): learning rate (default: 1e-3)
- betas (Tuple[float, float], optional): coefficients used for computing
-            running averages of gradient and its square (default: (0.9, 0.98))
- eps (float, optional): term added to the denominator to improve
- numerical stability (default: 1e-8)
-        weight_decay (float, optional): weight decay coefficient (default: 1e-3;
-            this value means that the weight would decay significantly after
-            about 3k minibatches). It is not multiplied by the learning rate,
-            and it is only applied while the RMS value of the parameter is
-            greater than target_rms.
-        target_rms (float, optional): target root-mean-square value of the
-            parameters; if a parameter's RMS falls below this, weight decay is
-            no longer applied to it.
-
-
- .. _Adam: A Method for Stochastic Optimization:
- https://arxiv.org/abs/1412.6980
- .. _Decoupled Weight Decay Regularization:
- https://arxiv.org/abs/1711.05101
- .. _On the Convergence of Adam and Beyond:
- https://openreview.net/forum?id=ryQu7f-RZ
- """
-
- def __init__(
- self,
- params,
- lr=1e-3,
- betas=(0.9, 0.98),
- eps=1e-8,
- weight_decay=1e-3,
- target_rms=0.1,
- ):
- if not 0.0 <= lr:
- raise ValueError("Invalid learning rate: {}".format(lr))
- if not 0.0 <= eps:
- raise ValueError("Invalid epsilon value: {}".format(eps))
- if not 0.0 <= betas[0] < 1.0:
- raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
- if not 0.0 <= betas[1] < 1.0:
- raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
- if not 0 <= weight_decay <= 0.1:
- raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
- if not 0 < target_rms <= 10.0:
- raise ValueError("Invalid target_rms value: {}".format(target_rms))
- defaults = dict(
- lr=lr,
- betas=betas,
- eps=eps,
- weight_decay=weight_decay,
- target_rms=target_rms,
- )
- super(Eve, self).__init__(params, defaults)
-
- def __setstate__(self, state):
- super(Eve, self).__setstate__(state)
-
- @torch.no_grad()
- def step(self, closure=None):
- """Performs a single optimization step.
-
- Arguments:
- closure (callable, optional): A closure that reevaluates the model
- and returns the loss.
- """
- loss = None
- if closure is not None:
- with torch.enable_grad():
- loss = closure()
-
- for group in self.param_groups:
- for p in group["params"]:
- if p.grad is None:
- continue
-
- # Perform optimization step
- grad = p.grad
- if grad.is_sparse:
-                    raise RuntimeError("Eve does not support sparse gradients")
-
- state = self.state[p]
-
- # State initialization
- if len(state) == 0:
- state["step"] = 0
- # Exponential moving average of gradient values
- state["exp_avg"] = torch.zeros_like(
- p, memory_format=torch.preserve_format
- )
- # Exponential moving average of squared gradient values
- state["exp_avg_sq"] = torch.zeros_like(
- p, memory_format=torch.preserve_format
- )
-
- exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"]
-
- beta1, beta2 = group["betas"]
-
- state["step"] += 1
- bias_correction1 = 1 - beta1 ** state["step"]
- bias_correction2 = 1 - beta2 ** state["step"]
-
- # Decay the first and second moment running average coefficient
- exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
- exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
- denom = (exp_avg_sq.sqrt() * (bias_correction2**-0.5)).add_(
- group["eps"]
- )
-
- step_size = group["lr"] / bias_correction1
- target_rms = group["target_rms"]
- weight_decay = group["weight_decay"]
-
- if p.numel() > 1:
- # avoid applying this weight-decay on "scaling factors"
- # (which are scalar).
- is_above_target_rms = p.norm() > (target_rms * (p.numel() ** 0.5))
- p.mul_(1 - (weight_decay * is_above_target_rms))
-
- p.addcdiv_(exp_avg, denom, value=-step_size)
-
- if random.random() < 0.0005:
- step = (exp_avg / denom) * step_size
- logging.info(
- f"Delta rms = {(step**2).mean().item()}, shape = {step.shape}"
- )
-
- return loss
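
The `is_above_target_rms` test above is an RMS comparison in disguise: since p.norm() == rms(p) * sqrt(p.numel()), the condition p.norm() > target_rms * p.numel() ** 0.5 holds exactly when rms(p) > target_rms, so weight decay only shrinks tensors whose RMS is still above the target. A standalone check with made-up values:

import torch

target_rms = 0.1
p = torch.full((4, 8), 0.2)  # rms(p) == 0.2
norm_test = bool(p.norm() > target_rms * (p.numel() ** 0.5))
rms_test = bool((p ** 2).mean().sqrt() > target_rms)
assert norm_test and rms_test  # both True here; the two tests always agree
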
-
-
-def _test_scaled_adam(hidden_dim: int):
- import timeit
-
- from scaling import ScaledLinear
-
- E = 100
- B = 4
- T = 2
-    logging.info("in _test_scaled_adam")
- # device = torch.device('cuda')
- device = torch.device("cpu")
- dtype = torch.float32
-
- fix_random_seed(42)
-    # these input_magnitudes and output_magnitudes are to test that the
-    # optimizer under test is able to adjust the scales of different dims
-    # differently.
- input_magnitudes = (1.0 * torch.randn(E, dtype=dtype, device=device)).exp()
- output_magnitudes = (1.0 * torch.randn(E, dtype=dtype, device=device)).exp()
-
- for iter in [1, 0]:
- fix_random_seed(42)
- Linear = torch.nn.Linear if iter == 0 else ScaledLinear
-
- m = torch.nn.Sequential(
- Linear(E, hidden_dim),
- torch.nn.PReLU(),
- Linear(hidden_dim, hidden_dim),
- torch.nn.PReLU(),
- Linear(hidden_dim, E),
- ).to(device)
-
- train_pairs = [
- (
- 100.0
- * torch.randn(B, T, E, device=device, dtype=dtype)
- * input_magnitudes,
- torch.randn(B, T, E, device=device, dtype=dtype) * output_magnitudes,
- )
- for _ in range(20)
- ]
-
- if iter == 0:
- optim = Eve(m.parameters(), lr=0.003)
- elif iter == 1:
- optim = ScaledAdam(m.parameters(), lr=0.03, clipping_scale=2.0)
- scheduler = Eden(optim, lr_batches=200, lr_epochs=5, verbose=False)
-
- start = timeit.default_timer()
- avg_loss = 0.0
- for epoch in range(180):
- scheduler.step_epoch()
- # if epoch == 100 and iter in [2,3]:
- # optim.reset_speedup() # check it doesn't crash.
-
- # if epoch == 130:
- # opts = diagnostics.TensorDiagnosticOptions(
- # 2 ** 22
- # ) # allow 4 megabytes per sub-module
- # diagnostic = diagnostics.attach_diagnostics(m, opts)
-
- for n, (x, y) in enumerate(train_pairs):
- y_out = m(x)
- loss = ((y_out - y) ** 2).mean() * 100.0
- if epoch == 0 and n == 0:
- avg_loss = loss.item()
- else:
- avg_loss = 0.98 * avg_loss + 0.02 * loss.item()
- if n == 0 and epoch % 5 == 0:
- # norm1 = '%.2e' % (m[0].weight**2).mean().sqrt().item()
- # norm1b = '%.2e' % (m[0].bias**2).mean().sqrt().item()
- # norm2 = '%.2e' % (m[2].weight**2).mean().sqrt().item()
- # norm2b = '%.2e' % (m[2].bias**2).mean().sqrt().item()
- # scale1 = '%.2e' % (m[0].weight_scale.exp().item())
- # scale1b = '%.2e' % (m[0].bias_scale.exp().item())
- # scale2 = '%.2e' % (m[2].weight_scale.exp().item())
- # scale2b = '%.2e' % (m[2].bias_scale.exp().item())
- lr = scheduler.get_last_lr()[0]
- logging.info(
- f"Iter {iter}, epoch {epoch}, batch {n}, avg_loss {avg_loss:.4g}, lr={lr:.4e}"
- ) # , norms={norm1,norm1b,norm2,norm2b}") # scales={scale1,scale1b,scale2,scale2b}
- loss.log().backward()
- optim.step()
- optim.zero_grad()
- scheduler.step_batch()
-
- # diagnostic.print_diagnostics()
-
- stop = timeit.default_timer()
- logging.info(f"Iter={iter}, Time taken: {stop - start}")
-
- logging.info(f"last lr = {scheduler.get_last_lr()}")
- # logging.info("state dict = ", scheduler.state_dict())
- # logging.info("optim state_dict = ", optim.state_dict())
- logging.info(f"input_magnitudes = {input_magnitudes}")
- logging.info(f"output_magnitudes = {output_magnitudes}")
-
-
-if __name__ == "__main__":
- torch.set_num_threads(1)
- torch.set_num_interop_threads(1)
- logging.getLogger().setLevel(logging.INFO)
- import subprocess
-
- s = subprocess.check_output(
- "git status -uno .; git log -1; git diff HEAD .", shell=True
- )
- logging.info(s)
- import sys
-
- if len(sys.argv) > 1:
- hidden_dim = int(sys.argv[1])
- else:
- hidden_dim = 200
-
- _test_scaled_adam(hidden_dim)
- _test_eden()
diff --git a/egs/aishell/ASR/seamlessm4t/patch/sequence_generator.py b/egs/aishell/ASR/seamlessm4t/patch/sequence_generator.py
deleted file mode 100644
index d13cc08d0..000000000
--- a/egs/aishell/ASR/seamlessm4t/patch/sequence_generator.py
+++ /dev/null
@@ -1,694 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
-
-import math
-from dataclasses import dataclass
-from typing import List, Optional, Tuple, Union, cast
-
-import torch
-from torch import Tensor
-from torch.nn.functional import log_softmax
-
-from fairseq2.data import Collater, SequenceData, VocabularyInfo
-from fairseq2.generation.beam_search import BeamSearch, StandardBeamSearch
-from fairseq2.generation.logits_processor import LogitsProcessor
-from fairseq2.models.encoder_decoder import Seq2SeqDecoder
-from fairseq2.nn.incremental_state import IncrementalStateBag
-from fairseq2.typing import Device
-
-
-@dataclass
-class SequenceGeneratorOptions:
- """Holds the options to pass to a sequence generator."""
-
- beam_size: int = 5
- """The beam size."""
-
- min_seq_len: int = 1
- """The minimum length of generated sequences (including prefix sequence)."""
-
- soft_max_seq_len: Optional[Tuple[int, int]] = (1, 200)
- """The terms ``a`` and ``b`` of ``ax + b`` where ``x`` is the source
- sequence length. The generated sequences (including prefix sequence) will
- have the maximum length of ``min(hard_max_seq_len, ax + b)``. See also
- ``hard_max_seq_len``."""
-
- hard_max_seq_len: int = 1024
- """The hard limit on maximum length of generated sequences."""
-
- len_penalty: float = 1.0
- """The length penalty, where values less than 1.0 favor shorter, values
- greater than 1.0 favor longer sequences."""
-
- unk_penalty: float = 0.0
- """The unknown symbol penalty, where values less than 0 produce more UNKs,
- values greater than 0 produce fewer UNKs."""
-
- normalize_scores: bool = True
- """If ``True``, normalizes scores by the length of generated sequences."""
-
- search: Optional[BeamSearch] = None
- """The beam search algorithm to use."""
-
- logits_processor: Optional[LogitsProcessor] = None
- """Logits processor called before applying beam search step."""
-
-
-class Seq2SeqGenerator:
- """Represents a sequence-to-sequence generator."""
-
- decoder: Seq2SeqDecoder
- opts: SequenceGeneratorOptions
- beam_size: int
- eos_idx: int
- pad_idx: Optional[int]
- unk_idx: Optional[int]
- prefix_seq: Union[int, Tensor]
- prefix_seq_len: int
- search: BeamSearch
- logits_processor: Optional[LogitsProcessor]
- collater: Collater
-
- def __init__(
- self,
- decoder: Seq2SeqDecoder,
- vocab_info: VocabularyInfo,
- prefix_seq: Optional[Union[int, Tensor]],
- opts: Optional[SequenceGeneratorOptions] = None,
- ) -> None:
- """
- :param decoder:
- The decoder to use.
- :param vocab_info:
- The vocabulary information to use.
- :param prefix_seq:
- The prefix sequence, typically one or more control symbols
- indicating the beginning of a sequence. *Shape:* :math:`()` or
- :math:`(S)`, where :math:`S` is the sequence length. If ``None``,
- the EOS symbol will be used as prefix.
- :param opts:
- The generation options.
- """
- self.decoder = decoder
-
- self.opts = opts or SequenceGeneratorOptions()
-
- # Set beam size.
- if vocab_info.pad_idx is None:
- self.beam_size = min(self.opts.beam_size, vocab_info.size)
- else:
- # -1 since we never select PAD.
- self.beam_size = min(self.opts.beam_size, vocab_info.size - 1)
-
- if vocab_info.eos_idx is None:
- raise ValueError(
- "`vocab_info` must have `eos_idx` set for sequence generation."
- )
-
-        # Set vocab info. Note that this patch hard-codes the special token ids
-        # (eos=1, unk=2, pad=0) instead of taking them from `vocab_info`; the
-        # original assignments are kept below, commented out.
-        self.eos_idx = 1
-        # self.eos_idx = vocab_info.eos_idx
-        self.unk_idx = 2
-        # self.unk_idx = vocab_info.unk_idx
-        self.pad_idx = 0
-        # self.pad_idx = vocab_info.pad_idx
-
-        # Set prefix sequence. This patch always takes the EOS-based prefix
-        # branch below, regardless of `prefix_seq`.
-        if 1:
-            # if prefix_seq is None:
- # If `None`, we follow fairseq's convention, and use EOS as the
- # prefix.
- self.prefix_seq, self.prefix_seq_len = self.eos_idx, 1
- else:
- self.prefix_seq = prefix_seq
-
- if isinstance(prefix_seq, Tensor):
- num_dim = prefix_seq.dim()
-
- if num_dim >= 2:
- raise ValueError(
- f"`prefix_seq` must be a scalar or a 1-dimensional tensor, but is {num_dim}-dimensional instead."
- )
-
- self.prefix_seq_len = 1 if num_dim == 0 else prefix_seq.size(0)
- else:
- self.prefix_seq_len = 1
-
- # Set beam search.
- self.search = self.opts.search or StandardBeamSearch()
- self.logits_processor = self.opts.logits_processor
-
- if vocab_info.pad_idx is None:
- self.collater = Collater()
- else:
- self.collater = Collater(self.pad_idx, pad_to_multiple=2)
-
- @torch.inference_mode()
- def __call__(
- self,
- encoder_output: Tensor,
- encoder_padding_mask: Optional[Tensor],
- source_seq_len: Optional[int] = None,
- ) -> "SequenceGeneratorOutput":
- opts = self.opts
-
- num_searches = encoder_output.size(0)
-
- beam_size = opts.beam_size
-
- max_seq_len = self._determine_max_seq_len(source_seq_len)
-
- device = encoder_output.device
-
- encoder_output, encoder_padding_mask = self._fan_out_encoder_output(
- encoder_output, encoder_padding_mask
- )
-
- # Each element contains the id of the search corresponding to a single
- # source sequence and its hypotheses.
- active_searches: List[Tuple[int, List[Hypothesis]]] = [
- (search_idx, []) for search_idx in range(num_searches)
- ]
-
- # Once a source sequence has `beam_size` hypotheses, its search is moved
- # from `active_searches` to `finished_searches`.
- finished_searches: List[List[Hypothesis]] = [[] for i in range(num_searches)]
-
- num_remaining_searches = num_searches
-
- # Initialize buffers.
- # (N x B, S)
- seqs = torch.zeros(
- (num_searches * beam_size, max_seq_len), device=device, dtype=torch.int64
- )
-
- # (N x B, S)
- scores = torch.zeros(
- (num_searches * beam_size, max_seq_len), device=device, dtype=torch.float32
- )
-
- # A list that indicates beams that should be ignored in the next step.
- ignored_beam_mask = torch.full(
- (num_searches, beam_size), False, device=device, dtype=torch.bool
- )
-
- # An offset array for converting between batch-wide and search-local
- # beam indices.
- # (B)
- search_offsets = torch.arange(num_searches, device=device) * beam_size
-
- # (B) -> (B, 1)
- search_offsets.unsqueeze_(-1)
-
- cand_offsets = torch.arange(2 * beam_size, device=device)
-
- state_bag = IncrementalStateBag()
-
- # At this point, the state is fully initialized, kick off the search.
- self._bootstrap_seqs_and_scores(
- seqs, scores, encoder_output, encoder_padding_mask, state_bag
- )
-
- start_step = self.prefix_seq_len - 1
-
- # Holds the indices of beams (a beam can occur more than once) that we
- # should continue with in the next step.
- beam_indices: Optional[Tensor] = None
-
- # Holds the indices of searches that we should continue with in the next
- # step. If not `None`, it means we finalized one or more searches in the
- # last step.
- search_indices: Optional[Tensor] = None
-
- for step_nr in range(start_step, max_seq_len - 1):
- if beam_indices is not None:
- # If not `None`, it means in the last step we finalized one or
- # more searches. We should ensure that we adjust `beam_indices`
- # before reordering `decoder`'s incremental state.
- if search_indices is not None:
- num_searches = search_indices.numel()
-
- # (N)
- delta = search_indices - torch.arange(num_searches, device=device)
-
- # (N) -> (N, 1)
- delta.unsqueeze_(-1)
-
- # Adjust indices to take into account removed searches.
- beam_indices.view(num_searches, beam_size).add_(delta * beam_size)
-
- state_bag.reorder(beam_indices)
-
- decoder_output, decoder_padding_mask = self.decoder.decode(
- seqs[:, step_nr : step_nr + 1],
- None, # We never generate PAD.
- encoder_output,
- encoder_padding_mask,
- state_bag,
- )
-
- state_bag.increment_step()
-
- model_output = self.decoder.project(decoder_output, decoder_padding_mask)
-
- # lprobs: (1, V)
- # model_output: (N, 1, V)
- lprobs = log_softmax(model_output.logits, dim=-1, dtype=torch.float32)
-
- # Do not allow EOS before reaching the minimum sequence length.
- if step_nr < self.opts.min_seq_len:
- lprobs[:, :, self.eos_idx] = -torch.inf
-
- # fmt: off
- # If we have reached the maximum length, force the last step to be
- # EOS.
- if step_nr == max_seq_len - 2:
- lprobs[:, :, : self.eos_idx] = -torch.inf
- lprobs[:, :, self.eos_idx + 1 :] = -torch.inf
- # fmt: on
-
- # Never allow PAD.
- if self.pad_idx is not None:
- lprobs[:, :, self.pad_idx] = -torch.inf
-
- # Apply UNK penalty.
- if self.unk_idx is not None:
- lprobs[:, :, self.unk_idx] -= self.opts.unk_penalty
-
- # update scores in place using logits_processor
- if self.logits_processor is not None:
- self.logits_processor(
- seqs.view(num_searches, beam_size, -1)[:, :, : step_nr + 1],
- lprobs.view(num_searches, beam_size, -1),
- )
-
- # Determine candidates for the next step.
- # (N, 2 x B)
- cand_scores, cand_indices, cand_beam_indices = self.search.step(
- step_nr,
- step_nr == start_step,
- lprobs.view(num_searches, beam_size, -1),
- scores.view(num_searches, beam_size, -1)[:, :, : step_nr + 1],
- )
-
- # Convert search-local beam indices to batch-wide beam indices.
- # (N, 2 x B) + (N) -> (N, 2 x B)
- global_cand_beam_indices = cand_beam_indices + search_offsets
-
- # Finalize beams that reached the minimum length and that end with
- # an EOS.
- # (N, 2 x B)
- eos_mask = (cand_indices == self.eos_idx) & (cand_scores != -math.inf)
-
- # Do not attempt to finalize beams that should be ignored.
- eos_mask[:, :beam_size][ignored_beam_mask] = False
-
- # Only consider EOS when it's among the top `beam_size` indices. Now
- # we know what beam(s) to finalize.
- # (N, B)
- eos_beam_indices = torch.masked_select(
- global_cand_beam_indices[:, :beam_size], mask=eos_mask[:, :beam_size]
- )
-
- if eos_beam_indices.numel() > 0:
- # Select the scores of the finalized beams.
- # (N, B)
- eos_scores = torch.masked_select(
- cand_scores[:, :beam_size], mask=eos_mask[:, :beam_size]
- )
-
- newly_finished_searches = self._finalize_hypothesis(
- step_nr,
- eos_beam_indices,
- eos_scores,
- seqs,
- scores,
- active_searches,
- finished_searches,
- )
-
- num_remaining_searches -= len(newly_finished_searches)
-
- if num_remaining_searches == 0:
- break
- else:
- newly_finished_searches = None
-
- # Remove finished searches (ones for which `beam_size` finalized
- # beams have been generated) from the batch.
- if newly_finished_searches:
- new_num_searches = num_searches - len(newly_finished_searches)
-
- # Construct `search_indices` which holds indices of searches
- # to keep for the next step.
- search_mask = torch.full((num_searches,), True, device=device)
-
- search_mask[newly_finished_searches] = False
-
- search_indices = torch.arange(num_searches, device=device)
-
- search_indices = search_indices.masked_select(search_mask)
-
- # fmt: off
- # Filter out removed batches from state variables.
- # (N, B) -> (N - F, B)
- ignored_beam_mask = ignored_beam_mask[search_indices]
-
- # (N, 2 x B) -> (N - F, 2 x B)
- cand_scores = cand_scores [search_indices]
- cand_indices = cand_indices [search_indices]
- cand_beam_indices = cand_beam_indices[search_indices]
-
- # (N) -> (N - F)
- search_offsets.resize_(new_num_searches, 1)
-
- # (N - F, 2 x B) + (N - F) -> (N - F, 2 x B)
- global_cand_beam_indices = cand_beam_indices + search_offsets
-
- # (N, 2 x B) -> (N - F, 2 x B)
- eos_mask = eos_mask[search_indices]
-
- # (N x B, S) -> (N, B, S)
- seqs = seqs .view(num_searches, -1)
- scores = scores.view(num_searches, -1)
-
- # (N, B, S + 1) -> ((N - F) x B, S)
- seqs = seqs [search_indices].view(new_num_searches * beam_size, -1)
- scores = scores[search_indices].view(new_num_searches * beam_size, -1)
-
- # (N x B, S_enc, M) -> (N, B, S_enc, M)
- encoder_output = encoder_output.unflatten(0, (num_searches, -1))
-
- # (N, B, S_enc, M) -> ((N - F) x B, S_enc, M)
- encoder_output = encoder_output[search_indices].flatten(0, 1)
-
- if encoder_padding_mask is not None:
- # (N x B, S_enc, M) -> (N, B, S_enc, M)
- padding_mask = encoder_padding_mask.unflatten(0, (num_searches, -1))
-
- # (N, B, S_enc, M) -> ((N - F) x B, S_enc, M)
- encoder_padding_mask = padding_mask[search_indices].flatten(0, 1)
- # fmt: on
-
- num_searches = new_num_searches
- else:
- search_indices = None
-
- eos_mask[:, :beam_size][ignored_beam_mask] = True
-
- # Set `beam_weights` so that values greater than or equal to 2 x
- # `beam_size` indicate finished beams (i.e. end with EOS) and values
- # less than 2 x `beam_size` indicate active beams.
- # (N, 2 x B)
- beam_weights = cand_offsets + (eos_mask * (2 * beam_size))
-
- # Get the top `beam_size` active beams, which are the beams with the
- # smallest weights in `active_beam_weights`.
- # (N, B)
- active_beam_weights, active_beams = torch.topk(
- beam_weights, k=beam_size, dim=1, largest=False
- )
-
- # Update to ignore finalized beams in the next step.
- # (N, B)
- ignored_beam_mask = active_beam_weights >= 2 * beam_size
-
- # We should always have at least one active beam in each search.
- assert (~ignored_beam_mask).any(dim=1).all()
-
- # Denotes which beams are continued for each new hypothesis (a beam
- # can be selected more than once).
- # (N, B)
- beam_indices = torch.gather(
- global_cand_beam_indices, dim=1, index=active_beams
- )
-
- # (N, B) -> (N x B)
- beam_indices = beam_indices.view(-1)
-
- # fmt: off
- # Reorder beams in the `seq` and `score` buffers. The same beam can
- # be selected more than once.
- if step_nr > start_step:
- seqs [:, : step_nr + 1] = torch.index_select(
- seqs [:, : step_nr + 1], dim=0, index=beam_indices
- )
- scores[:, : step_nr + 1] = torch.index_select(
- scores[:, : step_nr + 1], dim=0, index=beam_indices
- )
-
- # (N x B, S) -> (N, B, S)
- seqs_view = seqs .view(num_searches, beam_size, -1)
- scores_view = scores.view(num_searches, beam_size, -1)
-
- seqs_view [:, :, step_nr + 1] = torch.gather(cand_indices, dim=1, index=active_beams)
- scores_view[:, :, step_nr + 1] = torch.gather(cand_scores, dim=1, index=active_beams)
- # fmt: on
-
- # Ensure that hypotheses are sorted by their scores before returning.
- for batch in finished_searches:
- batch.sort(key=lambda b: b.score, reverse=True) # type: ignore[arg-type, return-value]
-
- return SequenceGeneratorOutput(
- results=finished_searches, device=device, collater=self.collater
- )
-
- def _determine_max_seq_len(self, source_seq_len: Optional[int]) -> int:
- opts = self.opts
-
- if source_seq_len is None or opts.soft_max_seq_len is None:
- max_seq_len = opts.hard_max_seq_len
- else:
- at, bt = opts.soft_max_seq_len
-
- max_seq_len = min(opts.hard_max_seq_len, int(at * source_seq_len + bt))
-
- if opts.min_seq_len > max_seq_len:
- raise ValueError(
- f"The effective maximum sequence length must be greater than or equal to `min_seq_len` ({opts.min_seq_len}), but is {max_seq_len} instead. Adjust your soft and hard maximum sequence length limits."
- )
-
- if self.prefix_seq_len >= max_seq_len:
- raise ValueError(
- f"The effective maximum sequence length must be greater than `prefix_seq_len` ({self.prefix_seq_len}), but is {max_seq_len} instead."
- )
-
- return max_seq_len
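
A quick numerical illustration of the rule implemented above, using the defaults soft_max_seq_len=(1, 200) and hard_max_seq_len=1024 (the source lengths below are made up):

a, b = 1, 200    # SequenceGeneratorOptions.soft_max_seq_len
hard_max = 1024  # SequenceGeneratorOptions.hard_max_seq_len
for src_len in (500, 700, 1200):
    print(src_len, min(hard_max, int(a * src_len + b)))  # -> 700, 900 and 1024
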
-
- def _fan_out_encoder_output(
- self, encoder_output: Tensor, encoder_padding_mask: Optional[Tensor]
- ) -> Tuple[Tensor, Optional[Tensor]]:
- num_searches = encoder_output.size(0) # i.e. batch size
-
- # Fan out `encoder_output` to `num_searches` x `beam_size`.
- # (N)
- fan_out_indices = torch.arange(num_searches, device=encoder_output.device)
-
- # (N) -> (N x B)
- fan_out_indices = fan_out_indices.repeat_interleave(self.beam_size)
-
- # (N, S_enc, M) -> (N x B, S_enc, M)
- encoder_output = encoder_output.index_select(dim=0, index=fan_out_indices)
-
- # (N, S_enc, M) -> (N x B, S_enc, M)
- if encoder_padding_mask is not None:
- encoder_padding_mask = encoder_padding_mask.index_select(
- dim=0, index=fan_out_indices
- )
-
- return encoder_output, encoder_padding_mask
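
Shape-wise, the fan-out above is a repeat_interleave over the batch dimension; an isolated example with toy sizes (2 source sequences, beam size 3, not taken from the recipe):

import torch

enc = torch.randn(2, 7, 16)                          # (N, S_enc, M)
beam_size = 3
idx = torch.arange(2).repeat_interleave(beam_size)   # tensor([0, 0, 0, 1, 1, 1])
fanned = enc.index_select(dim=0, index=idx)
assert fanned.shape == (2 * beam_size, 7, 16)        # (N x B, S_enc, M)
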
-
- def _bootstrap_seqs_and_scores(
- self,
- seqs: Tensor,
- scores: Tensor,
- encoder_output: Tensor,
- encoder_padding_mask: Optional[Tensor],
- state_bag: IncrementalStateBag,
- ) -> None:
- assert self.prefix_seq_len > 0
-
- seqs[:, : self.prefix_seq_len] = self.prefix_seq
-
- if self.prefix_seq_len == 1:
- return
-
- assert isinstance(self.prefix_seq, Tensor)
-
- # We have to bootstrap the model with the already fanned-out encoder
- # output to correctly initialize its incremental state. This causes some
- # redundancy as we have to expand `decoder_input` to match the shape of
- # `encoder_output`.
- # (S_pfx) -> (N x B, S_pfx - 1)
- decoder_input = self.prefix_seq[:-1].expand(encoder_output.size(0), -1)
-
- # Bootstrap the model state with prefix sequence.
- decoder_output, decoder_padding_mask = self.decoder.decode(
- decoder_input,
- None,
- encoder_output,
- encoder_padding_mask,
- state_bag,
- )
-
- state_bag.increment_step(self.prefix_seq_len - 1)
-
- model_output = self.decoder.project(decoder_output, decoder_padding_mask)
-
- # lprobs: (S_pfx - 1, V)
- # model_output: (N, S_pfx - 1, V) -> (S_pfx - 1, V)
- lprobs = log_softmax(model_output.logits[0], dim=-1, dtype=torch.float32)
-
- # Fetch scores of next steps.
- # (S_pfx - 1, 1)
- prefix_scores = torch.take_along_dim(
- lprobs, indices=self.prefix_seq[1:].unsqueeze(1), dim=-1
- )
-
- # (S_pfx - 1, 1) -> (S_pfx - 1)
- prefix_scores.squeeze_(1).cumsum_(dim=0)
-
- # First step (e.g. EOS)'s score is always 0.
- scores[:, 1 : self.prefix_seq_len] = prefix_scores
-
- def _finalize_hypothesis(
- self,
- step_nr: int,
- eos_beam_indices: Tensor,
- eos_scores: Tensor,
- seqs: Tensor,
- scores: Tensor,
- active_searches: List[Tuple[int, List["Hypothesis"]]],
- finished_searches: List[List["Hypothesis"]],
- ) -> List[int]:
- # fmt: off
- finalized_seqs = seqs .index_select(dim=0, index=eos_beam_indices)
- finalized_scores = scores.index_select(dim=0, index=eos_beam_indices)
-
- finalized_seqs = finalized_seqs [:, : step_nr + 2]
- finalized_scores = finalized_scores[:, : step_nr + 2]
-
- # Finalize beams.
- finalized_seqs [:, -1] = self.eos_idx
- finalized_scores[:, -1] = eos_scores
- # fmt: on
-
- # Convert from cumulative to per-step scores.
- finalized_scores[:, 1:] = finalized_scores[:, 1:] - finalized_scores[:, :-1]
-
- # Skip first EOS since it is always 0 and skews normalization.
- if self.opts.normalize_scores:
- eos_scores /= (step_nr + 1) ** self.opts.len_penalty
-
- # Holds the ids of finished searches.
- newly_finished: List[int] = []
-
- active_search_indices = (eos_beam_indices // self.beam_size).tolist()
-
- for beam_idx, search_idx in enumerate(active_search_indices):
- search_id, hypotheses = active_searches[search_idx]
-
- # We might have more than one beam finalized in one step that would
- # potentially exceed `beam_size` hypotheses.
- if len(hypotheses) == self.beam_size:
- continue
-
- hypotheses.append(
- Hypothesis(
- seq=finalized_seqs[beam_idx],
- score=eos_scores[beam_idx],
- step_scores=finalized_scores[beam_idx],
- )
- )
-
- if len(hypotheses) == self.beam_size:
- # We have `beam_size` hypotheses for this particular search, so
- # we finish it now.
- newly_finished.append(search_idx)
-
- finished_searches[search_id] = hypotheses
-
- newly_finished.sort()
-
- # Remove finished searches from the active list.
- for idx in reversed(newly_finished):
- del active_searches[idx]
-
- return newly_finished
-
-
-@dataclass
-class SequenceGeneratorOutput:
- """Holds the output of a sequence generator."""
-
- results: List[List["Hypothesis"]]
- """The list of hypothesis generated per search, ordered by score."""
-
- device: Device
- """The device on which generated sequences reside."""
-
- collater: Optional[Collater] = None
- """The collater to use in :meth:`collate`."""
-
- def collate(
- self, hypo_idx: int = 0, skip_batch: bool = False
- ) -> Tuple[Tensor, Optional[Tensor]]:
- """Collate the generated sequences at index ``hypo_idx`` in each search
- result into a single tensor.
-
- :param hypo_idx:
- The index of hypothesis to extract from each search result.
- :param skip_batch:
- If ``True``, if a search result has no hypothesis at index `hypo_idx`,
- it will be skipped instead of raising an error.
-
- :returns:
- - The collated sequences. *Shape:* :math:`(N,S)`, where :math:`N` is
- the number of search results and :math:`S` is the sequence length.
- - An array where each element represents the length of the sequence at
- the same index in the first returned value. *Shape:* :math:`(N)`,
- where :math:`N` is the number of search results.
- """
- if self.collater is None:
- raise RuntimeError("The output has no associated `Collater` instance.")
-
- if not self.results and not skip_batch:
- raise ValueError("The output must contain at least one search result.")
-
- seqs = []
-
- for search_idx, result in enumerate(self.results):
- if hypo_idx >= len(result):
- if not skip_batch:
- raise ValueError(
- f"Each search result must have at least {hypo_idx + 1} hypotheses, but search {search_idx} has only {len(result)}."
- )
-
- continue
-
- seqs.append(result[hypo_idx].seq)
-
- if not seqs:
-            # Return an empty (zero-element) tensor rather than a scalar.
- return torch.empty((0,), device=self.device, dtype=torch.int64), None
-
- output = cast(SequenceData, self.collater(seqs))
-
- return output["seqs"], output["seq_lens"] if output["is_ragged"] else None
-
-
-@dataclass
-class Hypothesis:
- """Represents a hypothesis produced by a sequence generator."""
-
- seq: Tensor
- """The generated sequence."""
-
- score: Tensor
- """The score of the hypothesis."""
-
- step_scores: Tensor
- """The score of each individual sequence step."""
diff --git a/egs/aishell/ASR/seamlessm4t/requirements.txt b/egs/aishell/ASR/seamlessm4t/requirements.txt
deleted file mode 100644
index 7647735da..000000000
--- a/egs/aishell/ASR/seamlessm4t/requirements.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-#k2
-kaldialign
-lhotse
-sentencepiece
-tensorboard
-fairseq2
diff --git a/egs/aishell/ASR/seamlessm4t/tokenizer.py b/egs/aishell/ASR/seamlessm4t/tokenizer.py
deleted file mode 100644
index 1012cd8a0..000000000
--- a/egs/aishell/ASR/seamlessm4t/tokenizer.py
+++ /dev/null
@@ -1,43 +0,0 @@
-
-#import sentencepiece as spm
-
-class CharTokenizer(object):
- def __init__(self, tokenizer_file):
- self.id2symbol = {}
- self.symbol2id = {}
- with open(tokenizer_file, 'r') as f:
- for line in f:
- line = line.strip()
- if line:
- symbol, id = line.split()
- id = int(id)
- self.id2symbol[id] = symbol
- self.symbol2id[symbol] = id
- self.vocab_size = len(self.id2symbol)
-
- def encode(self, text):
-        # If a symbol is not in self.symbol2id, fall back to id 2
-        # (the id reserved for unknown symbols).
- return [self.symbol2id.get(symbol, 2) for symbol in text]
-
- def decode(self, ids):
- return ''.join([self.id2symbol[id] for id in ids])
-
-if __name__ == '__main__':
- # config_file = './config.yaml'
- # config = read_yaml(config_file)
- # converter = TokenIDConverter(config['token_list'])
- # ids = converter.tokens2ids(['', '你', '好', '吗', '', 'microsoft', 'world'])
- # print(ids)
- # print(converter.ids2tokens(ids))
-
-
- tokenizer = CharTokenizer('./tokens.txt')
- ids = tokenizer.encode('今天 天气不错')
- print(ids)
- print(tokenizer.decode(ids+[1]))
- # sp = spm.SentencePieceProcessor()
- # sp.Load('../../../librispeech/ASR/k2fsa-zipformer-chinese-english-mixed/data/lang_char_bpe/bpe.model')
- # texts = ['MICROSOFT WORLD']
- # y = sp.encode(texts, out_type=int)
- # x = sp.decode(y)
- # print(y, x)
\ No newline at end of file
diff --git a/egs/aishell/ASR/seamlessm4t/tokens.txt b/egs/aishell/ASR/seamlessm4t/tokens.txt
deleted file mode 100644
index 980dd6cd2..000000000
--- a/egs/aishell/ASR/seamlessm4t/tokens.txt
+++ /dev/null
@@ -1,6257 +0,0 @@
- 0
- 1
- 2
-▁AS 3
-▁ONE 4
-▁OF 5
-▁A 6
-▁COMP 7
-AN 8
-Y 9
-▁SOME 10
-▁HUNDRED 11
-▁AND 12
-▁FI 13
-F 14
-T 15
-▁ME 16
-N 17
-▁WO 18
-M 19
-EN 20
-▁NOT 21
-▁LA 22
-BO 23
-UR 24
-ING 25
-▁UNDER 26
-▁ANY 27
-▁SU 28
-S 29
-P 30
-IC 31
-ION 32
-▁ 33
-LU 34
-AC 35
-▁THAT 36
-▁THE 37
-▁CO 38
-▁CHA 39
-CE 40
-RY 41
-▁THOUGH 42
-▁SH 43
-IN 44
-B 45
-J 46
-E 47
-C 48
-▁MUCH 49
-▁PO 50
-UL 51
-AR 52
-▁PRE 53
-U 54
-DI 55
-▁AT 56
-▁WHICH 57
-▁POINT 58
-▁I 59
-▁JU 60
-D 61
-GE 62
-▁E 63
-▁HAD 64
-▁CA 65
-▁IN 66
-▁MY 67
-▁DI 68
-RE 69
-CTION 70
-▁WAS 71
-L 72
-MO 73
-ATE 74
-▁THERE 75
-▁BEEN 76
-▁HE 77
-IT 78
-TED 79
-▁T 80
-RI 81
-VI 82
-AL 83
-▁B 84
-LE 85
-ISH 86
-▁OR 87
-▁SO 88
-▁IT 89
-▁RA 90
-TE 91
-▁PRO 92
-G 93
-▁BUT 94
-▁THIS 95
-▁EX 96
-A 97
-ER 98
-ATED 99
-▁EN 100
-TI 101
-LY 102
-OW 103
-▁TO 104
-▁PA 105
-R 106
-IM 107
-ON 108
-▁PUBLIC 109
-▁GU 110
-IL 111
-▁APP 112
-ED 113
-▁UN 114
-ENT 115
-▁MOST 116
-▁DE 117
-TER 118
-▁MAN 119
-▁ON 120
-▁BY 121
-▁NO 122
-▁MEAN 123
-▁NUMBER 124
-O 125
-▁BELIEVE 126
-▁RI 127
-CH 128
-▁SECOND 129
-▁OTHER 130
-▁K 131
-▁WILL 132
-▁DO 133
-▁WELL 134
-▁SEE 135
-UND 136
-▁JO 137
-KE 138
-▁BE 139
-▁BO 140
-OK 141
-▁COULD 142
-▁THEM 143
-▁THESE 144
-▁P 145
-AGE 146
-▁SHA 147
-ME 148
-IOUS 149
-▁IS 150
-▁ONLY 151
-▁WORD 152
-▁RE 153
-K 154
-I 155
-▁SHOULD 156
-▁HAVE 157
-OR 158
-ITY 159
-▁WHAT 160
-LL 161
-▁SP 162
-TA 163
-NE 164
-OUS 165
-▁COM 166
-US 167
-TION 168
-▁HAS 169
-▁S 170
-TH 171
-▁MI 172
-RO 173
-▁GOOD 174
-▁FRIEND 175
-▁LE 176
-W 177
-ES 178
-▁WITH 179
-▁WHO 180
-▁THINK 181
-▁MU 182
-IG 183
-▁MO 184
-▁MIGHT 185
-▁AN 186
-QU 187
-ATION 188
-▁FROM 189
-▁NA 190
-TURE 191
-▁ALL 192
-IES 193
-▁LET 194
-▁TIME 195
-▁WHEN 196
-▁EVEN 197
-▁NEED 198
-▁O 199
-VE 200
-▁WI 201
-FUL 202
-▁NE 203
-LI 204
-AD 205
-ERS 206
-▁BEFORE 207
-▁WORK 208
-▁LIKE 209
-▁HAND 210
-▁THEN 211
-SH 212
-▁WERE 213
-OM 214
-▁KNOW 215
-EL 216
-MA 217
-▁OVER 218
-▁LO 219
-TING 220
-▁LI 221
-OL 222
-▁HA 223
-MP 224
-LA 225
-ABLE 226
-MB 227
-▁WE 228
-▁ST 229
-ET 230
-▁IF 231
-▁WA 232
-▁NEW 233
-IR 234
-▁FA 235
-▁G 236
-IAN 237
-ND 238
-VER 239
-IS 240
-▁DIS 241
-CO 242
-UN 243
-▁SEVEN 244
-▁TH 245
-▁AFTER 246
-WARD 247
-▁RO 248
-▁WOULD 249
-▁SA 250
-▁FOR 251
-▁FE 252
-▁LONG 253
-▁STILL 254
-▁CON 255
-X 256
-▁HERE 257
-▁EVERYTHING 258
-▁SE 259
-ANT 260
-ALLY 261
-RU 262
-▁THAN 263
-ANCE 264
-▁ARE 265
-▁NEXT 266
-▁DOWN 267
-▁CHI 268
-▁MA 269
-CK 270
-▁DR 271
-Z 272
-▁F 273
-ROW 274
-▁GO 275
-▁INTO 276
-INE 277
-▁SIX 278
-▁YEAR 279
-▁HIS 280
-CI 281
-ENCE 282
-▁CLOSE 283
-▁LAST 284
-▁C 285
-▁MORE 286
-▁THOUSAND 287
-LO 288
-▁UP 289
-▁WANT 290
-▁JA 291
-▁GA 292
-▁THROUGH 293
-▁PLACE 294
-▁HU 295
-MAN 296
-UGH 297
-IST 298
-▁HO 299
-▁BETTER 300
-▁THEIR 301
-▁VERY 302
-NG 303
-▁ANOTHER 304
-▁UM 305
-AS 306
-▁TE 307
-V 308
-▁HER 309
-▁HIGH 310
-QUI 311
-▁BECAUSE 312
-▁SHOW 313
-▁WHERE 314
-▁DAY 315
-▁BRO 316
-▁OLD 317
-MENT 318
-▁HARD 319
-▁THOSE 320
-CU 321
-AT 322
-▁INTEREST 323
-▁EVERY 324
-HE 325
-▁GIVE 326
-▁FACT 327
-▁FOUND 328
-▁HEAR 329
-▁NEVER 330
-▁CAN 331
-▁COME 332
-▁SORT 333
-ITION 334
-EST 335
-IGHT 336
-SHIP 337
-SIDE 338
-▁GREAT 339
-▁MAR 340
-▁GET 341
-▁OUT 342
-▁STA 343
-OP 344
-AIN 345
-▁PI 346
-RA 347
-ID 348
-▁DA 349
-▁LOOK 350
-END 351
-▁BA 352
-VO 353
-H 354
-▁LITTLE 355
-▁THEY 356
-▁ALWAYS 357
-CA 358
-▁HOUSE 359
-TIC 360
-IVE 361
-▁RU 362
-HO 363
-▁RIGHT 364
-ICAL 365
-HA 366
-▁AGAIN 367
-▁EXP 368
-IBLE 369
-▁CERTAIN 370
-▁SHE 371
-▁PRI 372
-▁PLAY 373
-▁VA 374
-LIC 375
-▁TWO 376
-▁THREE 377
-▁DOES 378
-▁YOU 379
-▁TALK 380
-▁ABOUT 381
-UT 382
-▁WITHOUT 383
-▁PEOPLE 384
-FF 385
-▁DON 386
-▁HAPPEN 387
-▁PERSON 388
-▁MADE 389
-▁PART 390
-▁HOW 391
-▁WHY 392
-▁MAKE 393
-▁HIM 394
-▁STATE 395
-▁GRA 396
-▁TRA 397
-FOR 398
-▁LIFE 399
-▁PER 400
-AM 401
-▁DU 402
-PORT 403
-IA 404
-▁BUSINESS 405
-▁UNDERSTAND 406
-▁PLAN 407
-▁KEEP 408
-▁GOT 409
-▁MONEY 410
-▁OWN 411
-▁MANY 412
-ONE 413
-▁BU 414
-▁REALLY 415
-▁COURSE 416
-▁THINGS 417
-▁SAID 418
-▁US 419
-▁WAY 420
-▁TAKE 421
-▁WORLD 422
-▁THING 423
-▁SAY 424
-▁TA 425
-▁SOMETHING 426
-▁LEARN 427
-DE 428
-▁QUESTION 429
-DAY 430
-▁NOW 431
-▁BACK 432
-▁YOUR 433
-ACH 434
-▁SPEAK 435
-▁TURN 436
-ATIVE 437
-▁OUR 438
-▁JE 439
-▁VI 440
-▁HI 441
-▁OP 442
-▁WEEK 443
-▁IDEA 444
-▁AWAY 445
-▁COUNT 446
-NESS 447
-▁REASON 448
-PH 449
-▁TWENTY 450
-▁QUITE 451
-▁CHANGE 452
-▁LOVE 453
-DUC 454
-▁SAME 455
-▁ENOUGH 456
-▁YES 457
-▁FEEL 458
-▁FIRST 459
-▁WHILE 460
-IZE 461
-▁ANYTHING 462
-▁DID 463
-▁KIND 464
-▁POWER 465
-▁JUST 466
-IF 467
-▁PH 468
-▁ANSWER 469
-▁FIND 470
-▁THANK 471
-▁BUILD 472
-▁GOING 473
-▁CAME 474
-▁TOGETHER 475
-▁IMPORTANT 476
-▁HELP 477
-▁FOUR 478
-▁DIFFERENT 479
-▁AROUND 480
-▁AMERICA 481
-▁ALSO 482
-▁NINE 483
-▁AH 484
-▁LOT 485
-▁BETWEEN 486
-▁START 487
-▁SCHOOL 488
-▁SYSTEM 489
-▁MAYBE 490
-▁ACTUALLY 491
-▁PROBLEM 492
-Q 493
-▁MR 494
-▁YEAH 495
-▁OKAY 496
-以 497
-后 498
-你 499
-是 500
-男 501
-孩 502
-子 503
-曾 504
-丽 505
-婷 506
-妈 507
-很 508
-想 509
-兰 510
-州 511
-哪 512
-有 513
-买 514
-路 515
-虎 516
-汽 517
-车 518
-的 519
-我 520
-家 521
-狗 522
-叫 523
-什 524
-么 525
-名 526
-字 527
-现 528
-在 529
-网 530
-络 531
-怎 532
-样 533
-鞍 534
-山 535
-到 536
-郑 537
-大 538
-巴 539
-上 540
-没 541
-厕 542
-所 543
-英 544
-文 545
-给 546
-一 547
-个 548
-惊 549
-喜 550
-看 551
-日 552
-程 553
-表 554
-形 555
-容 556
-人 557
-强 558
-势 559
-面 560
-前 561
-不 562
-敢 563
-吭 564
-声 565
-词 566
-儿 567
-些 568
-办 569
-法 570
-说 571
-话 572
-唱 573
-首 574
-老 575
-与 576
-海 577
-婆 578
-笨 579
-蛋 580
-为 581
-欢 582
-女 583
-呢 584
-杭 585
-里 586
-可 587
-打 588
-炮 589
-朋 590
-友 591
-啊 592
-对 593
-练 594
-语 595
-听 596
-力 597
-帮 598
-助 599
-歌 600
-请 601
-推 602
-荐 603
-几 604
-谢 605
-明 606
-天 607
-早 608
-七 609
-点 610
-起 611
-床 612
-时 613
-候 614
-睡 615
-觉 616
-会 617
-了 618
-吗 619
-查 620
-本 621
-地 622
-气 623
-公 624
-快 625
-吧 626
-注 627
-意 628
-安 629
-全 630
-要 631
-错 632
-过 633
-四 634
-川 635
-再 636
-讲 637
-笑 638
-好 639
-午 640
-连 641
-烟 642
-台 643
-轮 644
-船 645
-史 646
-记 647
-商 648
-君 649
-列 650
-传 651
-孙 652
-鞅 653
-乃 654
-遂 655
-西 656
-入 657
-秦 658
-翻 659
-译 660
-做 661
-下 662
-提 663
-醒 664
-发 665
-送 666
-排 667
-班 668
-邮 669
-件 670
-刘 671
-俊 672
-峰 673
-电 674
-播 675
-放 676
-曲 677
-最 678
-炫 679
-民 680
-族 681
-风 682
-还 683
-年 684
-中 685
-泰 686
-拉 687
-石 688
-光 689
-剑 690
-任 691
-务 692
-玛 693
-雅 694
-预 695
-言 696
-真 697
-视 698
-频 699
-爱 700
-范 701
-冰 702
-出 703
-演 704
-剧 705
-六 706
-三 707
-十 708
-分 709
-退 710
-当 711
-模 712
-式 713
-附 714
-近 715
-洗 716
-浴 717
-方 718
-交 719
-和 720
-玩 721
-见 722
-工 723
-作 724
-干 725
-就 726
-能 727
-疖 728
-吃 729
-饭 730
-或 731
-者 732
-其 733
-他 734
-东 735
-国 736
-移 737
-动 738
-通 739
-信 740
-限 741
-司 742
-介 743
-绍 744
-讯 745
-录 746
-载 747
-装 748
-跟 749
-位 750
-置 751
-离 752
-莞 753
-长 754
-站 755
-多 756
-远 757
-福 758
-建 759
-高 760
-速 761
-服 762
-少 763
-把 764
-这 765
-定 766
-成 767
-彩 768
-铃 769
-手 770
-机 771
-别 772
-忘 773
-盒 774
-拿 775
-回 776
-校 777
-区 778
-万 779
-口 780
-坐 781
-冷 782
-漠 783
-醉 784
-红 785
-颜 786
-来 787
-猪 788
-张 789
-波 790
-炎 791
-亚 792
-纶 793
-媳 794
-妇 795
-马 796
-志 797
-华 798
-短 799
-清 800
-图 801
-片 802
-生 803
-五 804
-钟 805
-开 806
-启 807
-蓝 808
-牙 809
-锂 810
-池 811
-初 812
-次 813
-充 814
-让 815
-今 816
-号 817
-顷 818
-等 819
-于 820
-平 821
-也 822
-呀 823
-聊 824
-问 825
-主 826
-结 827
-婚 828
-恭 829
-情 830
-流 831
-返 832
-洒 833
-热 834
-泪 835
-诗 836
-那 837
-去 838
-眠 839
-药 840
-功 841
-使 842
-用 843
-象 844
-间 845
-估 846
-计 847
-厚 848
-德 849
-物 850
-思 851
-搞 852
-们 853
-只 854
-知 855
-道 856
-奥 857
-特 858
-曼 859
-越 860
-野 861
-比 862
-较 863
-重 864
-新 865
-陈 866
-奕 867
-迅 868
-泉 869
-湾 870
-票 871
-呵 872
-活 873
-经 874
-历 875
-摇 876
-痴 877
-此 878
-刻 879
-呈 880
-报 881
-脑 882
-总 883
-着 884
-她 885
-阿 886
-斯 887
-顿 888
-丁 889
-影 890
-终 891
-幻 892
-雨 893
-否 894
-带 895
-伞 896
-休 897
-息 898
-值 899
-百 900
-每 901
-月 902
-找 903
-扎 904
-屯 905
-线 906
-仙 907
-奇 908
-侠 909
-业 910
-写 911
-完 912
-姐 913
-稍 914
-烤 915
-鹅 916
-肠 917
-团 918
-狩 919
-猎 920
-美 921
-忍 922
-向 923
-冲 924
-常 925
-熟 926
-度 927
-折 928
-扣 929
-二 930
-乘 931
-教 932
-实 933
-认 934
-证 935
-考 936
-试 937
-答 938
-案 939
-费 940
-脸 941
-自 942
-恋 943
-广 944
-府 945
-待 946
-赶 947
-集 948
-告 949
-诉 950
-太 951
-懂 952
-坏 953
-泡 954
-妞 955
-伊 956
-更 957
-健 958
-康 959
-检 960
-乙 961
-肝 962
-厦 963
-门 964
-急 965
-控 966
-毒 967
-产 968
-头 969
-脚 970
-轻 971
-感 972
-空 973
-订 974
-从 975
-碑 976
-店 977
-北 978
-京 979
-火 980
-鸡 981
-白 982
-态 983
-啥 984
-幺 985
-哈 986
-尔 987
-滨 988
-至 989
-硬 990
-座 991
-换 992
-姿 993
-亲 994
-级 995
-紫 996
-秋 997
-叶 998
-刀 999
-爆 1000
-率 1001
-馨 1002
-予 1003
-帅 1004
-汇 1005
-询 1006
-小 1007
-唉 1008
-性 1009
-挺 1010
-厉 1011
-害 1012
-八 1013
-米 1014
-跑 1015
-步 1016
-达 1017
-标 1018
-规 1019
-准 1020
-码 1021
-音 1022
-拨 1023
-器 1024
-油 1025
-价 1026
-卡 1027
-尿 1028
-防 1029
-杀 1030
-死 1031
-倍 1032
-晋 1033
-映 1034
-部 1035
-韩 1036
-己 1037
-右 1038
-键 1039
-编 1040
-辑 1041
-您 1042
-房 1043
-租 1044
-具 1045
-体 1046
-牌 1047
-瑞 1048
-士 1049
-浪 1050
-琴 1051
-简 1052
-永 1053
-拥 1054
-灿 1055
-烂 1056
-朝 1057
-阳 1058
-解 1059
-赢 1060
-事 1061
-滚 1062
-周 1063
-吕 1064
-忠 1065
-页 1066
-心 1067
-份 1068
-被 1069
-龙 1070
-肖 1071
-半 1072
-喊 1073
-改 1074
-堵 1075
-饿 1076
-瞌 1077
-叔 1078
-戏 1079
-兴 1080
-化 1081
-备 1082
-萍 1083
-乡 1084
-钓 1085
-鱼 1086
-岛 1087
-消 1088
-忻 1089
-襄 1090
-县 1091
-况 1092
-得 1093
-王 1094
-属 1095
-假 1096
-期 1097
-像 1098
-脾 1099
-零 1100
-九 1101
-关 1102
-晚 1103
-陪 1104
-危 1105
-苦 1106
-难 1107
-例 1108
-顺 1109
-序 1110
-盖 1111
-茨 1112
-邦 1113
-故 1114
-警 1115
-戒 1116
-搜 1117
-索 1118
-黄 1119
-照 1120
-底 1121
-识 1122
-武 1123
-汉 1124
-第 1125
-博 1126
-基 1127
-尼 1128
-删 1129
-除 1130
-沃 1131
-狐 1132
-场 1133
-卖 1134
-便 1135
-宜 1136
-营 1137
-厅 1138
-资 1139
-料 1140
-哥 1141
-句 1142
-员 1143
-随 1144
-导 1145
-航 1146
-学 1147
-星 1148
-脱 1149
-毛 1150
-膏 1151
-种 1152
-乐 1153
-贵 1154
-妃 1155
-酒 1156
-内 1157
-条 1158
-联 1159
-系 1160
-希 1161
-望 1162
-非 1163
-苹 1164
-果 1165
-银 1166
-登 1167
-户 1168
-密 1169
-师 1170
-沟 1171
-莫 1172
-才 1173
-喝 1174
-累 1175
-彭 1176
-裘 1177
-莹 1178
-珠 1179
-徐 1180
-管 1181
-爸 1182
-敏 1183
-腰 1184
-闪 1185
-舟 1186
-凯 1187
-肯 1188
-水 1189
-济 1190
-闻 1191
-约 1192
-伦 1193
-乱 1194
-舞 1195
-春 1196
-叮 1197
-咚 1198
-昨 1199
-又 1200
-宵 1201
-省 1202
-途 1203
-无 1204
-院 1205
-吉 1206
-农 1207
-股 1208
-胶 1209
-布 1210
-谁 1211
-免 1212
-疫 1213
-逗 1214
-闹 1215
-闭 1216
-青 1217
-景 1218
-花 1219
-园 1220
-富 1221
-桥 1222
-哭 1223
-节 1224
-树 1225
-茂 1226
-盛 1227
-边 1228
-余 1229
-姚 1230
-走 1231
-原 1232
-行 1233
-都 1234
-旅 1235
-馆 1236
-宾 1237
-根 1238
-修 1239
-理 1240
-厂 1241
-板 1242
-诊 1243
-专 1244
-复 1245
-鹰 1246
-队 1247
-桌 1248
-财 1249
-港 1250
-色 1251
-诚 1252
-勿 1253
-扰 1254
-持 1255
-孟 1256
-古 1257
-医 1258
-研 1259
-究 1260
-取 1261
-卫 1262
-目 1263
-铁 1264
-麻 1265
-将 1266
-浏 1267
-览 1268
-湿 1269
-朱 1270
-沱 1271
-合 1272
-江 1273
-何 1274
-祈 1275
-祷 1276
-义 1277
-酷 1278
-派 1279
-如 1280
-咩 1281
-正 1282
-算 1283
-楼 1284
-距 1285
-震 1286
-借 1287
-政 1288
-策 1289
-温 1290
-宝 1291
-沙 1292
-角 1293
-肚 1294
-疼 1295
-林 1296
-宿 1297
-舍 1298
-阴 1299
-晴 1300
-圆 1301
-缺 1302
-微 1303
-辽 1304
-刚 1305
-牛 1306
-偷 1307
-吓 1308
-跳 1309
-城 1310
-吴 1311
-称 1312
-呼 1313
-爷 1314
-埋 1315
-怨 1316
-缠 1317
-柴 1318
-钱 1319
-极 1320
-先 1321
-辞 1322
-职 1323
-哦 1324
-啦 1325
-售 1326
-保 1327
-黑 1328
-客 1329
-立 1330
-足 1331
-之 1332
-鬼 1333
-留 1334
-辟 1335
-邪 1336
-谱 1337
-减 1338
-肥 1339
-皇 1340
-萨 1341
-舅 1342
-幽 1343
-默 1344
-细 1345
-胞 1346
-溪 1347
-懒 1348
-书 1349
-杨 1350
-慧 1351
-芝 1352
-屁 1353
-画 1354
-晨 1355
-宁 1356
-侦 1357
-探 1358
-柯 1359
-南 1360
-涛 1361
-应 1362
-该 1363
-弑 1364
-神 1365
-魔 1366
-创 1367
-世 1368
-游 1369
-犯 1370
-已 1371
-泽 1372
-村 1373
-变 1374
-奏 1375
-杰 1376
-偶 1377
-命 1378
-乔 1379
-恩 1380
-并 1381
-阅 1382
-读 1383
-左 1384
-般 1385
-低 1386
-调 1387
-阔 1388
-烦 1389
-球 1390
-峡 1391
-界 1392
-霆 1393
-锋 1394
-柏 1395
-需 1396
-艺 1397
-术 1398
-弄 1399
-章 1400
-悲 1401
-咒 1402
-版 1403
-统 1404
-羊 1405
-肉 1406
-斤 1407
-嘿 1408
-郁 1409
-闷 1410
-进 1411
-姨 1412
-庆 1413
-威 1414
-两 1415
-岸 1416
-普 1417
-琪 1418
-玫 1419
-瑰 1420
-香 1421
-碟 1422
-眼 1423
-收 1424
-湖 1425
-禽 1426
-涮 1427
-汪 1428
-盘 1429
-夹 1430
-存 1431
-枕 1432
-指 1433
-针 1434
-仓 1435
-库 1436
-餐 1437
-块 1438
-咪 1439
-毕 1440
-薪 1441
-赛 1442
-纪 1443
-末 1444
-典 1445
-娱 1446
-傻 1447
-嘲 1448
-彪 1449
-升 1450
-润 1451
-核 1452
-遍 1453
-接 1454
-元 1455
-鼓 1456
-屿 1457
-爹 1458
-胡 1459
-雪 1460
-招 1461
-喂 1462
-齿 1463
-侣 1464
-土 1465
-豆 1466
-铿 1467
-锵 1468
-聪 1469
-但 1470
-飞 1471
-鹤 1472
-壁 1473
-摞 1474
-久 1475
-冬 1476
-骏 1477
-然 1478
-讨 1479
-论 1480
-腾 1481
-易 1482
-筋 1483
-转 1484
-弯 1485
-耳 1486
-齐 1487
-阶 1488
-革 1489
-代 1490
-许 1491
-圣 1492
-诞 1493
-吸 1494
-血 1495
-燕 1496
-松 1497
-鼠 1498
-确 1499
-凤 1500
-凰 1501
-由 1502
-翔 1503
-段 1504
-超 1505
-精 1506
-支 1507
-扶 1508
-室 1509
-包 1510
-菜 1511
-田 1512
-骂 1513
-洋 1514
-舒 1515
-衡 1516
-款 1517
-嘴 1518
-菲 1519
-嘛 1520
-嗨 1521
-鸟 1522
-玉 1523
-数 1524
-贝 1525
-郝 1526
-漫 1527
-诺 1528
-衣 1529
-嫖 1530
-娼 1531
-穿 1532
-骨 1533
-拆 1534
-伤 1535
-获 1536
-奖 1537
-稿 1538
-锅 1539
-购 1540
-治 1541
-痘 1542
-遗 1543
-疤 1544
-痕 1545
-饱 1546
-寻 1547
-瓷 1548
-妹 1549
-徽 1550
-参 1551
-格 1552
-题 1553
-凭 1554
-住 1555
-剖 1556
-腹 1557
-祝 1558
-贺 1559
-加 1560
-跪 1561
-潞 1562
-云 1563
-端 1564
-木 1565
-烁 1566
-朵 1567
-赵 1568
-潘 1569
-津 1570
-滋 1571
-燥 1572
-利 1573
-负 1574
-昆 1575
-因 1576
-森 1577
-及 1578
-病 1579
-固 1580
-市 1581
-烧 1582
-番 1583
-茄 1584
-炒 1585
-而 1586
-凉 1587
-冒 1588
-量 1589
-夸 1590
-尾 1591
-崔 1592
-另 1593
-处 1594
-铺 1595
-沈 1596
-哎 1597
-身 1598
-哟 1599
-习 1600
-虞 1601
-瞧 1602
-烈 1603
-皮 1604
-鞋 1605
-深 1606
-圳 1607
-委 1608
-胖 1609
-猴 1610
-军 1611
-素 1612
-楷 1613
-补 1614
-偿 1615
-屏 1616
-散 1617
-效 1618
-丹 1619
-念 1620
-绝 1621
-艳 1622
-够 1623
-狼 1624
-且 1625
-龄 1626
-乌 1627
-蓉 1628
-厘 1629
-含 1630
-庚 1631
-澈 1632
-犬 1633
-致 1634
-运 1635
-慢 1636
-钻 1637
-李 1638
-轩 1639
-育 1640
-项 1641
-咱 1642
-误 1643
-弟 1644
-依 1645
-尽 1646
-河 1647
-夫 1648
-沁 1649
-始 1650
-芳 1651
-禺 1652
-旧 1653
-坑 1654
-胜 1655
-酱 1656
-漂 1657
-亮 1658
-战 1659
-斗 1660
-严 1661
-娟 1662
-逼 1663
-添 1664
-盈 1665
-萝 1666
-莉 1667
-肌 1668
-唐 1669
-兵 1670
-辆 1671
-双 1672
-佛 1673
-傅 1674
-劲 1675
-直 1676
-测 1677
-苏 1678
-迁 1679
-沭 1680
-祥 1681
-婴 1682
-品 1683
-销 1684
-禹 1685
-哲 1686
-嗯 1687
-趟 1688
-拐 1689
-金 1690
-满 1691
-套 1692
-倒 1693
-千 1694
-迎 1695
-淇 1696
-驾 1697
-拟 1698
-良 1699
-揭 1700
-杯 1701
-淄 1702
-睛 1703
-制 1704
-枪 1705
-抢 1706
-狸 1707
-泥 1708
-造 1709
-哇 1710
-羯 1711
-庙 1712
-逃 1713
-朗 1714
-领 1715
-悟 1716
-湛 1717
-贸 1718
-垃 1719
-圾 1720
-软 1721
-莲 1722
-味 1723
-旺 1724
-旦 1725
-潮 1726
-奶 1727
-央 1728
-惜 1729
-续 1730
-咨 1731
-茅 1732
-父 1733
-母 1734
-笔 1735
-封 1736
-同 1737
-黎 1738
-共 1739
-科 1740
-相 1741
-镇 1742
-贤 1743
-宏 1744
-洲 1745
-瓦 1746
-寨 1747
-受 1748
-梦 1749
-呗 1750
-苍 1751
-丘 1752
-避 1753
-孕 1754
-灰 1755
-尤 1756
-击 1757
-腮 1758
-腺 1759
-兆 1760
-坨 1761
-屎 1762
-忧 1763
-草 1764
-赠 1765
-外 1766
-险 1767
-某 1768
-磁 1769
-贷 1770
-反 1771
-罚 1772
-昌 1773
-饰 1774
-辉 1775
-权 1776
-材 1777
-炯 1778
-签 1779
-追 1780
-求 1781
-催 1782
-痛 1783
-盱 1784
-眙 1785
-围 1786
-淘 1787
-幸 1788
-扫 1789
-旭 1790
-切 1791
-磋 1792
-冠 1793
-征 1794
-郎 1795
-骗 1796
-哄 1797
-释 1798
-蒙 1799
-涯 1800
-碧 1801
-斜 1802
-嫁 1803
-幕 1804
-哼 1805
-钢 1806
-碳 1807
-纤 1808
-维 1809
-决 1810
-妻 1811
-未 1812
-绳 1813
-断 1814
-寺 1815
-妙 1816
-伯 1817
-蹲 1818
-宅 1819
-吵 1820
-娃 1821
-兄 1822
-廷 1823
-夺 1824
-社 1825
-示 1826
-按 1827
-饺 1828
-甜 1829
-蜜 1830
-咖 1831
-啡 1832
-优 1833
-逛 1834
-街 1835
-著 1836
-杂 1837
-址 1838
-荷 1839
-塘 1840
-拜 1841
-币 1842
-迟 1843
-货 1844
-粉 1845
-刮 1846
-破 1847
-射 1848
-狂 1849
-苗 1850
-罗 1851
-设 1852
-困 1853
-湘 1854
-潭 1855
-评 1856
-娘 1857
-涉 1858
-采 1859
-芙 1860
-夜 1861
-捷 1862
-斩 1863
-摩 1864
-托 1865
-泳 1866
-琳 1867
-律 1868
-官 1869
-劫 1870
-蛮 1871
-替 1872
-架 1873
-悬 1874
-浮 1875
-窗 1876
-顶 1877
-敦 1878
-善 1879
-哉 1880
-桂 1881
-勇 1882
-荆 1883
-镜 1884
-监 1885
-怕 1886
-呐 1887
-劳 1888
-莱 1889
-狮 1890
-宽 1891
-袋 1892
-囊 1893
-秀 1894
-卸 1895
-链 1896
-嵩 1897
-韭 1898
-葆 1899
-额 1900
-翼 1901
-忙 1902
-瓶 1903
-梅 1904
-堰 1905
-粒 1906
-汤 1907
-谋 1908
-樊 1909
-恨 1910
-愿 1911
-锡 1912
-申 1913
-护 1914
-庄 1915
-临 1916
-源 1917
-环 1918
-境 1919
-礼 1920
-恐 1921
-晒 1922
-虫 1923
-划 1924
-鸣 1925
-怖 1926
-伍 1927
-佰 1928
-岁 1929
-组 1930
-响 1931
-类 1932
-韶 1933
-克 1934
-洛 1935
-玲 1936
-裤 1937
-柔 1938
-疆 1939
-篮 1940
-伟 1941
-扔 1942
-掉 1943
-媒 1944
-涨 1945
-透 1946
-纯 1947
-怀 1948
-坊 1949
-麦 1950
-菠 1951
-养 1952
-晕 1953
-群 1954
-展 1955
-厌 1956
-拒 1957
-单 1958
-静 1959
-刷 1960
-插 1961
-肛 1962
-互 1963
-蘑 1964
-菇 1965
-姑 1966
-桐 1967
-辛 1968
-察 1969
-毫 1970
-质 1971
-差 1972
-翰 1973
-爽 1974
-欣 1975
-议 1976
-铜 1977
-籍 1978
-争 1979
-喆 1980
-孔 1981
-堤 1982
-薇 1983
-茵 1984
-席 1985
-琼 1986
-杠 1987
-衔 1988
-概 1989
-往 1990
-邢 1991
-惠 1992
-烫 1993
-绑 1994
-崇 1995
-帝 1996
-据 1997
-貌 1998
-似 1999
-胸 2000
-罩 2001
-构 2002
-尊 2003
-秘 2004
-它 2005
-详 2006
-悠 2007
-闲 2008
-违 2009
-陆 2010
-割 2011
-绩 2012
-企 2013
-绥 2014
-辐 2015
-舌 2016
-寂 2017
-寞 2018
-宇 2019
-携 2020
-拳 2021
-观 2022
-魏 2023
-郭 2024
-磊 2025
-副 2026
-梁 2027
-斌 2028
-须 2029
-僧 2030
-徒 2031
-季 2032
-灯 2033
-梯 2034
-墙 2035
-付 2036
-坦 2037
-殊 2038
-曹 2039
-操 2040
-捡 2041
-赤 2042
-盗 2043
-废 2044
-蒋 2045
-浙 2046
-食 2047
-咯 2048
-童 2049
-坡 2050
-剪 2051
-唯 2052
-疗 2053
-状 2054
-暴 2055
-缓 2056
-誉 2057
-衰 2058
-宋 2059
-娜 2060
-雄 2061
-谛 2062
-糖 2063
-羽 2064
-棋 2065
-滩 2066
-佳 2067
-臭 2068
-帆 2069
-岳 2070
-疲 2071
-惫 2072
-滴 2073
-倾 2074
-盆 2075
-谷 2076
-施 2077
-晶 2078
-赚 2079
-澡 2080
-遇 2081
-鲁 2082
-祭 2083
-灶 2084
-独 2085
-谈 2086
-承 2087
-蜀 2088
-丰 2089
-归 2090
-辜 2091
-扇 2092
-渴 2093
-羡 2094
-慕 2095
-裸 2096
-宗 2097
-纬 2098
-亦 2099
-儒 2100
-霸 2101
-翡 2102
-翠 2103
-芭 2104
-抱 2105
-歉 2106
-邱 2107
-夏 2108
-隆 2109
-灵 2110
-珍 2111
-浩 2112
-乾 2113
-坤 2114
-培 2115
-训 2116
-压 2117
-偏 2118
-骤 2119
-熙 2120
-葬 2121
-姆 2122
-兽 2123
-筑 2124
-丝 2125
-若 2126
-诡 2127
-异 2128
-侯 2129
-摆 2130
-俗 2131
-缚 2132
-束 2133
-愁 2134
-盟 2135
-却 2136
-显 2137
-肤 2138
-茹 2139
-荣 2140
-增 2141
-宫 2142
-局 2143
-适 2144
-楚 2145
-驻 2146
-纽 2147
-秒 2148
-辣 2149
-虾 2150
-甘 2151
-肃 2152
-粕 2153
-喻 2154
-敬 2155
-谨 2156
-慎 2157
-竭 2158
-止 2159
-际 2160
-寓 2161
-勤 2162
-挫 2163
-泣 2164
-奈 2165
-圭 2166
-焰 2167
-猩 2168
-守 2169
-允 2170
-兔 2171
-篇 2172
-敌 2173
-辕 2174
-猫 2175
-柠 2176
-檬 2177
-橘 2178
-卜 2179
-妓 2180
-既 2181
-闯 2182
-胆 2183
-刁 2184
-竟 2185
-竞 2186
-冯 2187
-陇 2188
-赣 2189
-呆 2190
-滞 2191
-停 2192
-邯 2193
-郸 2194
-域 2195
-徕 2196
-患 2197
-甲 2198
-亡 2199
-鼻 2200
-背 2201
-戴 2202
-幼 2203
-伙 2204
-括 2205
-邓 2206
-谐 2207
-担 2208
-浑 2209
-抖 2210
-耍 2211
-综 2212
-失 2213
-蕾 2214
-鸭 2215
-莘 2216
-选 2217
-糸 2218
-桶 2219
-弃 2220
-暗 2221
-卓 2222
-榜 2223
-拼 2224
-壅 2225
-丈 2226
-锈 2227
-恢 2228
-刺 2229
-嘻 2230
-顾 2231
-投 2232
-晓 2233
-巨 2234
-抽 2235
-档 2236
-乳 2237
-迪 2238
-蠢 2239
-裹 2240
-唤 2241
-焦 2242
-择 2243
-俺 2244
-技 2245
-暧 2246
-昧 2247
-怪 2248
-坛 2249
-眉 2250
-嘉 2251
-逸 2252
-课 2253
-栏 2254
-撬 2255
-框 2256
-液 2257
-凝 2258
-暑 2259
-型 2260
-烘 2261
-簿 2262
-扬 2263
-汁 2264
-诸 2265
-迹 2266
-禁 2267
-株 2268
-泸 2269
-屠 2270
-宰 2271
-忽 2272
-炼 2273
-必 2274
-妆 2275
-飘 2276
-鹿 2277
-敲 2278
-拾 2279
-躺 2280
-歇 2281
-狠 2282
-沾 2283
-畅 2284
-镕 2285
-贪 2286
-污 2287
-斧 2288
-巡 2289
-弹 2290
-盐 2291
-枝 2292
-渝 2293
-壑 2294
-郴 2295
-落 2296
-牡 2297
-钛 2298
-剩 2299
-俄 2300
-抵 2301
-押 2302
-郊 2303
-弱 2304
-授 2305
-蟹 2306
-糕 2307
-败 2308
-各 2309
-伴 2310
-享 2311
-居 2312
-障 2313
-棕 2314
-旁 2315
-屌 2316
-绒 2317
-酸 2318
-隔 2319
-瞄 2320
-俩 2321
-柳 2322
-册 2323
-弊 2324
-逮 2325
-绵 2326
-挣 2327
-闵 2328
-勒 2329
-陶 2330
-寒 2331
-吻 2332
-桃 2333
-悍 2334
-绮 2335
-贞 2336
-疾 2337
-诫 2338
-菌 2339
-则 2340
-谭 2341
-咏 2342
-麟 2343
-棠 2344
-抬 2345
-棺 2346
-均 2347
-纸 2348
-碱 2349
-沧 2350
-董 2351
-挤 2352
-虚 2353
-钠 2354
-胃 2355
-躁 2356
-智 2357
-畔 2358
-墨 2359
-堂 2360
-喔 2361
-宣 2362
-丑 2363
-嚣 2364
-辈 2365
-孤 2366
-鞭 2367
-验 2368
-夕 2369
-印 2370
-欧 2371
-阵 2372
-咋 2373
-驹 2374
-挂 2375
-轿 2376
-拍 2377
-洁 2378
-凑 2379
-蕉 2380
-诱 2381
-惑 2382
-颐 2383
-箭 2384
-樱 2385
-辖 2386
-捕 2387
-炸 2388
-斋 2389
-恒 2390
-沉 2391
-侧 2392
-跌 2393
-暇 2394
-掌 2395
-筷 2396
-彬 2397
-稚 2398
-傲 2399
-腔 2400
-藏 2401
-浦 2402
-瓣 2403
-捆 2404
-卧 2405
-欠 2406
-犀 2407
-甩 2408
-敷 2409
-衍 2410
-谅 2411
-积 2412
-怡 2413
-阁 2414
-趣 2415
-掰 2416
-耽 2417
-蒜 2418
-菱 2419
-葛 2420
-聚 2421
-露 2422
-帐 2423
-紧 2424
-郓 2425
-聘 2426
-桑 2427
-众 2428
-圈 2429
-渡 2430
-鲜 2431
-杜 2432
-甫 2433
-遵 2434
-骚 2435
-吹 2436
-蚊 2437
-塔 2438
-赏 2439
-荒 2440
-欺 2441
-揍 2442
-锁 2443
-恼 2444
-忐 2445
-忑 2446
-输 2447
-描 2448
-触 2449
-糊 2450
-涂 2451
-熊 2452
-妮 2453
-抄 2454
-裙 2455
-塑 2456
-橡 2457
-阜 2458
-獒 2459
-励 2460
-黔 2461
-臣 2462
-憔 2463
-悴 2464
-昂 2465
-党 2466
-沐 2467
-浓 2468
-灾 2469
-捐 2470
-柿 2471
-瑟 2472
-翁 2473
-侨 2474
-督 2475
-振 2476
-鹏 2477
-乒 2478
-乓 2479
-巷 2480
-贡 2481
-祖 2482
-即 2483
-绿 2484
-搭 2485
-配 2486
-骑 2487
-届 2488
-举 2489
-伸 2490
-整 2491
-突 2492
-陌 2493
-糟 2494
-惩 2495
-硕 2496
-茫 2497
-趋 2498
-仁 2499
-钙 2500
-雕 2501
-井 2502
-撒 2503
-岩 2504
-悄 2505
-搁 2506
-浅 2507
-救 2508
-饮 2509
-佩 2510
-赌 2511
-涕 2512
-薯 2513
-令 2514
-泌 2515
-蔬 2516
-批 2517
-攀 2518
-怜 2519
-淮 2520
-寝 2521
-填 2522
-卿 2523
-萱 2524
-寄 2525
-窝 2526
-纳 2527
-洱 2528
-惹 2529
-锦 2530
-浒 2531
-欲 2532
-棉 2533
-箱 2534
-仅 2535
-述 2536
-摸 2537
-纲 2538
-澳 2539
-染 2540
-兼 2541
-岭 2542
-淋 2543
-肿 2544
-旗 2545
-嘞 2546
-乖 2547
-酮 2548
-颠 2549
-覆 2550
-誓 2551
-递 2552
-蛟 2553
-占 2554
-乎 2555
-融 2556
-甸 2557
-幂 2558
-钥 2559
-匙 2560
-酬 2561
-皆 2562
-胎 2563
-腐 2564
-痿 2565
-绣 2566
-枫 2567
-蝴 2568
-蝶 2569
-抛 2570
-撞 2571
-植 2572
-僵 2573
-尸 2574
-巾 2575
-煌 2576
-逊 2577
-引 2578
-兑 2579
-荫 2580
-朔 2581
-丢 2582
-扩 2583
-摄 2584
-龟 2585
-鑫 2586
-谦 2587
-豪 2588
-噬 2589
-眷 2590
-挑 2591
-仲 2592
-穷 2593
-玻 2594
-璃 2595
-岗 2596
-姥 2597
-横 2598
-蚌 2599
-埠 2600
-邀 2601
-蔚 2602
-虹 2603
-降 2604
-疣 2605
-鱿 2606
-喵 2607
-囧 2608
-茶 2609
-猜 2610
-玮 2611
-莎 2612
-冼 2613
-榕 2614
-媛 2615
-瓜 2616
-煮 2617
-耕 2618
-镶 2619
-虽 2620
-驳 2621
-霍 2622
-仗 2623
-窍 2624
-魅 2625
-访 2626
-邻 2627
-抗 2628
-莆 2629
-涵 2630
-筒 2631
-疯 2632
-赖 2633
-豌 2634
-碍 2635
-症 2636
-卤 2637
-翅 2638
-膀 2639
-蓬 2640
-咸 2641
-尚 2642
-瘦 2643
-缸 2644
-爬 2645
-鄂 2646
-塞 2647
-稻 2648
-召 2649
-荡 2650
-桨 2651
-税 2652
-呃 2653
-渠 2654
-骥 2655
-伏 2656
-枥 2657
-邑 2658
-净 2659
-弦 2660
-蔽 2661
-诀 2662
-咳 2663
-嗽 2664
-芯 2665
-储 2666
-缘 2667
-冻 2668
-厨 2669
-鉴 2670
-擦 2671
-棒 2672
-损 2673
-暂 2674
-殖 2675
-焊 2676
-募 2677
-邵 2678
-饶 2679
-梭 2680
-鄙 2681
-骄 2682
-蔡 2683
-辄 2684
-努 2685
-洽 2686
-宙 2687
-鲈 2688
-葫 2689
-芦 2690
-梧 2691
-燎 2692
-缴 2693
-薄 2694
-执 2695
-垫 2696
-靠 2697
-拢 2698
-萧 2699
-醋 2700
-脊 2701
-慰 2702
-攻 2703
-狱 2704
-吝 2705
-啬 2706
-煤 2707
-楞 2708
-脏 2709
-迷 2710
-椒 2711
-侄 2712
-璇 2713
-耐 2714
-庵 2715
-帽 2716
-崎 2717
-峻 2718
-援 2719
-娶 2720
-丫 2721
-犹 2722
-豫 2723
-罪 2724
-恶 2725
-陛 2726
-樟 2727
-截 2728
-巧 2729
-驰 2730
-轨 2731
-继 2732
-葱 2733
-蘸 2734
-汕 2735
-蜘 2736
-蛛 2737
-聋 2738
-俱 2739
-捉 2740
-卢 2741
-骆 2742
-氓 2743
-耶 2744
-仔 2745
-激 2746
-渊 2747
-钾 2748
-暖 2749
-钰 2750
-裁 2751
-判 2752
-略 2753
-墓 2754
-洪 2755
-凌 2756
-符 2757
-壮 2758
-陵 2759
-挥 2760
-夷 2761
-尘 2762
-沪 2763
-榆 2764
-涧 2765
-析 2766
-孝 2767
-弘 2768
-椅 2769
-贴 2770
-蛇 2771
-浣 2772
-镯 2773
-枣 2774
-佐 2775
-柑 2776
-谓 2777
-洞 2778
-漳 2779
-撕 2780
-叉 2781
-诛 2782
-糯 2783
-粽 2784
-碎 2785
-幅 2786
-赘 2787
-浆 2788
-循 2789
-偕 2790
-诙 2791
-阚 2792
-摘 2793
-串 2794
-悉 2795
-蜕 2796
-残 2797
-诅 2798
-祁 2799
-仪 2800
-璐 2801
-瑶 2802
-楠 2803
-崂 2804
-供 2805
-掖 2806
-椎 2807
-铆 2808
-钉 2809
-铐 2810
-镣 2811
-栋 2812
-潇 2813
-抓 2814
-屋 2815
-鸦 2816
-玄 2817
-芜 2818
-钨 2819
-毯 2820
-矿 2821
-缩 2822
-酶 2823
-焕 2824
-埃 2825
-霞 2826
-噢 2827
-韵 2828
-艾 2829
-虐 2830
-俘 2831
-颗 2832
-巩 2833
-牵 2834
-汝 2835
-搅 2836
-廉 2837
-啤 2838
-苑 2839
-辍 2840
-缝 2841
-纫 2842
-膜 2843
-娄 2844
-倩 2845
-魂 2846
-姜 2847
-彻 2848
-扉 2849
-镁 2850
-氢 2851
-铝 2852
-淀 2853
-雹 2854
-妍 2855
-鼎 2856
-碗 2857
-亭 2858
-闽 2859
-献 2860
-耻 2861
-畜 2862
-蚯 2863
-蚓 2864
-杆 2865
-靓 2866
-颖 2867
-瘾 2868
-腿 2869
-咧 2870
-嗦 2871
-忆 2872
-卑 2873
-鸽 2874
-藤 2875
-滑 2876
-蝇 2877
-蚂 2878
-蚁 2879
-迢 2880
-蝎 2881
-斑 2882
-赞 2883
-氧 2884
-姝 2885
-擎 2886
-憋 2887
-屈 2888
-讶 2889
-袜 2890
-吐 2891
-秽 2892
-哑 2893
-蓦 2894
-阑 2895
-珊 2896
-披 2897
-巫 2898
-妖 2899
-坪 2900
-疏 2901
-抒 2902
-炉 2903
-舰 2904
-贱 2905
-搬 2906
-遥 2907
-燃 2908
-咽 2909
-喉 2910
-熔 2911
-婵 2912
-奔 2913
-汗 2914
-蓄 2915
-辰 2916
-肩 2917
-洮 2918
-琅 2919
-径 2920
-廊 2921
-姬 2922
-衬 2923
-雯 2924
-滁 2925
-泗 2926
-筹 2927
-诵 2928
-奴 2929
-跨 2930
-娴 2931
-绯 2932
-惯 2933
-谎 2934
-蹈 2935
-潜 2936
-搂 2937
-逆 2938
-钞 2939
-辅 2940
-凶 2941
-橱 2942
-柜 2943
-婕 2944
-矮 2945
-邹 2946
-嫂 2947
-饼 2948
-撸 2949
-壶 2950
-握 2951
-鸳 2952
-鸯 2953
-寸 2954
-堕 2955
-哀 2956
-械 2957
-蜈 2958
-蚣 2959
-袁 2960
-鸿 2961
-穆 2962
-泊 2963
-衮 2964
-弗 2965
-雷 2966
-谜 2967
-俞 2968
-灏 2969
-毅 2970
-迈 2971
-蜂 2972
-辨 2973
-沂 2974
-灭 2975
-腊 2976
-脍 2977
-炙 2978
-卦 2979
-霄 2980
-扯 2981
-泾 2982
-脂 2983
-肪 2984
-淹 2985
-灌 2986
-辱 2987
-丸 2988
-账 2989
-秤 2990
-褐 2991
-芬 2992
-窖 2993
-慈 2994
-益 2995
-亿 2996
-颈 2997
-糜 2998
-隋 2999
-霉 3000
-署 3001
-狄 3002
-酪 3003
-旋 3004
-蔷 3005
-皱 3006
-纹 3007
-枯 3008
-粤 3009
-拔 3010
-菩 3011
-驱 3012
-咦 3013
-掀 3014
-菊 3015
-涩 3016
-耀 3017
-娥 3018
-奘 3019
-眯 3020
-芊 3021
-绪 3022
-沛 3023
-锐 3024
-姓 3025
-氏 3026
-垂 3027
-迫 3028
-絮 3029
-藕 3030
-捎 3031
-蓓 3032
-沫 3033
-奎 3034
-贩 3035
-泵 3036
-疑 3037
-岐 3038
-拓 3039
-詹 3040
-韦 3041
-粥 3042
-瞎 3043
-层 3044
-寿 3045
-淑 3046
-琦 3047
-履 3048
-痣 3049
-蔓 3050
-延 3051
-措 3052
-氰 3053
-胺 3054
-炭 3055
-鸥 3056
-谊 3057
-宛 3058
-悦 3059
-谣 3060
-茎 3061
-堆 3062
-鲤 3063
-坚 3064
-澎 3065
-溜 3066
-贫 3067
-擅 3068
-锌 3069
-竹 3070
-苟 3071
-磷 3072
-庐 3073
-嵌 3074
-潍 3075
-悚 3076
-岑 3077
-稀 3078
-奋 3079
-呦 3080
-梳 3081
-伐 3082
-芒 3083
-吶 3084
-凡 3085
-臂 3086
-驴 3087
-殿 3088
-雁 3089
-粹 3090
-凋 3091
-葵 3092
-烛 3093
-肾 3094
-尝 3095
-磨 3096
-晏 3097
-甄 3098
-嬛 3099
-盼 3100
-肇 3101
-咬 3102
-洼 3103
-匪 3104
-啰 3105
-硅 3106
-铅 3107
-矛 3108
-盾 3109
-贼 3110
-霜 3111
-螺 3112
-漏 3113
-帕 3114
-杉 3115
-矶 3116
-耗 3117
-责 3118
-靖 3119
-呸 3120
-驶 3121
-吞 3122
-睁 3123
-笼 3124
-茉 3125
-赈 3126
-纱 3127
-艘 3128
-炖 3129
-仿 3130
-瞬 3131
-嵊 3132
-澄 3133
-丞 3134
-摔 3135
-宠 3136
-爪 3137
-笋 3138
-庭 3139
-蜡 3140
-戈 3141
-锻 3142
-粗 3143
-糙 3144
-混 3145
-荚 3146
-曙 3147
-凄 3148
-抚 3149
-瀑 3150
-挖 3151
-掘 3152
-垮 3153
-奢 3154
-侈 3155
-揉 3156
-穹 3157
-钦 3158
-蛙 3159
-荧 3160
-悔 3161
-彦 3162
-忏 3163
-祸 3164
-攒 3165
-慌 3166
-簸 3167
-箕 3168
-繁 3169
-尖 3170
-芋 3171
-铠 3172
-沿 3173
-扮 3174
-隐 3175
-促 3176
-庾 3177
-葡 3178
-萄 3179
-硝 3180
-溶 3181
-淡 3182
-炅 3183
-昕 3184
-尧 3185
-妊 3186
-娠 3187
-曰 3188
-祛 3189
-枚 3190
-卒 3191
-陕 3192
-昭 3193
-龚 3194
-债 3195
-嗓 3196
-陷 3197
-阱 3198
-庞 3199
-盲 3200
-侵 3201
-匣 3202
-愤 3203
-怒 3204
-瞅 3205
-遭 3206
-脉 3207
-馒 3208
-愉 3209
-栗 3210
-鲍 3211
-挎 3212
-匆 3213
-缕 3214
-昵 3215
-鳄 3216
-阙 3217
-坟 3218
-捏 3219
-喽 3220
-雀 3221
-贯 3222
-苇 3223
-鹄 3224
-愈 3225
-裂 3226
-伪 3227
-劣 3228
-歹 3229
-溅 3230
-雌 3231
-猛 3232
-逞 3233
-饥 3234
-愚 3235
-牧 3236
-碰 3237
-帜 3238
-佝 3239
-偻 3240
-讪 3241
-馍 3242
-役 3243
-栈 3244
-唾 3245
-缆 3246
-袄 3247
-闸 3248
-织 3249
-筐 3250
-婉 3251
-昏 3252
-拖 3253
-毙 3254
-咙 3255
-褪 3256
-驼 3257
-壳 3258
-孽 3259
-审 3260
-脖 3261
-恳 3262
-孢 3263
-矫 3264
-臻 3265
-兖 3266
-俏 3267
-棍 3268
-唻 3269
-肘 3270
-俭 3271
-冕 3272
-葩 3273
-佑 3274
-鬓 3275
-柚 3276
-赴 3277
-崖 3278
-塌 3279
-厢 3280
-窈 3281
-窕 3282
-逑 3283
-卷 3284
-拂 3285
-蟑 3286
-螂 3287
-契 3288
-羞 3289
-函 3290
-逐 3291
-拌 3292
-肺 3293
-阻 3294
-纵 3295
-痰 3296
-狙 3297
-惋 3298
-枰 3299
-崽 3300
-胚 3301
-骡 3302
-萎 3303
-泄 3304
-呜 3305
-籁 3306
-濮 3307
-阆 3308
-琵 3309
-琶 3310
-跃 3311
-筝 3312
-勃 3313
-楂 3314
-奉 3315
-础 3316
-吆 3317
-壹 3318
-飙 3319
-虑 3320
-脆 3321
-黛 3322
-栓 3323
-逻 3324
-螃 3325
-轰 3326
-仑 3327
-券 3328
-逢 3329
-疮 3330
-私 3331
-窃 3332
-儋 3333
-泼 3334
-熬 3335
-焚 3336
-梨 3337
-吟 3338
-棱 3339
-稳 3340
-翘 3341
-祠 3342
-遮 3343
-瘤 3344
-稣 3345
-唇 3346
-阖 3347
-堡 3348
-禾 3349
-钗 3350
-爵 3351
-赐 3352
-绕 3353
-粘 3354
-癌 3355
-矜 3356
-虱 3357
-婧 3358
-坝 3359
-菏 3360
-隶 3361
-尺 3362
-滕 3363
-竿 3364
-恰 3365
-喱 3366
-冤 3367
-枉 3368
-叠 3369
-穴 3370
-搏 3371
-窦 3372
-栀 3373
-踪 3374
-昼 3375
-氯 3376
-陋 3377
-铭 3378
-禅 3379
-屑 3380
-巢 3381
-咻 3382
-喇 3383
-叭 3384
-棵 3385
-吊 3386
-诈 3387
-娇 3388
-绘 3389
-圩 3390
-仰 3391
-疙 3392
-瘩 3393
-桦 3394
-妾 3395
-丧 3396
-昊 3397
-湄 3398
-靴 3399
-迭 3400
-劝 3401
-溧 3402
-靡 3403
-梗 3404
-倪 3405
-刍 3406
-芽 3407
-篱 3408
-笆 3409
-漯 3410
-镖 3411
-协 3412
-叙 3413
-汾 3414
-豚 3415
-锷 3416
-瑙 3417
-瑜 3418
-伽 3419
-彰 3420
-扒 3421
-麝 3422
-赔 3423
-焉 3424
-亏 3425
-煅 3426
-翱 3427
-哽 3428
-煦 3429
-喷 3430
-舱 3431
-惨 3432
-哗 3433
-躲 3434
-佘 3435
-憾 3436
-旷 3437
-芹 3438
-簧 3439
-疹 3440
-簇 3441
-羹 3442
-刊 3443
-鹦 3444
-鹉 3445
-狡 3446
-猾 3447
-锯 3448
-呛 3449
-泛 3450
-汶 3451
-毗 3452
-衫 3453
-猕 3454
-祺 3455
-悸 3456
-昙 3457
-莽 3458
-杏 3459
-钮 3460
-叛 3461
-锄 3462
-砸 3463
-囤 3464
-犁 3465
-溃 3466
-疡 3467
-迦 3468
-轲 3469
-噜 3470
-犒 3471
-薰 3472
-薛 3473
-哺 3474
-竖 3475
-氟 3476
-渐 3477
-柒 3478
-贰 3479
-捌 3480
-傍 3481
-抹 3482
-褶 3483
-仇 3484
-偎 3485
-馅 3486
-旱 3487
-渭 3488
-昔 3489
-癣 3490
-挚 3491
-姻 3492
-炽 3493
-嘎 3494
-矢 3495
-汀 3496
-馋 3497
-淌 3498
-莓 3499
-貂 3500
-啧 3501
-茱 3502
-萸 3503
-涌 3504
-臀 3505
-恃 3506
-蒲 3507
-朴 3508
-嘟 3509
-扁 3510
-剂 3511
-歧 3512
-啪 3513
-啵 3514
-匠 3515
-帖 3516
-痒 3517
-睿 3518
-踢 3519
-衿 3520
-叽 3521
-崩 3522
-顽 3523
-嫌 3524
-扛 3525
-浔 3526
-拯 3527
-戊 3528
-戚 3529
-蛔 3530
-醇 3531
-笛 3532
-氛 3533
-沦 3534
-婊 3535
-仍 3536
-镍 3537
-渤 3538
-舶 3539
-哆 3540
-睹 3541
-萤 3542
-弧 3543
-辙 3544
-旯 3545
-纷 3546
-熄 3547
-挽 3548
-帘 3549
-蒸 3550
-橄 3551
-榄 3552
-滥 3553
-掩 3554
-兮 3555
-庸 3556
-玟 3557
-垦 3558
-惟 3559
-朕 3560
-脯 3561
-歪 3562
-吾 3563
-碘 3564
-锰 3565
-矾 3566
-拙 3567
-践 3568
-纠 3569
-赡 3570
-暨 3571
-凳 3572
-雾 3573
-缔 3574
-啫 3575
-毁 3576
-宥 3577
-邛 3578
-崃 3579
-禧 3580
-醛 3581
-滤 3582
-嘀 3583
-缪 3584
-萌 3585
-芥 3586
-胀 3587
-鲨 3588
-腩 3589
-勾 3590
-裳 3591
-雍 3592
-蹭 3593
-匹 3594
-髓 3595
-砍 3596
-孰 3597
-辩 3598
-唰 3599
-慷 3600
-慨 3601
-畏 3602
-坠 3603
-钝 3604
-箫 3605
-愧 3606
-劈 3607
-嘶 3608
-粮 3609
-轼 3610
-蟒 3611
-翊 3612
-澧 3613
-揽 3614
-烹 3615
-饪 3616
-踏 3617
-弛 3618
-婢 3619
-奸 3620
-掏 3621
-泓 3622
-袖 3623
-笈 3624
-刑 3625
-俑 3626
-浇 3627
-骊 3628
-蛀 3629
-蚤 3630
-杵 3631
-兹 3632
-晰 3633
-癫 3634
-痫 3635
-逝 3636
-炬 3637
-讼 3638
-陂 3639
-蚕 3640
-绸 3641
-槽 3642
-纨 3643
-牢 3644
-晃 3645
-窄 3646
-蒂 3647
-湃 3648
-硫 3649
-眨 3650
-耸 3651
-浠 3652
-梵 3653
-纺 3654
-贾 3655
-膨 3656
-阀 3657
-堀 3658
-扭 3659
-捂 3660
-扑 3661
-椭 3662
-鳟 3663
-丙 3664
-烯 3665
-冈 3666
-衷 3667
-牟 3668
-郫 3669
-畴 3670
-腥 3671
-亩 3672
-淤 3673
-禄 3674
-倘 3675
-烷 3676
-仆 3677
-刨 3678
-炜 3679
-挨 3680
-鳅 3681
-奚 3682
-峪 3683
-呻 3684
-佣 3685
-渔 3686
-肢 3687
-霏 3688
-旨 3689
-爰 3690
-吨 3691
-珑 3692
-隽 3693
-橙 3694
-箍 3695
-岚 3696
-啸 3697
-倌 3698
-剃 3699
-御 3700
-沸 3701
-棘 3702
-瘫 3703
-痪 3704
-仕 3705
-闺 3706
-炳 3707
-乏 3708
-拱 3709
-墅 3710
-铢 3711
-痤 3712
-琥 3713
-珈 3714
-荟 3715
-翩 3716
-搓 3717
-阮 3718
-芸 3719
-抠 3720
-弓 3721
-锣 3722
-赫 3723
-挡 3724
-侃 3725
-诶 3726
-沽 3727
-绫 3728
-濑 3729
-龈 3730
-乞 3731
-丐 3732
-宴 3733
-馁 3734
-牲 3735
-闰 3736
-亢 3737
-辫 3738
-铲 3739
-嫦 3740
-卵 3741
-佚 3742
-谬 3743
-倡 3744
-抑 3745
-赋 3746
-跆 3747
-削 3748
-氮 3749
-嫩 3750
-噻 3751
-蜗 3752
-鹂 3753
-靶 3754
-妥 3755
-衢 3756
-腻 3757
-砖 3758
-翎 3759
-拈 3760
-卉 3761
-皂 3762
-曦 3763
-荔 3764
-晤 3765
-曜 3766
-趵 3767
-纣 3768
-捞 3769
-蕲 3770
-猿 3771
-榈 3772
-憎 3773
-媚 3774
-绞 3775
-峙 3776
-饲 3777
-瑾 3778
-寡 3779
-釜 3780
-凸 3781
-凹 3782
-嫉 3783
-妒 3784
-婪 3785
-驸 3786
-荤 3787
-弥 3788
-蹦 3789
-驮 3790
-汞 3791
-唠 3792
-叨 3793
-袈 3794
-裟 3795
-毽 3796
-蔗 3797
-蹄 3798
-犍 3799
-珞 3800
-谚 3801
-煎 3802
-腋 3803
-瞳 3804
-丛 3805
-挪 3806
-榴 3807
-钩 3808
-梓 3809
-骁 3810
-烙 3811
-舜 3812
-暮 3813
-擀 3814
-兜 3815
-癜 3816
-姗 3817
-藜 3818
-擒 3819
-歼 3820
-冉 3821
-倚 3822
-漱 3823
-嫣 3824
-椰 3825
-隘 3826
-掐 3827
-栾 3828
-巍 3829
-咔 3830
-稽 3831
-惆 3832
-怅 3833
-镑 3834
-娲 3835
-芷 3836
-藻 3837
-伺 3838
-忌 3839
-桔 3840
-绅 3841
-坂 3842
-澜 3843
-嚓 3844
-苔 3845
-诣 3846
-倔 3847
-酿 3848
-槟 3849
-榔 3850
-粪 3851
-渺 3852
-馗 3853
-峨 3854
-碚 3855
-阎 3856
-巅 3857
-颊 3858
-戬 3859
-吒 3860
-鸵 3861
-岂 3862
-廖 3863
-娅 3864
-旬 3865
-猥 3866
-琐 3867
-扈 3868
-滔 3869
-枷 3870
-崴 3871
-捣 3872
-泻 3873
-甙 3874
-俯 3875
-撑 3876
-芮 3877
-舆 3878
-邂 3879
-逅 3880
-宪 3881
-晖 3882
-岔 3883
-哒 3884
-酵 3885
-痔 3886
-苓 3887
-捶 3888
-睫 3889
-裕 3890
-彤 3891
-潢 3892
-酉 3893
-聂 3894
-氨 3895
-嗷 3896
-皎 3897
-焖 3898
-袭 3899
-惦 3900
-惘 3901
-隙 3902
-彝 3903
-鞘 3904
-厄 3905
-殷 3906
-罕 3907
-嚏 3908
-拇 3909
-尹 3910
-蔻 3911
-颂 3912
-皖 3913
-霖 3914
-屉 3915
-崛 3916
-砣 3917
-穗 3918
-枸 3919
-杞 3920
-竣 3921
-勋 3922
-坍 3923
-溢 3924
-廓 3925
-煽 3926
-囚 3927
-涪 3928
-墩 3929
-琢 3930
-胳 3931
-膊 3932
-彼 3933
-陀 3934
-汹 3935
-柱 3936
-颁 3937
-闫 3938
-熠 3939
-叹 3940
-婿 3941
-娩 3942
-藓 3943
-岷 3944
-婺 3945
-桓 3946
-赁 3947
-罢 3948
-姊 3949
-瓢 3950
-桩 3951
-淫 3952
-堪 3953
-艰 3954
-枢 3955
-枞 3956
-晗 3957
-泷 3958
-逍 3959
-筱 3960
-烽 3961
-渍 3962
-蒿 3963
-殴 3964
-玖 3965
-罐 3966
-剿 3967
-喀 3968
-磕 3969
-铵 3970
-蕊 3971
-篓 3972
-痞 3973
-磅 3974
-礴 3975
-磐 3976
-拘 3977
-瘙 3978
-惕 3979
-孜 3980
-杖 3981
-撇 3982
-敖 3983
-踩 3984
-刹 3985
-蹿 3986
-坎 3987
-氦 3988
-汨 3989
-垣 3990
-垢 3991
-胁 3992
-趴 3993
-苷 3994
-镒 3995
-幢 3996
-鞠 3997
-逾 3998
-鬃 3999
-尉 4000
-韧 4001
-锤 4002
-嘘 4003
-呷 4004
-噎 4005
-煲 4006
-恍 4007
-粱 4008
-亳 4009
-鳞 4010
-懦 4011
-酚 4012
-酞 4013
-哨 4014
-祀 4015
-刃 4016
-蕴 4017
-晟 4018
-菀 4019
-甬 4020
-鼾 4021
-鳖 4022
-螳 4023
-稼 4024
-栽 4025
-蝗 4026
-颌 4027
-咀 4028
-掠 4029
-嘱 4030
-甚 4031
-菅 4032
-奂 4033
-讽 4034
-秸 4035
-釉 4036
-坞 4037
-雇 4038
-绢 4039
-捧 4040
-狈 4041
-桀 4042
-骜 4043
-摊 4044
-臆 4045
-竺 4046
-栅 4047
-贬 4048
-飒 4049
-浸 4050
-噩 4051
-晾 4052
-绐 4053
-殡 4054
-挠 4055
-於 4056
-茁 4057
-瞪 4058
-窠 4059
-汰 4060
-魁 4061
-忒 4062
-璋 4063
-怠 4064
-莺 4065
-冶 4066
-绰 4067
-邈 4068
-圻 4069
-湮 4070
-亨 4071
-躬 4072
-砂 4073
-鹭 4074
-浊 4075
-楹 4076
-珉 4077
-撵 4078
-筏 4079
-荨 4080
-鳝 4081
-沥 4082
-邳 4083
-殉 4084
-憨 4085
-啼 4086
-熏 4087
-蜃 4088
-毋 4089
-彗 4090
-噪 4091
-绛 4092
-祟 4093
-蝙 4094
-蝠 4095
-漆 4096
-酰 4097
-锑 4098
-栖 4099
-肆 4100
-邕 4101
-弋 4102
-绽 4103
-嚼 4104
-霹 4105
-雳 4106
-谍 4107
-恹 4108
-怏 4109
-倦 4110
-轶 4111
-曛 4112
-疚 4113
-棚 4114
-漕 4115
-浃 4116
-勘 4117
-暄 4118
-趁 4119
-斥 4120
-苞 4121
-膳 4122
-赎 4123
-崭 4124
-笙 4125
-摁 4126
-嗅 4127
-瞒 4128
-舵 4129
-铸 4130
-咫 4131
-涅 4132
-瘪 4133
-潼 4134
-粑 4135
-漾 4136
-噶 4137
-鸠 4138
-铉 4139
-豹 4140
-遛 4141
-襟 4142
-壤 4143
-甭 4144
-吮 4145
-耒 4146
-钊 4147
-泞 4148
-拦 4149
-昱 4150
-腑 4151
-惧 4152
-韬 4153
-焗 4154
-窘 4155
-喳 4156
-溏 4157
-鲛 4158
-慵 4159
-菁 4160
-攥 4161
-埔 4162
-呕 4163
-蓑 4164
-笠 4165
-孑 4166
-咕 4167
-觐 4168
-漓 4169
-碾 4170
-浜 4171
-嬉 4172
-迂 4173
-笃 4174
-勉 4175
-锥 4176
-篷 4177
-亥 4178
-龌 4179
-龊 4180
-煞 4181
-蓟 4182
-皓 4183
-惰 4184
-勺 4185
-缨 4186
-峥 4187
-苯 4188
-豁 4189
-颓 4190
-拽 4191
-啄 4192
-麒 4193
-雎 4194
-鲢 4195
-睬 4196
-渣 4197
-唔 4198
-桧 4199
-癞 4200
-蛤 4201
-蟆 4202
-撩 4203
-酯 4204
-戳 4205
-舔 4206
-孺 4207
-怂 4208
-恿 4209
-臃 4210
-戟 4211
-惭 4212
-耿 4213
-徵 4214
-柬 4215
-朽 4216
-磺 4217
-媲 4218
-懿 4219
-悼 4220
-绎 4221
-缅 4222
-茜 4223
-瞻 4224
-炀 4225
-脓 4226
-罄 4227
-秃 4228
-拎 4229
-譬 4230
-榉 4231
-拭 4232
-玥 4233
-崆 4234
-峒 4235
-胛 4236
-糗 4237
-佗 4238
-佬 4239
-袍 4240
-炊 4241
-仞 4242
-霎 4243
-掺 4244
-匀 4245
-姹 4246
-妯 4247
-娌 4248
-帷 4249
-岢 4250
-柄 4251
-阪 4252
-玺 4253
-窑 4254
-肽 4255
-涡 4256
-窟 4257
-阉 4258
-硼 4259
-蛳 4260
-呤 4261
-砚 4262
-偌 4263
-贿 4264
-芗 4265
-蹊 4266
-跷 4267
-雏 4268
-膝 4269
-嗜 4270
-扦 4271
-涟 4272
-殆 4273
-郡 4274
-洵 4275
-酋 4276
-匡 4277
-胤 4278
-撤 4279
-辗 4280
-冀 4281
-捺 4282
-吏 4283
-衩 4284
-腕 4285
-灸 4286
-绔 4287
-瓯 4288
-蜻 4289
-蜓 4290
-窜 4291
-躯 4292
-髦 4293
-诏 4294
-缄 4295
-筠 4296
-沌 4297
-酐 4298
-皋 4299
-隧 4300
-鹊 4301
-傀 4302
-儡 4303
-诲 4304
-嘏 4305
-寅 4306
-骇 4307
-喧 4308
-癀 4309
-瑚 4310
-碉 4311
-羔 4312
-掂 4313
-痹 4314
-孚 4315
-绡 4316
-馊 4317
-虏 4318
-悖 4319
-漪 4320
-琉 4321
-缉 4322
-冥 4323
-饯 4324
-蔺 4325
-瘆 4326
-榨 4327
-盯 4328
-鄞 4329
-妨 4330
-哐 4331
-寇 4332
-鹃 4333
-卞 4334
-喘 4335
-藩 4336
-踹 4337
-粟 4338
-陨 4339
-遣 4340
-鳌 4341
-烨 4342
-抉 4343
-臧 4344
-墉 4345
-疽 4346
-拷 4347
-赃 4348
-哮 4349
-馥 4350
-砰 4351
-拗 4352
-汐 4353
-矣 4354
-沅 4355
-裴 4356
-阐 4357
-蟋 4358
-蟀 4359
-蚀 4360
-恁 4361
-恙 4362
-蝉 4363
-荀 4364
-彧 4365
-銮 4366
-侮 4367
-驿 4368
-婶 4369
-檀 4370
-哩 4371
-镐 4372
-轴 4373
-扳 4374
-飓 4375
-麓 4376
-牺 4377
-垛 4378
-稞 4379
-桴 4380
-痧 4381
-揣 4382
-殇 4383
-邬 4384
-撅 4385
-邸 4386
-鼬 4387
-剥 4388
-胥 4389
-撼 4390
-溟 4391
-鄱 4392
-鲫 4393
-觅 4394
-犊 4395
-恕 4396
-铂 4397
-褔 4398
-淼 4399
-骝 4400
-藉 4401
-裔 4402
-痨 4403
-颤 4404
-尴 4405
-尬 4406
-癖 4407
-拄 4408
-蠕 4409
-虻 4410
-迄 4411
-攸 4412
-浚 4413
-盔 4414
-肮 4415
-侬 4416
-锏 4417
-憧 4418
-憬 4419
-镰 4420
-懈 4421
-挟 4422
-缤 4423
-涎 4424
-睾 4425
-惶 4426
-褚 4427
-藐 4428
-眺 4429
-艇 4430
-昀 4431
-妄 4432
-祗 4433
-壬 4434
-浯 4435
-衲 4436
-來 4437
-黯 4438
-芩 4439
-敞 4440
-绊 4441
-娣 4442
-掷 4443
-茯 4444
-琍 4445
-蛹 4446
-钧 4447
-瘘 4448
-蜥 4449
-蜴 4450
-唬 4451
-驭 4452
-阂 4453
-诃 4454
-疟 4455
-潦 4456
-谀 4457
-肱 4458
-黏 4459
-甥 4460
-眶 4461
-秩 4462
-庇 4463
-钏 4464
-咝 4465
-肴 4466
-宸 4467
-湟 4468
-沣 4469
-煊 4470
-盂 4471
-弈 4472
-瞩 4473
-聆 4474
-疥 4475
-腼 4476
-腆 4477
-胭 4478
-匕 4479
-讳 4480
-戮 4481
-茧 4482
-趾 4483
-亵 4484
-吖 4485
-漩 4486
-逵 4487
-寰 4488
-滇 4489
-渎 4490
-寮 4491
-嘁 4492
-珂 4493
-珀 4494
-稠 4495
-羌 4496
-徘 4497
-徊 4498
-苛 4499
-蕨 4500
-薏 4501
-苡 4502
-戌 4503
-卯 4504
-馈 4505
-溥 4506
-熹 4507
-屡 4508
-巳 4509
-璜 4510
-铮 4511
-踊 4512
-锚 4513
-濠 4514
-噫 4515
-怦 4516
-蓥 4517
-碌 4518
-霓 4519
-牦 4520
-妤 4521
-屹 4522
-缈 4523
-蹉 4524
-驷 4525
-菡 4526
-谔 4527
-琛 4528
-吡 4529
-喹 4530
-呲 4531
-溺 4532
-鳗 4533
-慑 4534
-秆 4535
-骋 4536
-脐 4537
-涤 4538
-荞 4539
-淅 4540
-罘 4541
-焱 4542
-孵 4543
-斟 4544
-酌 4545
-痊 4546
-秉 4547
-砌 4548
-瘁 4549
-胱 4550
-笫 4551
-燮 4552
-衅 4553
-腱 4554
-垒 4555
-锟 4556
-缀 4557
-疵 4558
-墟 4559
-盏 4560
-舂 4561
-侗 4562
-琨 4563
-唧 4564
-怆 4565
-沮 4566
-敛 4567
-瑕 4568
-奠 4569
-汴 4570
-衙 4571
-歆 4572
-嘹 4573
-饽 4574
-拧 4575
-濒 4576
-锭 4577
-嬴 4578
-吱 4579
-靳 4580
-眸 4581
-渲 4582
-睦 4583
-蝼 4584
-瞿 4585
-剁 4586
-紊 4587
-翟 4588
-攘 4589
-蹂 4590
-躏 4591
-淞 4592
-跎 4593
-侍 4594
-铛 4595
-绷 4596
-仟 4597
-瀚 4598
-赉 4599
-俪 4600
-魄 4601
-吼 4602
-酗 4603
-嚒 4604
-彷 4605
-徨 4606
-煜 4607
-曝 4608
-嗑 4609
-俅 4610
-嵘 4611
-隍 4612
-唆 4613
-郜 4614
-栩 4615
-尻 4616
-咗 4617
-茗 4618
-疱 4619
-斐 4620
-菘 4621
-芎 4622
-帼 4623
-枭 4624
-矩 4625
-仨 4626
-幄 4627
-鲸 4628
-猬 4629
-梢 4630
-槐 4631
-璧 4632
-坷 4633
-逯 4634
-踝 4635
-濡 4636
-樵 4637
-肓 4638
-劵 4639
-羚 4640
-髫 4641
-笄 4642
-俾 4643
-匿 4644
-帛 4645
-孀 4646
-焙 4647
-瘟 4648
-籽 4649
-萦 4650
-灼 4651
-箴 4652
-筵 4653
-窒 4654
-裆 4655
-旎 4656
-砝 4657
-妲 4658
-恺 4659
-覃 4660
-寐 4661
-酝 4662
-啃 4663
-塬 4664
-醴 4665
-蜿 4666
-蜒 4667
-愣 4668
-恤 4669
-撂 4670
-瘸 4671
-檐 4672
-琰 4673
-狒 4674
-摧 4675
-诠 4676
-孪 4677
-嘚 4678
-鼹 4679
-囡 4680
-茴 4681
-噤 4682
-僻 4683
-钕 4684
-锴 4685
-渗 4686
-嗫 4687
-撮 4688
-缭 4689
-粼 4690
-咄 4691
-挝 4692
-蛾 4693
-恪 4694
-皙 4695
-莒 4696
-叼 4697
-诽 4698
-妩 4699
-叱 4700
-咤 4701
-挞 4702
-萼 4703
-饵 4704
-澹 4705
-惺 4706
-呶 4707
-铤 4708
-佟 4709
-丕 4710
-靛 4711
-伶 4712
-涣 4713
-桢 4714
-狭 4715
-卅 4716
-蟠 4717
-蟾 4718
-朦 4719
-胧 4720
-咆 4721
-滦 4722
-岖 4723
-篙 4724
-痍 4725
-胰 4726
-谏 4727
-坳 4728
-樯 4729
-橹 4730
-孬 4731
-潴 4732
-厥 4733
-椐 4734
-谩 4735
-恬 4736
-琬 4737
-遁 4738
-褥 4739
-咎 4740
-羁 4741
-苣 4742
-殁 4743
-懵 4744
-褒 4745
-蜚 4746
-蛊 4747
-筛 4748
-耙 4749
-耨 4750
-嬷 4751
-驯 4752
-赅 4753
-畲 4754
-滢 4755
-伎 4756
-庹 4757
-踉 4758
-戎 4759
-膛 4760
-嗡 4761
-吔 4762
-唏 4763
-喏 4764
-哧 4765
-缇 4766
-蚝 4767
-璀 4768
-璨 4769
-捅 4770
-妁 4771
-曳 4772
-吩 4773
-咐 4774
-罂 4775
-垌 4776
-揪 4777
-壕 4778
-跺 4779
-辘 4780
-轳 4781
-噔 4782
-斓 4783
-厮 4784
-叁 4785
-仄 4786
-沼 4787
-鸢 4788
-醪 4789
-郢 4790
-圃 4791
-碜 4792
-鲅 4793
-嚯 4794
-淳 4795
-迩 4796
-诋 4797
-鬟 4798
-汲 4799
-艮 4800
-跤 4801
-麋 4802
-橇 4803
-悱 4804
-恻 4805
-啷 4806
-惮 4807
-樨 4808
-毓 4809
-裱 4810
-堇 4811
-埸 4812
-叵 4813
-腚 4814
-畀 4815
-钼 4816
-赦 4817
-悯 4818
-谴 4819
-稷 4820
-嘢 4821
-盎 4822
-跶 4823
-窥 4824
-瑄 4825
-谤 4826
-柘 4827
-垄 4828
-蠡 4829
-邝 4830
-娆 4831
-俐 4832
-铷 4833
-肋 4834
-涿 4835
-俎 4836
-捜 4837
-罡 4838
-嗝 4839
-唛 4840
-酣 4841
-鹬 4842
-瑀 4843
-帚 4844
-镭 4845
-搽 4846
-钣 4847
-蜇 4848
-嗞 4849
-颉 4850
-耘 4851
-忡 4852
-噼 4853
-睐 4854
-簋 4855
-镚 4856
-朐 4857
-戛 4858
-扪 4859
-鹩 4860
-稹 4861
-嗣 4862
-睇 4863
-弩 4864
-侥 4865
-绚 4866
-虔 4867
-溴 4868
-毂 4869
-漉 4870
-郧 4871
-杈 4872
-埭 4873
-哝 4874
-纾 4875
-箔 4876
-蚍 4877
-呋 4878
-喃 4879
-旌 4880
-袅 4881
-嫡 4882
-2 4883
-睢 4884
-榭 4885
-濉 4886
-雉 4887
-糍 4888
-谙 4889
-坻 4890
-遨 4891
-囔 4892
-鹜 4893
-垩 4894
-嵋 4895
-葑 4896
-叻 4897
-剌 4898
-铀 4899
-鲟 4900
-珏 4901
-唑 4902
-拴 4903
-乍 4904
-镊 4905
-歩 4906
-姘 4907
-戍 4908
-娈 4909
-槿 4910
-魇 4911
-叩 4912
-啾 4913
-腈 4914
-骞 4915
-殃 4916
-髋 4917
-嶙 4918
-璟 4919
-嚷 4920
-鹳 4921
-嗬 4922
-梆 4923
-晁 4924
-龛 4925
-嚎 4926
-熨 4927
-倭 4928
-峦 4929
-蜍 4930
-桉 4931
-齁 4932
-搀 4933
-铬 4934
-刽 4935
-謝 4936
-沒 4937
-簪 4938
-邺 4939
-嵬 4940
-馄 4941
-饨 4942
-蜢 4943
-嗒 4944
-芨 4945
-弶 4946
-晞 4947
-搔 4948
-昴 4949
-夙 4950
-徙 4951
-霾 4952
-嗖 4953
-碴 4954
-秧 4955
-芍 4956
-匝 4957
-泫 4958
-琯 4959
-扼 4960
-砒 4961
-栎 4962
-卟 4963
-琊 4964
-怯 4965
-侩 4966
-峯 4967
-忿 4968
-藁 4969
-蹼 4970
-毡 4971
-埤 4972
-膘 4973
-噗 4974
-阕 4975
-嘭 4976
-椿 4977
-涸 4978
-祯 4979
-芵 4980
-螨 4981
-寥 4982
-梶 4983
-嘈 4984
-泠 4985
-侏 4986
-棂 4987
-缶 4988
-捋 4989
-钜 4990
-璞 4991
-媞 4992
-唢 4993
-邰 4994
-蚱 4995
-薜 4996
-牒 4997
-缥 4998
-咿 4999
-遐 5000
-蕙 5001
-惬 5002
-惚 5003
-硚 5004
-麽 5005
-踌 5006
-褂 5007
-蜉 5008
-蝣 5009
-腌 5010
-熘 5011
-缮 5012
-锢 5013
-犽 5014
-蹬 5015
-皈 5016
-剔 5017
-芪 5018
-妪 5019
-钇 5020
-仃 5021
-荏 5022
-苒 5023
-塾 5024
-阡 5025
-瑨 5026
-冢 5027
-匈 5028
-庶 5029
-荃 5030
-茬 5031
-妗 5032
-暹 5033
-犷 5034
-嵴 5035
-鳃 5036
-羲 5037
-岱 5038
-烩 5039
-勐 5040
-霁 5041
-厝 5042
-飚 5043
-瀛 5044
-炕 5045
-桅 5046
-垓 5047
-晌 5048
-黒 5049
-蚩 5050
-夔 5051
-垚 5052
-烊 5053
-眀 5054
-荼 5055
-蘼 5056
-尅 5057
-舫 5058
-拣 5059
-蹋 5060
-劭 5061
-耆 5062
-陡 5063
-樽 5064
-谒 5065
-觞 5066
-箩 5067
-槛 5068
-傈 5069
-僳 5070
-爻 5071
-皑 5072
-滘 5073
-嬅 5074
-丶 5075
-邋 5076
-遢 5077
-讴 5078
-隅 5079
-邃 5080
-谑 5081
-哔 5082
-矬 5083
-姣 5084
-凛 5085
-冽 5086
-殒 5087
-眈 5088
-鹧 5089
-鸪 5090
-飕 5091
-亘 5092
-篝 5093
-嘅 5094
-乜 5095
-黜 5096
-颇 5097
-鄄 5098
-蔫 5099
-贻 5100
-猝 5101
-绌 5102
-芈 5103
-隼 5104
-戆 5105
-鹫 5106
-霑 5107
-宕 5108
-凇 5109
-铨 5110
-町 5111
-礁 5112
-蕃 5113
-淖 5114
-搐 5115
-饴 5116
-榛 5117
-晔 5118
-祢 5119
-酥 5120
-丨 5121
-赂 5122
-噘 5123
-黍 5124
-幌 5125
-骅 5126
-黝 5127
-帧 5128
-胯 5129
-埙 5130
-敕 5131
-涓 5132
-掣 5133
-圪 5134
-榻 5135
-濛 5136
-擞 5137
-篡 5138
-榷 5139
-亟 5140
-渌 5141
-锹 5142
-啐 5143
-捍 5144
-嘣 5145
-跻 5146
-桠 5147
-贮 5148
-蛰 5149
-猖 5150
-骸 5151
-溉 5152
-铎 5153
-吁 5154
-溯 5155
-踞 5156
-俨 5157
-茌 5158
-蒯 5159
-篆 5160
-膺 5161
-垭 5162
-匮 5163
-撰 5164
-擂 5165
-倜 5166
-傥 5167
-蔑 5168
-弼 5169
-珮 5170
-颢 5171
-钿 5172
-迸 5173
-凿 5174
-湫 5175
-焯 5176
-硒 5177
-畈 5178
-觑 5179
-揶 5180
-禀 5181
-宦 5182
-杷 5183
-讷 5184
-踮 5185
-掳 5186
-窿 5187
-捻 5188
-褴 5189
-褛 5190
-瑛 5191
-胫 5192
-喋 5193
-沓 5194
-汛 5195
-掴 5196
-魉 5197
-馀 5198
-隗 5199
-咘 5200
-呱 5201
-獭 5202
-畊 5203
-莜 5204
-祐 5205
-轧 5206
-魍 5207
-昶 5208
-诓 5209
-囗 5210
-莠 5211
-岌 5212
-潸 5213
-涞 5214
-綦 5215
-畸 5216
-阄 5217
-遏 5218
-啶 5219
-冇 5220
-懋 5221
-煨 5222
-羱 5223
-诟 5224
-枳 5225
-鲶 5226
-燊 5227
-猷 5228
-铄 5229
-缰 5230
-搪 5231
-赊 5232
-诩 5233
-佼 5234
-钵 5235
-谌 5236
-嬗 5237
-砥 5238
-砺 5239
-觊 5240
-觎 5241
-颅 5242
-怵 5243
-疸 5244
-锆 5245
-缢 5246
-棣 5247
-蛎 5248
-鄯 5249
-茸 5250
-谶 5251
-蹶 5252
-侑 5253
-滂 5254
-襁 5255
-褓 5256
-杳 5257
-臊 5258
-摒 5259
-袂 5260
-掸 5261
-鹞 5262
-忱 5263
-湉 5264
-汩 5265
-剽 5266
-槌 5267
-塍 5268
-喟 5269
-讹 5270
-抡 5271
-烃 5272
-咁 5273
-珺 5274
-槎 5275
-砼 5276
-泯 5277
-泮 5278
-遴 5279
-匾 5280
-沏 5281
-悌 5282
-麾 5283
-垡 5284
-鏖 5285
-垅 5286
-斛 5287
-镂 5288
-骷 5289
-髅 5290
-豺 5291
-诿 5292
-狰 5293
-狞 5294
-泱 5295
-榫 5296
-嗤 5297
-瞥 5298
-揄 5299
-哌 5300
-婀 5301
-恸 5302
-蛐 5303
-镀 5304
-霈 5305
-钒 5306
-踱 5307
-淆 5308
-薹 5309
-纭 5310
-瘠 5311
-戾 5312
-夭 5313
-铰 5314
-渚 5315
-犇 5316
-舀 5317
-傣 5318
-獗 5319
-瞭 5320
-兢 5321
-犟 5322
-袒 5323
-铖 5324
-颚 5325
-徜 5326
-徉 5327
-囍 5328
-酆 5329
-铡 5330
-睽 5331
-裨 5332
-饕 5333
-躇 5334
-噱 5335
-赓 5336
-懊 5337
-蟊 5338
-趸 5339
-鄢 5340
-埝 5341
-椟 5342
-粳 5343
-跛 5344
-莴 5345
-娉 5346
-嗄 5347
-邙 5348
-渑 5349
-佶 5350
-颍 5351
-溆 5352
-诧 5353
-抨 5354
-憷 5355
-涠 5356
-痼 5357
-砀 5358
-剐 5359
-缙 5360
-鞑 5361
-坭 5362
-烬 5363
-唁 5364
-臼 5365
-瓮 5366
-袱 5367
-珩 5368
-蝌 5369
-蚪 5370
-诬 5371
-迥 5372
-楸 5373
-皿 5374
-蜷 5375
-遑 5376
-啖 5377
-篪 5378
-崮 5379
-讧 5380
-盹 5381
-瞑 5382
-鲳 5383
-谟 5384
-拮 5385
-琏 5386
-瞰 5387
-憩 5388
-馏 5389
-炷 5390
-眩 5391
-羿 5392
-洙 5393
-珲 5394
-愫 5395
-佯 5396
-舸 5397
-祎 5398
-旮 5399
-翌 5400
-畿 5401
-桎 5402
-梏 5403
-钳 5404
-鳍 5405
-犸 5406
-祉 5407
-缜 5408
-硌 5409
-殓 5410
-砾 5411
-酩 5412
-酊 5413
-兀 5414
-矸 5415
-髙 5416
-疝 5417
-膑 5418
-哂 5419
-僚 5420
-耷 5421
-窨 5422
-孳 5423
-鲠 5424
-淝 5425
-搡 5426
-伢 5427
-鲷 5428
-谕 5429
-頫 5430
-泺 5431
-谧 5432
-煳 5433
-萁 5434
-馕 5435
-鹌 5436
-鹑 5437
-钴 5438
-埇 5439
-摈 5440
-踵 5441
-冗 5442
-铣 5443
-萃 5444
-忤 5445
-揩 5446
-铧 5447
-矗 5448
-闾 5449
-柞 5450
-貉 5451
-撺 5452
-掇 5453
-灞 5454
-醍 5455
-痱 5456
-粲 5457
-糠 5458
-讣 5459
-蹴 5460
-茆 5461
-螈 5462
-旻 5463
-蔼 5464
-咣 5465
-麸 5466
-涝 5467
-渥 5468
-垤 5469
-咭 5470
-玳 5471
-瑁 5472
-郏 5473
-纂 5474
-扞 5475
-峭 5476
-铩 5477
-锨 5478
-坩 5479
-埚 5480
-瑭 5481
-札 5482
-舛 5483
-臬 5484
-郯 5485
-晦 5486
-耄 5487
-耋 5488
-俚 5489
-鲭 5490
-柩 5491
-黟 5492
-骼 5493
-蛆 5494
-跋 5495
-俸 5496
-幡 5497
-愕 5498
-噙 5499
-峋 5500
-厩 5501
-夯 5502
-擢 5503
-枋 5504
-葳 5505
-偃 5506
-赝 5507
-昝 5508
-镉 5509
-嫔 5510
-潋 5511
-娓 5512
-郅 5513
-瘀 5514
-奄 5515
-荇 5516
-咂 5517
-痉 5518
-挛 5519
-祚 5520
-庖 5521
-纰 5522
-簌 5523
-淬 5524
-掮 5525
-俟 5526
-臾 5527
-雒 5528
-吋 5529
-颧 5530
-嗔 5531
-诘 5532
-焘 5533
-獾 5534
-氤 5535
-氲 5536
-鲲 5537
-麂 5538
-罹 5539
-澍 5540
-镳 5541
-囱 5542
-玷 5543
-嗳 5544
-擘 5545
-濂 5546
-逡 5547
-骛 5548
-镔 5549
-湍 5550
-讥 5551
-蹁 5552
-跹 5553
-淦 5554
-骰 5555
-疃 5556
-腓 5557
-嵇 5558
-怄 5559
-谯 5560
-啕 5561
-坯 5562
-钎 5563
-锒 5564
-伉 5565
-佻 5566
-腴 5567
-怼 5568
-浐 5569
-摹 5570
-僮 5571
-芾 5572
-矍 5573
-泔 5574
-蚬 5575
-屐 5576
-翕 5577
-唿 5578
-苋 5579
-氪 5580
-楔 5581
-莪 5582
-掬 5583
-舷 5584
-骐 5585
-嗲 5586
-荻 5587
-缱 5588
-绻 5589
-嫚 5590
-铟 5591
-饷 5592
-醐 5593
-伫 5594
-澶 5595
-郇 5596
-蹚 5597
-藿 5598
-鳕 5599
-蝈 5600
-钯 5601
-铍 5602
-骠 5603
-盅 5604
-蜊 5605
-腭 5606
-谘 5607
-孛 5608
-豇 5609
-囫 5610
-囵 5611
-抿 5612
-楣 5613
-廾 5614
-貔 5615
-貅 5616
-蛉 5617
-猹 5618
-蚴 5619
-轱 5620
-葚 5621
-胗 5622
-鸮 5623
-篦 5624
-谆 5625
-篑 5626
-莅 5627
-砷 5628
-蝾 5629
-疴 5630
-葺 5631
-瘴 5632
-滹 5633
-砭 5634
-噌 5635
-鸾 5636
-珙 5637
-碣 5638
-餮 5639
-荸 5640
-荠 5641
-犄 5642
-歙 5643
-樾 5644
-淙 5645
-痢 5646
-濯 5647
-轫 5648
-琮 5649
-啜 5650
-闳 5651
-椁 5652
-蓼 5653
-垴 5654
-唷 5655
-炔 5656
-峁 5657
-囹 5658
-尕 5659
-嗪 5660
-缎 5661
-拚 5662
-稔 5663
-牍 5664
-赳 5665
-忪 5666
-菖 5667
-佃 5668
-埂 5669
-宓 5670
-瞠 5671
-洹 5672
-锲 5673
-睑 5674
-攫 5675
-竽 5676
-蹩 5677
-慜 5678
-锉 5679
-羧 5680
-崧 5681
-醺 5682
-舐 5683
-讫 5684
-熵 5685
-▁GONNA 5686
-瘢 5687
-秭 5688
-跄 5689
-绀 5690
-懑 5691
-弭 5692
-萋 5693
-篁 5694
-缛 5695
-茭 5696
-吠 5697
-鲑 5698
-幔 5699
-潺 5700
-鹈 5701
-鹕 5702
-椴 5703
-哕 5704
-剜 5705
-湎 5706
-玑 5707
-槃 5708
-暌 5709
-蹒 5710
-跚 5711
-恣 5712
-磬 5713
-悭 5714
-劾 5715
-唳 5716
-绉 5717
-枇 5718
-蜱 5719
-瞟 5720
-膈 5721
-磴 5722
-嶂 5723
-苫 5724
-邡 5725
-骈 5726
-惴 5727
-硖 5728
-鳜 5729
-羸 5730
-秣 5731
-殚 5732
-桷 5733
-罔 5734
-颦 5735
-桁 5736
-鸩 5737
-孱 5738
-伥 5739
-愎 5740
-圄 5741
-贲 5742
-旖 5743
-荥 5744
-徇 5745
-镌 5746
-偈 5747
-敝 5748
-刎 5749
-跬 5750
-欸 5751
-髌 5752
-椤 5753
-觥 5754
-踟 5755
-斡 5756
-陉 5757
-谡 5758
-龅 5759
-鸨 5760
-豢 5761
-豉 5762
-悻 5763
-曈 5764
-茼 5765
-谗 5766
-忖 5767
-牯 5768
-痂 5769
-虢 5770
-馓 5771
-跖 5772
-聿 5773
-箅 5774
-塅 5775
-丼 5776
-獐 5777
-肏 5778
-逄 5779
-钡 5780
-叒 5781
-霭 5782
-鲮 5783
-凫 5784
-鹥 5785
-鳙 5786
-玦 5787
-蒡 5788
-嘬 5789
-鹗 5790
-鬄 5791
-鎏 5792
-嘤 5793
-绦 5794
-涔 5795
-齑 5796
-蒌 5797
-墘 5798
-俠 5799
-蛭 5800
-薅 5801
-叕 5802
-砧 5803
-嘧 5804
-媺 5805
-蚵 5806
-楽 5807
-浄 5808
-厍 5809
-鳊 5810
-泂 5811
-龋 5812
-瓒 5813
-瑧 5814
-邨 5815
-峣 5816
-蚺 5817
-鲉 5818
-滟 5819
-堑 5820
-豳 5821
-骧 5822
-艹 5823
-柾 5824
-鬣 5825
-眦 5826
-畦 5827
-虬 5828
-睨 5829
-飨 5830
-蘖 5831
-羟 5832
-瓤 5833
-岫 5834
-惇 5835
-鲵 5836
-痦 5837
-笤 5838
-憙 5839
-痩 5840
-煋 5841
-媤 5842
-佤 5843
-羮 5844
-鏊 5845
-昇 5846
-蛱 5847
-珅 5848
-庋 5849
-搵 5850
-旸 5851
-岿 5852
-亓 5853
-揸 5854
-谂 5855
-淠 5856
-糅 5857
-儆 5858
-苕 5859
-刿 5860
-呒 5861
-岙 5862
-荜 5863
-玧 5864
-鄠 5865
-讬 5866
-祕 5867
-箦 5868
-醚 5869
-膻 5870
-笕 5871
-蛏 5872
-哞 5873
-饸 5874
-饹 5875
-愻 5876
-汫 5877
-鹇 5878
-栉 5879
-沇 5880
-擤 5881
-徳 5882
-黢 5883
-狍 5884
-錫 5885
-暝 5886
-機 5887
-鉅 5888
-菓 5889
-廋 5890
-橛 5891
-羣 5892
-笊 5893
-魃 5894
-掼 5895
-魑 5896
-靥 5897
-酔 5898
-铱 5899
-峄 5900
-哋 5901
-畹 5902
-鍪 5903
-髀 5904
-嚄 5905
-秾 5906
-苾 5907
-孓 5908
-汆 5909
-嗟 5910
-锺 5911
-睥 5912
-炝 5913
-怔 5914
-咛 5915
-巉 5916
-墒 5917
-岘 5918
-禛 5919
-陟 5920
-皲 5921
-萘 5922
-妣 5923
-芃 5924
-煸 5925
-郦 5926
-蒗 5927
-仝 5928
-抻 5929
-苜 5930
-蓿 5931
-鎵 5932
-減 5933
-燧 5934
-娭 5935
-毑 5936
-诂 5937
-烔 5938
-猗 5939
-哏 5940
-氙 5941
-匯 5942
-颛 5943
-鞣 5944
-笺 5945
-枖 5946
-忾 5947
-黉 5948
-埯 5949
-敩 5950
-玏 5951
-钺 5952
-纥 5953
-佈 5954
-沖 5955
-蚜 5956
-莨 5957
-菟 5958
-麇 5959
-沤 5960
-耦 5961
-赭 5962
-祊 5963
-璄 5964
-旼 5965
-浞 5966
-痄 5967
-蔸 5968
-璎 5969
-屛 5970
-錤 5971
-弢 5972
-绨 5973
-員 5974
-誕 5975
-祜 5976
-勍 5977
-浉 5978
-娑 5979
-呓 5980
-啉 5981
-嗐 5982
-弁 5983
-绺 5984
-撷 5985
-崑 5986
-诌 5987
-標 5988
-甯 5989
-俣 5990
-趔 5991
-趄 5992
-垠 5993
-赟 5994
-馐 5995
-畑 5996
-給 5997
-幣 5998
-產 5999
-恵 6000
-併 6001
-蒹 6002
-葭 6003
-後 6004
-瀍 6005
-愠 6006
-莛 6007
-蝰 6008
-鹮 6009
-逶 6010
-侪 6011
-蒽 6012
-巽 6013
-瓴 6014
-鲱 6015
-薙 6016
-過 6017
-億 6018
-車 6019
-鲇 6020
-淨 6021
-嗎 6022
-诨 6023
-靚 6024
-內 6025
-糁 6026
-錾 6027
-刈 6028
-滯 6029
-炆 6030
-徂 6031
-傩 6032
-鲺 6033
-叟 6034
-埗 6035
-篠 6036
-焐 6037
-暻 6038
-盃 6039
-髻 6040
-樘 6041
-墈 6042
-菉 6043
-巯 6044
-嘌 6045
-遒 6046
-鼋 6047
-匍 6048
-匐 6049
-臜 6050
-馔 6051
-鲎 6052
-獠 6053
-蟇 6054
-栄 6055
-騎 6056
-賽 6057
-場 6058
-幾 6059
-鐘 6060
-镛 6061
-鸶 6062
-镆 6063
-窸 6064
-庠 6065
-蒺 6066
-溱 6067
-倮 6068
-楪 6069
-帀 6070
-躶 6071
-洰 6072
-圉 6073
-圊 6074
-捨 6075
-谝 6076
-呔 6077
-勖 6078
-揖 6079
-喈 6080
-霰 6081
-觋 6082
-嫪 6083
-毐 6084
-繇 6085
-珐 6086
-馃 6087
-孃 6088
-逖 6089
-骶 6090
-喬 6091
-奧 6092
-風 6093
-裵 6094
-胍 6095
-確 6096
-揠 6097
-榀 6098
-聒 6099
-谪 6100
-歘 6101
-粿 6102
-舾 6103
-聩 6104
-嫘 6105
-砟 6106
-侉 6107
-捯 6108
-饬 6109
-囏 6110
-喙 6111
-笥 6112
-燿 6113
-鮀 6114
-芡 6115
-蛄 6116
-铳 6117
-挲 6118
-笞 6119
-廿 6120
-蠹 6121
-湋 6122
-暎 6123
-霙 6124
-颔 6125
-苁 6126
-啮 6127
-囖 6128
-寤 6129
-炟 6130
-乩 6131
-熥 6132
-桡 6133
-阈 6134
-孖 6135
-鐢 6136
-衾 6137
-怍 6138
-沆 6139
-囿 6140
-胬 6141
-陲 6142
-缦 6143
-誇 6144
-醮 6145
-箬 6146
-盥 6147
-鹘 6148
-诳 6149
-氡 6150
-狎 6151
-枧 6152
-谄 6153
-芣 6154
-苢 6155
-俤 6156
-誊 6157
-殄 6158
-辋 6159
-係 6160
-迤 6161
-謦 6162
-簰 6163
-滓 6164
-嬢 6165
-倏 6166
-睺 6167
-滏 6168
-脘 6169
-嗙 6170
-谥 6171
-歃 6172
-锃 6173
-欻 6174
-挼 6175
-襙 6176
-檄 6177
-龇 6178
-楫 6179
-咵 6180
-徭 6181
-闱 6182
-嚅 6183
-鳑 6184
-鲏 6185
-佞 6186
-箜 6187
-篌 6188
-蹑 6189
-喑 6190
-胄 6191
-鞥 6192
-蟥 6193
-骢 6194
-蹙 6195
-柰 6196
-蕤 6197
-癸 6198
-哙 6199
-睚 6200
-绾 6201
-篾 6202
-鳏 6203
-谲 6204
-袤 6205
-翳 6206
-蹰 6207
-槊 6208
-黠 6209
-姒 6210
-锱 6211
-猢 6212
-狲 6213
-粝 6214
-戕 6215
-茕 6216
-瀣 6217
-踽 6218
-绶 6219
-媾 6220
-舢 6221
-螯 6222
-茏 6223
-廪 6224
-诰 6225
-辇 6226
-琚 6227
-汜 6228
-洇 6229
-還 6230
-遽 6231
-槁 6232
-靼 6233
-髡 6234
-鸬 6235
-鹚 6236
-捭 6237
-黩 6238
-俶 6239
-個 6240
-圜 6241
-颞 6242
-苻 6243
-恽 6244
-腧 6245
-甾 6246
-辎 6247
-顼 6248
-阗 6249
-鬻 6250
-鬶 6251
-沔 6252
-狃 6253
-#0 6254
-#1 6255
-#2 6256
diff --git a/egs/aishell/ASR/seamlessm4t/train.py b/egs/aishell/ASR/seamlessm4t/train.py
deleted file mode 100644
index 4802473c9..000000000
--- a/egs/aishell/ASR/seamlessm4t/train.py
+++ /dev/null
@@ -1,1254 +0,0 @@
-#!/usr/bin/env python3
-# Copyright 2023 Xiaomi Corp. (authors: Xiaoyu Yang)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Usage:
-
-./prepare.sh
-
-export CUDA_VISIBLE_DEVICES="0,1,2,3"
-
-./seamlessm4t/train.py \
-  --num-epochs 30 \
-  --start-epoch 1 \
-  --use-fp16 1 \
-  --exp-dir seamlessm4t/exp \
-  --max-duration 550
-"""
-
-
-import argparse
-import copy
-import logging
-import random
-import warnings
-from pathlib import Path
-from shutil import copyfile
-from typing import Any, Dict, Optional, Tuple, Union
-
-import k2
-import optim
-import torch
-import torch.multiprocessing as mp
-import torch.nn as nn
-from typing import List
-#from aishell import AIShell
-#from asr_datamodule import AsrDataModule
-from asr_datamodule import AishellAsrDataModule
-#from decoder import Decoder
-#from joiner import Joiner
-from lhotse import CutSet, load_manifest
-from lhotse.cut import Cut
-from lhotse.dataset.sampling.base import CutSampler
-from lhotse.utils import fix_random_seed
-#from model import Transducer
-from optim import Eden, ScaledAdam
-from torch import Tensor
-from torch.cuda.amp import GradScaler
-from torch.nn.parallel import DistributedDataParallel as DDP
-from torch.nn.functional import pad as pad_tensor
-from torch.utils.tensorboard import SummaryWriter
-#from zipformer import Zipformer
-
-from icefall import diagnostics
-#from icefall.char_graph_compiler import CharCtcTrainingGraphCompiler
-from icefall.checkpoint import load_checkpoint, remove_checkpoints
-from icefall.checkpoint import save_checkpoint as save_checkpoint_impl
-from icefall.checkpoint import (
- save_checkpoint_with_global_batch_idx,
- update_averaged_model,
-)
-from icefall.dist import cleanup_dist, setup_dist, get_world_size, get_rank, get_local_rank
-from icefall.env import get_env_info
-from icefall.hooks import register_inf_check_hooks
-from icefall.lexicon import Lexicon
-from icefall.utils import (
- AttributeDict,
- MetricsTracker,
- filter_uneven_sized_batch,
- setup_logger,
- str2bool,
-)
-
-from seamless_communication.models.unity import (
- UnitTokenizer,
- UnitYModel,
- load_unity_model,
- load_unity_text_tokenizer,
- load_unity_unit_tokenizer,
-)
-from fairseq2.generation import (
- Seq2SeqGenerator,
- SequenceGeneratorOptions,
- SequenceGeneratorOutput,
- SequenceToTextGenerator,
- SequenceToTextOutput,
-)
-from fairseq2.data.text import (
- SentencePieceDecoder,
- SentencePieceEncoder,
- SentencePieceModel,
- TextTokenDecoder,
- TextTokenEncoder,
- TextTokenizer,
- vocabulary_from_sentencepiece,
-)
-
-from label_smoothing import LabelSmoothingLoss
-
-LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
-
-
-def set_batch_count(model: Union[nn.Module, DDP], batch_count: float) -> None:
- if isinstance(model, DDP):
- # get underlying nn.Module
- model = model.module
- for module in model.modules():
- if hasattr(module, "batch_count"):
- module.batch_count = batch_count
-
-
-def add_model_arguments(parser: argparse.ArgumentParser):
- parser.add_argument(
- "--num-encoder-layers",
- type=str,
- default="2,4,3,2,4",
- help="Number of zipformer encoder layers, comma separated.",
- )
-
- parser.add_argument(
- "--feedforward-dims",
- type=str,
- default="1024,1024,2048,2048,1024",
- help="Feedforward dimension of the zipformer encoder layers, comma separated.",
- )
-
- parser.add_argument(
- "--nhead",
- type=str,
- default="8,8,8,8,8",
- help="Number of attention heads in the zipformer encoder layers.",
- )
-
- parser.add_argument(
- "--encoder-dims",
- type=str,
- default="384,384,384,384,384",
- help="Embedding dimension in the 2 blocks of zipformer encoder layers, comma separated",
- )
-
- parser.add_argument(
- "--attention-dims",
- type=str,
- default="192,192,192,192,192",
- help="""Attention dimension in the 2 blocks of zipformer encoder layers, comma separated;
- not the same as embedding dimension.""",
- )
-
- parser.add_argument(
- "--encoder-unmasked-dims",
- type=str,
- default="256,256,256,256,256",
- help="Unmasked dimensions in the encoders, relates to augmentation during training. "
-        "Must be <= each of encoder_dims. Empirically, less than 256 seems to make "
-        "performance worse.",
- )
-
- parser.add_argument(
- "--zipformer-downsampling-factors",
- type=str,
- default="1,2,4,8,2",
- help="Downsampling factor for each stack of encoder layers.",
- )
-
- parser.add_argument(
- "--cnn-module-kernels",
- type=str,
- default="31,31,31,31,31",
- help="Sizes of kernels in convolution modules",
- )
-
- parser.add_argument(
- "--decoder-dim",
- type=int,
- default=512,
- help="Embedding dimension in the decoder model.",
- )
-
- parser.add_argument(
- "--joiner-dim",
- type=int,
- default=512,
- help="""Dimension used in the joiner model.
- Outputs from the encoder and decoder model are projected
- to this dimension before adding.
- """,
- )
-
-
-def get_parser():
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter
- )
-
- parser.add_argument(
- "--master-port",
- type=int,
- default=12354,
- help="Master port to use for DDP training.",
- )
-
- parser.add_argument(
- "--tensorboard",
- type=str2bool,
- default=True,
- help="Should various information be logged in tensorboard.",
- )
-
- parser.add_argument(
- "--num-epochs",
- type=int,
- default=30,
- help="Number of epochs to train.",
- )
-
- parser.add_argument(
- "--start-epoch",
- type=int,
- default=1,
- help="""Resume training from this epoch. It should be positive.
- If larger than 1, it will load checkpoint from
- exp-dir/epoch-{start_epoch-1}.pt
- """,
- )
-
- parser.add_argument(
- "--start-batch",
- type=int,
- default=0,
- help="""If positive, --start-epoch is ignored and
- it loads the checkpoint from exp-dir/checkpoint-{start_batch}.pt
- """,
- )
-
- parser.add_argument(
- "--exp-dir",
- type=str,
- default="pruned_transducer_stateless7/exp",
- help="""The experiment dir.
- It specifies the directory where all training related
- files, e.g., checkpoints, log, etc, are saved
- """,
- )
-
- parser.add_argument(
- "--lang-dir",
- type=str,
- default="data/lang_char",
- help="""The lang dir
- It contains language related input files such as
- "lexicon.txt"
- """,
- )
-
- parser.add_argument(
- "--base-lr", type=float, default=0.05, help="The base learning rate."
- )
-
- parser.add_argument(
- "--lr-batches",
- type=float,
- default=5000,
- help="""Number of steps that affects how rapidly the learning rate
- decreases. We suggest not to change this.""",
- )
-
- parser.add_argument(
- "--lr-epochs",
- type=float,
- default=6,
- help="""Number of epochs that affects how rapidly the learning rate decreases.
- """,
- )
-
- parser.add_argument(
- "--context-size",
- type=int,
- default=1,
- help="The context size in the decoder. 1 means bigram; 2 means tri-gram",
- )
-
- parser.add_argument(
- "--prune-range",
- type=int,
- default=5,
-        help="The prune range for rnnt loss, it means how many symbols (of context) "
-        "we are using to compute the loss",
- )
-
- parser.add_argument(
- "--lm-scale",
- type=float,
- default=0.25,
- help="The scale to smooth the loss with lm "
- "(output of prediction network) part.",
- )
-
- parser.add_argument(
- "--am-scale",
- type=float,
- default=0.0,
- help="The scale to smooth the loss with am (output of encoder network) part.",
- )
-
- parser.add_argument(
- "--simple-loss-scale",
- type=float,
- default=0.5,
-        help="To get pruning ranges, we will calculate a simple version of the "
-        "loss (the joiner is just an addition); this simple loss is also used for "
-        "training (as a regularization term). We will scale the simple loss "
-        "with this parameter before adding it to the final loss.",
- )
-
- parser.add_argument(
- "--seed",
- type=int,
- default=42,
- help="The seed for random generators intended for reproducibility",
- )
-
- parser.add_argument(
- "--print-diagnostics",
- type=str2bool,
- default=False,
- help="Accumulate stats on activations, print them and exit.",
- )
-
- parser.add_argument(
- "--inf-check",
- type=str2bool,
- default=False,
- help="Add hooks to check for infinite module outputs and gradients.",
- )
-
- parser.add_argument(
- "--save-every-n",
- type=int,
- default=4000,
-        help="""Save checkpoint after processing this number of batches
- periodically. We save checkpoint to exp-dir/ whenever
- params.batch_idx_train % save_every_n == 0. The checkpoint filename
- has the form: f'exp-dir/checkpoint-{params.batch_idx_train}.pt'
- Note: It also saves checkpoint to `exp-dir/epoch-xxx.pt` at the
-        end of each epoch where `xxx` is the epoch number counting from 1.
- """,
- )
-
- parser.add_argument(
- "--keep-last-k",
- type=int,
- default=30,
- help="""Only keep this number of checkpoints on disk.
- For instance, if it is 3, there are only 3 checkpoints
- in the exp-dir with filenames `checkpoint-xxx.pt`.
- It does not affect checkpoints with name `epoch-xxx.pt`.
- """,
- )
-
- parser.add_argument(
- "--average-period",
- type=int,
- default=200,
- help="""Update the averaged model, namely `model_avg`, after processing
- this number of batches. `model_avg` is a separate version of model,
- in which each floating-point parameter is the average of all the
- parameters from the start of training. Each time we take the average,
- we do: `model_avg = model * (average_period / batch_idx_train) +
- model_avg * ((batch_idx_train - average_period) / batch_idx_train)`.
- """,
- )
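-    # A minimal numeric illustration of the averaging rule described above
-    # (hypothetical values, not taken from any run): with --average-period 200,
-    # the k-th update happens at batch_idx_train = 200 * k, so the weights are
-    #     model:     200 / (200 * k) = 1 / k
-    #     model_avg: (200 * k - 200) / (200 * k) = (k - 1) / k
-    # i.e. `model_avg` is (approximately) the uniform average of the model
-    # snapshots taken every `average-period` batches since the start of training.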
-
- parser.add_argument(
- "--use-fp16",
- type=str2bool,
- default=False,
- help="Whether to use half precision training.",
- )
-
- add_model_arguments(parser)
-
- return parser
-
-
-def get_params() -> AttributeDict:
- """Return a dict containing training parameters.
-
- All training related parameters that are not passed from the commandline
- are saved in the variable `params`.
-
- Commandline options are merged into `params` after they are parsed, so
- you can also access them via `params`.
-
- Explanation of options saved in `params`:
-
- - best_train_loss: Best training loss so far. It is used to select
- the model that has the lowest training loss. It is
- updated during the training.
-
- - best_valid_loss: Best validation loss so far. It is used to select
- the model that has the lowest validation loss. It is
- updated during the training.
-
- - best_train_epoch: It is the epoch that has the best training loss.
-
- - best_valid_epoch: It is the epoch that has the best validation loss.
-
-    - batch_idx_train: Used for writing statistics to tensorboard. It
-        contains the number of batches trained so far across
-        epochs.
-
-    - log_interval: Print training loss if batch_idx % log_interval is 0
-
- - reset_interval: Reset statistics if batch_idx % reset_interval is 0
-
- - valid_interval: Run validation if batch_idx % valid_interval is 0
-
- - feature_dim: The model input dim. It has to match the one used
- in computing features.
-
- - subsampling_factor: The subsampling factor for the model.
-
- - encoder_dim: Hidden dim for multi-head attention model.
-
- - num_decoder_layers: Number of decoder layer of transformer decoder.
-
- - warm_step: The warmup period that dictates the decay of the
- scale on "simple" (un-pruned) loss.
- """
- params = AttributeDict(
- {
- "frame_shift_ms": 10.0,
- "allowed_excess_duration_ratio": 0.1,
- "best_train_loss": float("inf"),
- "best_valid_loss": float("inf"),
- "best_train_epoch": -1,
- "best_valid_epoch": -1,
- "batch_idx_train": 0,
- "log_interval": 50,
- "reset_interval": 200,
- "valid_interval": 3000, # For the 100h subset, use 800
- # parameters for zipformer
- "feature_dim": 80,
- "subsampling_factor": 4, # not passed in, this is fixed.
- "warm_step": 100,
- "env_info": get_env_info(),
- }
- )
-
- return params
-
-
-# def get_transducer_model(params: AttributeDict) -> nn.Module:
-# encoder = get_encoder_model(params)
-# decoder = get_decoder_model(params)
-# joiner = get_joiner_model(params)
-
-# model = Transducer(
-# encoder=encoder,
-# decoder=decoder,
-# joiner=joiner,
-# encoder_dim=int(params.encoder_dims.split(",")[-1]),
-# decoder_dim=params.decoder_dim,
-# joiner_dim=params.joiner_dim,
-# vocab_size=params.vocab_size,
-# )
-# return model
-
-
-def load_checkpoint_if_available(
- params: AttributeDict,
- model: nn.Module,
- model_avg: nn.Module = None,
- optimizer: Optional[torch.optim.Optimizer] = None,
- scheduler: Optional[LRSchedulerType] = None,
-) -> Optional[Dict[str, Any]]:
- """Load checkpoint from file.
-
- If params.start_batch is positive, it will load the checkpoint from
- `params.exp_dir/checkpoint-{params.start_batch}.pt`. Otherwise, if
- params.start_epoch is larger than 1, it will load the checkpoint from
- `params.start_epoch - 1`.
-
- Apart from loading state dict for `model` and `optimizer` it also updates
- `best_train_epoch`, `best_train_loss`, `best_valid_epoch`,
- and `best_valid_loss` in `params`.
-
- Args:
- params:
- The return value of :func:`get_params`.
- model:
- The training model.
- model_avg:
- The stored model averaged from the start of training.
- optimizer:
- The optimizer that we are using.
- scheduler:
- The scheduler that we are using.
- Returns:
- Return a dict containing previously saved training info.
- """
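-    # For example (hypothetical values): with --start-batch 0 and --start-epoch 5,
-    # training resumes from `exp-dir/epoch-4.pt`; with --start-batch 8000 it would
-    # instead resume from `exp-dir/checkpoint-8000.pt`, regardless of --start-epoch.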
- if params.start_batch > 0:
- filename = params.exp_dir / f"checkpoint-{params.start_batch}.pt"
- elif params.start_epoch > 1:
- filename = params.exp_dir / f"epoch-{params.start_epoch-1}.pt"
- else:
- return None
-
- assert filename.is_file(), f"{filename} does not exist!"
-
- saved_params = load_checkpoint(
- filename,
- model=model,
- model_avg=model_avg,
- optimizer=optimizer,
- scheduler=scheduler,
- )
-
- keys = [
- "best_train_epoch",
- "best_valid_epoch",
- "batch_idx_train",
- "best_train_loss",
- "best_valid_loss",
- ]
- for k in keys:
- params[k] = saved_params[k]
-
- if params.start_batch > 0:
- if "cur_epoch" in saved_params:
- params["start_epoch"] = saved_params["cur_epoch"]
-
- return saved_params
-
-
-def save_checkpoint(
- params: AttributeDict,
- model: Union[nn.Module, DDP],
- model_avg: Optional[nn.Module] = None,
- optimizer: Optional[torch.optim.Optimizer] = None,
- scheduler: Optional[LRSchedulerType] = None,
- sampler: Optional[CutSampler] = None,
- scaler: Optional[GradScaler] = None,
- rank: int = 0,
-) -> None:
- """Save model, optimizer, scheduler and training stats to file.
-
- Args:
- params:
- It is returned by :func:`get_params`.
- model:
- The training model.
- model_avg:
- The stored model averaged from the start of training.
- optimizer:
- The optimizer used in the training.
- sampler:
- The sampler for the training dataset.
- scaler:
-        The scaler used for mixed precision training.
- """
- if rank != 0:
- return
- filename = params.exp_dir / f"epoch-{params.cur_epoch}.pt"
- save_checkpoint_impl(
- filename=filename,
- model=model,
- model_avg=model_avg,
- params=params,
- optimizer=optimizer,
- scheduler=scheduler,
- sampler=sampler,
- scaler=scaler,
- rank=rank,
- )
-
- if params.best_train_epoch == params.cur_epoch:
- best_train_filename = params.exp_dir / "best-train-loss.pt"
- copyfile(src=filename, dst=best_train_filename)
-
- if params.best_valid_epoch == params.cur_epoch:
- best_valid_filename = params.exp_dir / "best-valid-loss.pt"
- copyfile(src=filename, dst=best_valid_filename)
-
-def compute_loss(
- params: AttributeDict,
- model: Union[nn.Module, DDP],
- text_tokenizer_encoder: SentencePieceEncoder,
- batch: dict,
- is_training: bool,
-) -> Tuple[Tensor, MetricsTracker]:
- """
-    Compute the label-smoothed cross-entropy loss of the text decoder, given
-    the model and its inputs.
-
-    Args:
-      params:
-        Parameters for training. See :func:`get_params`.
-      model:
-        The model for training. In our case it is a SeamlessM4T (UnitY)
-        speech-to-text model.
-      text_tokenizer_encoder:
-        The SentencePiece encoder used to convert transcripts into token ids.
-      batch:
-        A batch of data. See `lhotse.dataset.K2SpeechRecognitionDataset()`
-        for the content in it.
-      is_training:
-        True for training. False for validation. When it is True, this
-        function enables autograd during computation; when it is False, it
-        disables autograd.
- """
- # For the uneven-sized batch, the total duration after padding would possibly
- # cause OOM. Hence, for each batch, which is sorted descendingly by length,
- # we simply drop the last few shortest samples, so that the retained total frames
- # (after padding) would not exceed `allowed_max_frames`:
- # `allowed_max_frames = int(max_frames * (1.0 + allowed_excess_duration_ratio))`,
- # where `max_frames = max_duration * 1000 // frame_shift_ms`.
- # We set allowed_excess_duration_ratio=0.1.
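-    # Worked example (hypothetical values, assuming --max-duration 300 and the
-    # default frame_shift_ms=10.0): max_frames = 300 * 1000 // 10 = 30000 frames,
-    # so allowed_max_frames = int(30000 * 1.1) = 33000 frames after padding.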
- if isinstance(model, DDP):
- # get underlying nn.Module
- model = model.module
- def _batch_tensors(tensors: List[Tensor], pad_value: Any) -> Tensor:
- padding_size = max(tensor.shape[0] for tensor in tensors)
- dims = len(tensors[0].shape)
- padded_tensors = []
- for tensor in tensors:
- padding = [0] * 2 * dims
- padding[-1] = padding_size - tensor.shape[0]
- padded_tensors.append(pad_tensor(tensor, padding, "constant", pad_value))
- return torch.stack([tensor for tensor in padded_tensors], dim=0)
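-    # e.g. _batch_tensors([tensor([1, 2, 3]), tensor([4, 5])], pad_value=0)
-    # returns tensor([[1, 2, 3], [4, 5, 0]]) of shape (2, 3): each sequence is
-    # right-padded with `pad_value` to the longest length and then stacked.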
-
- max_frames = params.max_duration * 1000 // params.frame_shift_ms
- allowed_max_frames = int(max_frames * (1.0 + params.allowed_excess_duration_ratio))
- batch = filter_uneven_sized_batch(batch, allowed_max_frames)
-
- device = model.device if isinstance(model, DDP) else next(model.parameters()).device
- feature = batch["inputs"]
- # at entry, feature is (N, T, C)
- assert feature.ndim == 3
- feature = feature.to(device)
-
- supervisions = batch["supervisions"]
- feature_lens = supervisions["num_frames"].to(device)
-
- batch_idx_train = params.batch_idx_train
- warm_step = params.warm_step
-
- texts = batch["supervisions"]["text"]
- text_tokens_list = [text_tokenizer_encoder(text) for text in texts]
- prev_outputs_tokens = _batch_tensors(
- [tokens[:-1] for tokens in text_tokens_list], pad_value=params.pad_idx
- )
- target_tokens = _batch_tensors(
- [tokens[1:] for tokens in text_tokens_list], pad_value=params.pad_idx
- )
- target_lengths = torch.LongTensor(
- [tokens.shape[0] - 1 for tokens in text_tokens_list]
- )
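-    # Standard teacher forcing: if an utterance encodes to tokens [t0, t1, ..., tN],
-    # the decoder is fed [t0, ..., t(N-1)] and trained to predict [t1, ..., tN],
-    # with shorter sequences padded to the batch maximum using `pad_idx`.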
- decoder_criterion = LabelSmoothingLoss(ignore_index=params.pad_idx, label_smoothing=0.1, reduction="sum")
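-    # Roughly speaking, the criterion above keeps probability mass 1 - 0.1 = 0.9
-    # on the reference token and spreads the remaining 0.1 over the rest of the
-    # vocabulary; positions whose target equals `pad_idx` do not contribute.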
-    ignore_prefix_size = 1  # ignore the lang code prediction
-
- with torch.set_grad_enabled(is_training):
- speech_encoder_out, speech_encoder_padding_mask = model.encode_speech(
- seqs=feature,
- seq_lens=feature_lens,
- )
- #assert batch.speech_to_text.prev_output_tokens is not None
- text_decoder_out, text_decoder_padding_mask = model.decode(
- seqs=prev_outputs_tokens.to(device),
- seq_lens=target_lengths.to(device),
- encoder_output=speech_encoder_out,
- encoder_padding_mask=speech_encoder_padding_mask,
- )
- text_logits = model.final_proj(text_decoder_out)
- text_logits = text_logits[:, ignore_prefix_size:, :]
- target_tokens = target_tokens[:, ignore_prefix_size:]
- loss = decoder_criterion(text_logits, target_tokens.to(device))
-
- assert loss.requires_grad == is_training
-
- info = MetricsTracker()
- with warnings.catch_warnings():
- warnings.simplefilter("ignore")
- info["frames"] = (feature_lens // params.subsampling_factor).sum().item()
-
- # Note: We use reduction=sum while computing the loss.
- info["loss"] = loss.detach().cpu().item()
-
- return loss, info
-
-
-def compute_validation_loss(
- params: AttributeDict,
- model: Union[nn.Module, DDP],
- text_tokenizer_encoder: SentencePieceEncoder,
- valid_dl: torch.utils.data.DataLoader,
- world_size: int = 1,
-) -> MetricsTracker:
- """Run the validation process."""
- model.eval()
-
- tot_loss = MetricsTracker()
-
- for batch_idx, batch in enumerate(valid_dl):
- loss, loss_info = compute_loss(
- params=params,
- model=model,
- text_tokenizer_encoder=text_tokenizer_encoder,
- batch=batch,
- is_training=False,
- )
- assert loss.requires_grad is False
- tot_loss = tot_loss + loss_info
-
- if world_size > 1:
- tot_loss.reduce(loss.device)
-
- loss_value = tot_loss["loss"] / tot_loss["frames"]
- if loss_value < params.best_valid_loss:
- params.best_valid_epoch = params.cur_epoch
- params.best_valid_loss = loss_value
-
- return tot_loss
-
-
-def train_one_epoch(
- params: AttributeDict,
- model: Union[nn.Module, DDP],
- optimizer: torch.optim.Optimizer,
- scheduler: LRSchedulerType,
- text_tokenizer_encoder: SentencePieceEncoder,
- train_dl: torch.utils.data.DataLoader,
- valid_dl: torch.utils.data.DataLoader,
- scaler: GradScaler,
- model_avg: Optional[nn.Module] = None,
- tb_writer: Optional[SummaryWriter] = None,
- world_size: int = 1,
- rank: int = 0,
-) -> None:
- """Train the model for one epoch.
-
- The training loss from the mean of all frames is saved in
- `params.train_loss`. It runs the validation process every
- `params.valid_interval` batches.
-
- Args:
- params:
- It is returned by :func:`get_params`.
- model:
- The model for training.
- optimizer:
- The optimizer we are using.
- scheduler:
- The learning rate scheduler, we call step() every step.
- train_dl:
- Dataloader for the training dataset.
- valid_dl:
- Dataloader for the validation dataset.
- scaler:
-        The scaler used for mixed precision training.
- model_avg:
- The stored model averaged from the start of training.
- tb_writer:
- Writer to write log messages to tensorboard.
- world_size:
- Number of nodes in DDP training. If it is 1, DDP is disabled.
- rank:
- The rank of the node in DDP training. If no DDP is used, it should
- be set to 0.
- """
- model.train()
-
- tot_loss = MetricsTracker()
-
- for batch_idx, batch in enumerate(train_dl):
- params.batch_idx_train += 1
- batch_size = len(batch["supervisions"]["text"])
-
- try:
- with torch.cuda.amp.autocast(enabled=params.use_fp16):
- loss, loss_info = compute_loss(
- params=params,
- model=model,
- text_tokenizer_encoder=text_tokenizer_encoder,
- batch=batch,
- is_training=True,
- )
- # summary stats
- tot_loss = (tot_loss * (1 - 1 / params.reset_interval)) + loss_info
-
- # NOTE: We use reduction==sum and loss is computed over utterances
- # in the batch and there is no normalization to it so far.
- scaler.scale(loss).backward()
- set_batch_count(model, params.batch_idx_train)
- scheduler.step_batch(params.batch_idx_train)
-
- scaler.step(optimizer)
- scaler.update()
- optimizer.zero_grad()
- except: # noqa
- display_and_save_batch(batch, params=params)
- raise
-
- if params.print_diagnostics and batch_idx == 5:
- return
-
- if (
- rank == 0
- and params.batch_idx_train > 0
- and params.batch_idx_train % params.average_period == 0
- ):
- update_averaged_model(
- params=params,
- model_cur=model,
- model_avg=model_avg,
- )
-
- if (
- params.batch_idx_train > 0
- and params.batch_idx_train % params.save_every_n == 0
- ):
- save_checkpoint_with_global_batch_idx(
- out_dir=params.exp_dir,
- global_batch_idx=params.batch_idx_train,
- model=model,
- model_avg=model_avg,
- params=params,
- optimizer=optimizer,
- scheduler=scheduler,
- sampler=train_dl.sampler,
- scaler=scaler,
- rank=rank,
- )
- remove_checkpoints(
- out_dir=params.exp_dir,
- topk=params.keep_last_k,
- rank=rank,
- )
-
- if batch_idx % 100 == 0 and params.use_fp16:
- # If the grad scale was less than 1, try increasing it. The _growth_interval
- # of the grad scaler is configurable, but we can't configure it to have different
- # behavior depending on the current grad scale.
- cur_grad_scale = scaler._scale.item()
- if cur_grad_scale < 1.0 or (cur_grad_scale < 8.0 and batch_idx % 400 == 0):
- scaler.update(cur_grad_scale * 2.0)
- if cur_grad_scale < 0.01:
- logging.warning(f"Grad scale is small: {cur_grad_scale}")
- if cur_grad_scale < 1.0e-05:
- raise RuntimeError(
- f"grad_scale is too small, exiting: {cur_grad_scale}"
- )
- if batch_idx % params.log_interval == 0:
- cur_lr = scheduler.get_last_lr()[0]
- cur_grad_scale = scaler._scale.item() if params.use_fp16 else 1.0
-
- logging.info(
- f"Epoch {params.cur_epoch}, "
- f"batch {batch_idx}, loss[{loss_info}], "
- f"tot_loss[{tot_loss}], batch size: {batch_size}, "
- f"lr: {cur_lr:.2e}, "
- + (f"grad_scale: {scaler._scale.item()}" if params.use_fp16 else "")
- )
-
- if tb_writer is not None:
- tb_writer.add_scalar(
- "train/learning_rate", cur_lr, params.batch_idx_train
- )
-
- loss_info.write_summary(
- tb_writer, "train/current_", params.batch_idx_train
- )
- tot_loss.write_summary(tb_writer, "train/tot_", params.batch_idx_train)
- if params.use_fp16:
- tb_writer.add_scalar(
- "train/grad_scale",
- cur_grad_scale,
- params.batch_idx_train,
- )
-
- if batch_idx % params.valid_interval == 0 and not params.print_diagnostics:
- logging.info("Computing validation loss")
- valid_info = compute_validation_loss(
- params=params,
- model=model,
- text_tokenizer_encoder=text_tokenizer_encoder,
- valid_dl=valid_dl,
- world_size=world_size,
- )
- model.train()
- logging.info(f"Epoch {params.cur_epoch}, validation: {valid_info}")
- logging.info(
- f"Maximum memory allocated so far is {torch.cuda.max_memory_allocated()//1000000}MB"
- )
- if tb_writer is not None:
- valid_info.write_summary(
- tb_writer, "train/valid_", params.batch_idx_train
- )
-
- loss_value = tot_loss["loss"] / tot_loss["frames"]
- params.train_loss = loss_value
- if params.train_loss < params.best_train_loss:
- params.best_train_epoch = params.cur_epoch
- params.best_train_loss = params.train_loss
-
-
-def run(rank, world_size, args):
- """
- Args:
- rank:
- It is a value between 0 and `world_size-1`, which is
- read from the environment (e.g., when launched with torchrun)
- in :func:`main`. The process with rank 0 is responsible for
- saving checkpoints.
- world_size:
- Number of GPUs for DDP training.
- args:
- The return value of get_parser().parse_args()
- """
- params = get_params()
- params.update(vars(args))
-
- fix_random_seed(params.seed)
- # rank = get_rank()
- # world_size = get_world_size()
- # setup_dist(rank, world_size, use_ddp_launch=True)
- setup_dist(use_ddp_launch=True)
-
- setup_logger(f"{params.exp_dir}/log/log-train")
- logging.info("Training started")
-
- if args.tensorboard and rank == 0:
- tb_writer = SummaryWriter(log_dir=f"{params.exp_dir}/tensorboard")
- else:
- tb_writer = None
-
- device = torch.device("cpu")
- if torch.cuda.is_available():
- device = torch.device("cuda", rank)
- logging.info(f"Device: {device}")
-
- logging.info("About to create model")
- model_name_or_card = "seamlessM4T_medium"
- lang = "cmn"
- model = load_unity_model(model_name_or_card, device="cpu", dtype=torch.float32)
- del model.t2u_model
- del model.text_encoder
- del model.text_encoder_frontend
- # print(vars(model))
- # exit(0)
- text_tokenizer = load_unity_text_tokenizer(model_name_or_card)
- text_tokenizer_encoder = SentencePieceEncoder(
- text_tokenizer.model,
- prefix_tokens=["", f"__{lang}__"],
- suffix_tokens=[""],
- )
- #params.eos_idx = text_tokenizer.model.eos_idx
- params.pad_idx = text_tokenizer.model.pad_idx
- logging.info(params)
-
- num_param = sum([p.numel() for p in model.parameters()])
- logging.info(f"Number of model parameters: {num_param}")
-
- assert params.save_every_n >= params.average_period
- model_avg: Optional[nn.Module] = None
- if rank == 0:
- # model_avg is only used with rank 0
- model_avg = copy.deepcopy(model).to(torch.float64)
-
- assert params.start_epoch > 0, params.start_epoch
- checkpoints = load_checkpoint_if_available(
- params=params, model=model, model_avg=model_avg
- )
-
- model.to(device)
- if world_size > 1:
- logging.info("Using DDP")
- model = DDP(model, device_ids=[rank], find_unused_parameters=True)
-
- #parameters_names = []
- #parameters_names.append(
- # [name_param_pair[0] for name_param_pair in model.named_parameters()]
- #)
- # optimizer = ScaledAdam(
- # model.parameters(),
- # lr=params.base_lr,
- # clipping_scale=2.0,
- # parameters_names=parameters_names,
- # )
- optimizer = ScaledAdam(
- model.parameters(),
- lr=params.base_lr,
- clipping_scale=2.0,
- )
- scheduler = Eden(optimizer, params.lr_batches, params.lr_epochs)
-
- if checkpoints and "optimizer" in checkpoints:
- logging.info("Loading optimizer state dict")
- optimizer.load_state_dict(checkpoints["optimizer"])
-
- if (
- checkpoints
- and "scheduler" in checkpoints
- and checkpoints["scheduler"] is not None
- ):
- logging.info("Loading scheduler state dict")
- scheduler.load_state_dict(checkpoints["scheduler"])
-
- if params.print_diagnostics:
- opts = diagnostics.TensorDiagnosticOptions(
- 2**22
- ) # allow 4 megabytes per sub-module
- diagnostic = diagnostics.attach_diagnostics(model, opts)
-
- if params.inf_check:
- register_inf_check_hooks(model)
-
- def remove_short_and_long_utt(c: Cut):
- # Keep only utterances with duration between 1 second and 12 seconds
- # (the check below uses 12.0 as the upper bound).
- #
- # You should use ../local/display_manifest_statistics.py to get
- # an utterance duration distribution for your dataset to select
- # the threshold.
- if c.duration < 1.0 or c.duration > 12.0:
- logging.warning(
- f"Exclude cut with ID {c.id} from training. Duration: {c.duration}"
- )
- return False
-
- # In pruned RNN-T, we require that T >= S
- # where T is the number of feature frames after subsampling
- # and S is the number of tokens in the utterance
-
- # In ./zipformer.py, the conv module uses the following expression
- # for subsampling
- # T = ((c.num_frames - 7) // 2 + 1) // 2
- # tokens = sp.encode(c.supervisions[0].text, out_type=str)
-
- # if T < len(tokens):
- # logging.warning(
- # f"Exclude cut with ID {c.id} from training. "
- # f"Number of frames (before subsampling): {c.num_frames}. "
- # f"Number of frames (after subsampling): {T}. "
- # f"Text: {c.supervisions[0].text}. "
- # f"Tokens: {tokens}. "
- # f"Number of tokens: {len(tokens)}"
- # )
- # return False
-
- return True
-
- #aishell = AIShell(manifest_dir=args.manifest_dir)
- #train_cuts = aishell.train_cuts()
- #asr_datamodule = AishellAsrDataModule(args)
-
- aishell = AishellAsrDataModule(args)
- # train_cuts = asr_datamodule.train_cuts()
- # train_cuts = train_cuts.filter(remove_short_and_long_utt)
-
- # if args.enable_musan:
- # cuts_musan = load_manifest(Path(args.manifest_dir) / "musan_cuts.jsonl.gz")
- # else:
- # cuts_musan = None
-
- if params.start_batch > 0 and checkpoints and "sampler" in checkpoints:
- # We only load the sampler's state dict when it loads a checkpoint
- # saved in the middle of an epoch
- sampler_state_dict = checkpoints["sampler"]
- else:
- sampler_state_dict = None
-
- # train_dl = asr_datamodule.train_dataloaders(
- # train_cuts,
- # on_the_fly_feats=False,
- # cuts_musan=cuts_musan,
- # sampler_state_dict=sampler_state_dict,
- # )
-
- # valid_cuts = aishell.valid_cuts()
- # valid_dl = asr_datamodule.valid_dataloaders(valid_cuts)
- train_dl = aishell.train_dataloaders(aishell.train_cuts())
- valid_dl = aishell.valid_dataloaders(aishell.valid_cuts())
- # if not params.print_diagnostics:
- # scan_pessimistic_batches_for_oom(
- # model=model,
- # train_dl=train_dl,
- # optimizer=optimizer,
- # graph_compiler=graph_compiler,
- # params=params,
- # )
-
- scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0)
- if checkpoints and "grad_scaler" in checkpoints:
- logging.info("Loading grad scaler state dict")
- scaler.load_state_dict(checkpoints["grad_scaler"])
-
- logging.info(f"start training from epoch {params.start_epoch}")
- for epoch in range(params.start_epoch, params.num_epochs + 1):
- scheduler.step_epoch(epoch - 1)
- fix_random_seed(params.seed + epoch - 1)
- train_dl.sampler.set_epoch(epoch - 1)
-
- if tb_writer is not None:
- tb_writer.add_scalar("train/epoch", epoch, params.batch_idx_train)
-
- params.cur_epoch = epoch
-
- train_one_epoch(
- params=params,
- model=model,
- model_avg=model_avg,
- optimizer=optimizer,
- scheduler=scheduler,
- text_tokenizer_encoder=text_tokenizer_encoder,
- train_dl=train_dl,
- valid_dl=valid_dl,
- scaler=scaler,
- tb_writer=tb_writer,
- world_size=world_size,
- rank=rank,
- )
-
- if params.print_diagnostics:
- diagnostic.print_diagnostics()
- break
-
- save_checkpoint(
- params=params,
- model=model,
- model_avg=model_avg,
- optimizer=optimizer,
- scheduler=scheduler,
- sampler=train_dl.sampler,
- scaler=scaler,
- rank=rank,
- )
-
- logging.info("Done!")
-
- if world_size > 1:
- torch.distributed.barrier()
- cleanup_dist()
-
-
-def display_and_save_batch(
- batch: dict,
- params: AttributeDict,
-) -> None:
- """Display the batch statistics and save the batch into disk.
-
- Args:
- batch:
- A batch of data. See `lhotse.dataset.K2SpeechRecognitionDataset()`
- for the content in it.
- params:
- Parameters for training. See :func:`get_params`.
- """
- from lhotse.utils import uuid4
-
- filename = f"{params.exp_dir}/batch-{uuid4()}.pt"
- logging.info(f"Saving batch to {filename}")
- torch.save(batch, filename)
-
- supervisions = batch["supervisions"]
- features = batch["inputs"]
-
- logging.info(f"features shape: {features.shape}")
-
- # y = graph_compiler.texts_to_ids(supervisions["text"])
- # num_tokens = sum(len(i) for i in y)
- # logging.info(f"num tokens: {num_tokens}")
-
-
-def scan_pessimistic_batches_for_oom(
- model: Union[nn.Module, DDP],
- train_dl: torch.utils.data.DataLoader,
- optimizer: torch.optim.Optimizer,
- params: AttributeDict,
- text_tokenizer_encoder: SentencePieceEncoder,
-):
- from lhotse.dataset import find_pessimistic_batches
-
- logging.info(
- "Sanity check -- see if any of the batches in epoch 1 would cause OOM."
- )
- batches, crit_values = find_pessimistic_batches(train_dl.sampler)
- for criterion, cuts in batches.items():
- batch = train_dl.dataset[cuts]
- try:
- with torch.cuda.amp.autocast(enabled=params.use_fp16):
- loss, _ = compute_loss(
- params=params,
- model=model,
- text_tokenizer_encoder=text_tokenizer_encoder,
- batch=batch,
- is_training=True,
- )
- loss.backward()
- optimizer.zero_grad()
- except Exception as e:
- if "CUDA out of memory" in str(e):
- logging.error(
- "Your GPU ran out of memory with the current "
- "max_duration setting. We recommend decreasing "
- "max_duration and trying again.\n"
- f"Failing criterion: {criterion} "
- f"(={crit_values[criterion]}) ..."
- )
- display_and_save_batch(batch, params=params)
- raise
- logging.info(
- f"Maximum memory allocated so far is {torch.cuda.max_memory_allocated()//1000000}MB"
- )
-
-
-def main():
- parser = get_parser()
- AishellAsrDataModule.add_arguments(parser)
- args = parser.parse_args()
- args.exp_dir = Path(args.exp_dir)
-
- world_size = get_world_size()
- rank = get_rank()
- assert world_size >= 1
-
- run(rank=rank, world_size=world_size, args=args)
-
-
-torch.set_num_threads(1)
-torch.set_num_interop_threads(1)
-
-if __name__ == "__main__":
- main()
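
The deleted training loop above nudges the AMP gradient scale by hand when it stays small (the `batch_idx % 100 == 0` block). Below is a minimal, self-contained sketch of that watchdog for reuse elsewhere; the helper name `nudge_grad_scale` is made up here, and, like the script, it reads GradScaler's private `_scale` attribute, so treat it as illustrative rather than a supported API.

import logging

from torch.cuda.amp import GradScaler


def nudge_grad_scale(scaler: GradScaler, batch_idx: int) -> None:
    """Double a stubbornly small grad scale and abort if it has collapsed."""
    scale_t = getattr(scaler, "_scale", None)  # private attribute, set lazily
    if batch_idx % 100 != 0 or scale_t is None:
        return
    cur = scale_t.item()
    # GradScaler only grows the scale after `growth_interval` clean steps, so a
    # scale stuck below 1.0 (or below 8.0, checked every 400 batches) is nudged
    # upward manually, mirroring the training loop above.
    if cur < 1.0 or (cur < 8.0 and batch_idx % 400 == 0):
        scaler.update(cur * 2.0)
    if cur < 0.01:
        logging.warning(f"Grad scale is small: {cur}")
    if cur < 1.0e-05:
        raise RuntimeError(f"grad_scale is too small, exiting: {cur}")
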
diff --git a/egs/aishell/ASR/seamlessm4t/train2.py b/egs/aishell/ASR/seamlessm4t/train2.py
deleted file mode 100644
index 9d5cf4ab9..000000000
--- a/egs/aishell/ASR/seamlessm4t/train2.py
+++ /dev/null
@@ -1,1277 +0,0 @@
-#!/usr/bin/env python3
-# Copyright 2023 Xiaomi Corp. (authors: Xiaoyu Yang)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Usage:
-
-./prepare.sh
-
-export CUDA_VISIBLE_DEVICES="0,1,2,3"
-
-torchrun --nproc-per-node 4 seamlessm4t/train2.py \
- --num-epochs 30 \
- --start-epoch 1 \
- --use-fp16 1 \
- --exp-dir seamlessm4t/exp \
- --max-duration 550
-"""
-
-
-import argparse
-import copy
-import logging
-import random
-import warnings
-from pathlib import Path
-from shutil import copyfile
-from typing import Any, Dict, Optional, Tuple, Union
-
-import k2
-import optim
-import torch
-import torch.multiprocessing as mp
-import torch.nn as nn
-from typing import List
-#from aishell import AIShell
-#from asr_datamodule import AsrDataModule
-from asr_datamodule import AishellAsrDataModule
-#from decoder import Decoder
-#from joiner import Joiner
-from lhotse import CutSet, load_manifest
-from lhotse.cut import Cut
-from lhotse.dataset.sampling.base import CutSampler
-from lhotse.utils import fix_random_seed
-#from model import Transducer
-from optim import Eden, ScaledAdam
-from torch import Tensor
-from torch.cuda.amp import GradScaler
-from torch.nn.parallel import DistributedDataParallel as DDP
-from torch.nn.functional import pad as pad_tensor
-from torch.utils.tensorboard import SummaryWriter
-#from zipformer import Zipformer
-
-from icefall import diagnostics
-#from icefall.char_graph_compiler import CharCtcTrainingGraphCompiler
-from icefall.checkpoint import load_checkpoint, remove_checkpoints
-from icefall.checkpoint import save_checkpoint as save_checkpoint_impl
-from icefall.checkpoint import (
- save_checkpoint_with_global_batch_idx,
- update_averaged_model,
-)
-from icefall.dist import cleanup_dist, setup_dist, get_world_size, get_rank, get_local_rank
-from icefall.env import get_env_info
-from icefall.hooks import register_inf_check_hooks
-from icefall.lexicon import Lexicon
-from icefall.utils import (
- AttributeDict,
- MetricsTracker,
- filter_uneven_sized_batch,
- setup_logger,
- str2bool,
-)
-
-from seamless_communication.models.unity import (
- UnitTokenizer,
- UnitYModel,
- load_unity_model,
- load_unity_text_tokenizer,
- load_unity_unit_tokenizer,
-)
-from fairseq2.generation import (
- Seq2SeqGenerator,
- SequenceGeneratorOptions,
- SequenceGeneratorOutput,
- SequenceToTextGenerator,
- SequenceToTextOutput,
-)
-from fairseq2.data.text import (
- SentencePieceDecoder,
- SentencePieceEncoder,
- SentencePieceModel,
- TextTokenDecoder,
- TextTokenEncoder,
- TextTokenizer,
- vocabulary_from_sentencepiece,
-)
-from tokenizer import CharTokenizer
-from label_smoothing import LabelSmoothingLoss
-from fairseq2.nn.embedding import Embedding
-from fairseq2.nn.projection import TiedProjection
-
-LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
-
-
-def set_batch_count(model: Union[nn.Module, DDP], batch_count: float) -> None:
- if isinstance(model, DDP):
- # get underlying nn.Module
- model = model.module
- for module in model.modules():
- if hasattr(module, "batch_count"):
- module.batch_count = batch_count
-
-
-def add_model_arguments(parser: argparse.ArgumentParser):
- parser.add_argument(
- "--num-encoder-layers",
- type=str,
- default="2,4,3,2,4",
- help="Number of zipformer encoder layers, comma separated.",
- )
-
- parser.add_argument(
- "--feedforward-dims",
- type=str,
- default="1024,1024,2048,2048,1024",
- help="Feedforward dimension of the zipformer encoder layers, comma separated.",
- )
-
- parser.add_argument(
- "--nhead",
- type=str,
- default="8,8,8,8,8",
- help="Number of attention heads in the zipformer encoder layers.",
- )
-
- parser.add_argument(
- "--encoder-dims",
- type=str,
- default="384,384,384,384,384",
- help="Embedding dimension in the 2 blocks of zipformer encoder layers, comma separated",
- )
-
- parser.add_argument(
- "--attention-dims",
- type=str,
- default="192,192,192,192,192",
- help="""Attention dimension in the 2 blocks of zipformer encoder layers, comma separated;
- not the same as embedding dimension.""",
- )
-
- parser.add_argument(
- "--encoder-unmasked-dims",
- type=str,
- default="256,256,256,256,256",
- help="Unmasked dimensions in the encoders, relates to augmentation during training. "
- "Must be <= each of encoder_dims. Empirically, less than 256 seems to make performance "
- " worse.",
- )
-
- parser.add_argument(
- "--zipformer-downsampling-factors",
- type=str,
- default="1,2,4,8,2",
- help="Downsampling factor for each stack of encoder layers.",
- )
-
- parser.add_argument(
- "--cnn-module-kernels",
- type=str,
- default="31,31,31,31,31",
- help="Sizes of kernels in convolution modules",
- )
-
- parser.add_argument(
- "--decoder-dim",
- type=int,
- default=512,
- help="Embedding dimension in the decoder model.",
- )
-
- parser.add_argument(
- "--joiner-dim",
- type=int,
- default=512,
- help="""Dimension used in the joiner model.
- Outputs from the encoder and decoder model are projected
- to this dimension before adding.
- """,
- )
-
-
-def get_parser():
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter
- )
-
- parser.add_argument(
- "--master-port",
- type=int,
- default=12354,
- help="Master port to use for DDP training.",
- )
-
- parser.add_argument(
- "--tensorboard",
- type=str2bool,
- default=True,
- help="Should various information be logged in tensorboard.",
- )
-
- parser.add_argument(
- "--num-epochs",
- type=int,
- default=30,
- help="Number of epochs to train.",
- )
-
- parser.add_argument(
- "--start-epoch",
- type=int,
- default=1,
- help="""Resume training from this epoch. It should be positive.
- If larger than 1, it will load checkpoint from
- exp-dir/epoch-{start_epoch-1}.pt
- """,
- )
-
- parser.add_argument(
- "--start-batch",
- type=int,
- default=0,
- help="""If positive, --start-epoch is ignored and
- it loads the checkpoint from exp-dir/checkpoint-{start_batch}.pt
- """,
- )
-
- parser.add_argument(
- "--exp-dir",
- type=str,
- default="pruned_transducer_stateless7/exp",
- help="""The experiment dir.
- It specifies the directory where all training related
- files, e.g., checkpoints, log, etc, are saved
- """,
- )
-
- parser.add_argument(
- "--lang-dir",
- type=str,
- default="data/lang_char",
- help="""The lang dir
- It contains language related input files such as
- "lexicon.txt"
- """,
- )
-
- parser.add_argument(
- "--base-lr", type=float, default=0.05, help="The base learning rate."
- )
-
- parser.add_argument(
- "--lr-batches",
- type=float,
- default=5000,
- help="""Number of steps that affects how rapidly the learning rate
- decreases. We suggest not to change this.""",
- )
-
- parser.add_argument(
- "--lr-epochs",
- type=float,
- default=6,
- help="""Number of epochs that affects how rapidly the learning rate decreases.
- """,
- )
-
- parser.add_argument(
- "--context-size",
- type=int,
- default=1,
- help="The context size in the decoder. 1 means bigram; 2 means tri-gram",
- )
-
- parser.add_argument(
- "--prune-range",
- type=int,
- default=5,
- help="The prune range for rnnt loss, it means how many symbols(context)"
- "we are using to compute the loss",
- )
-
- parser.add_argument(
- "--lm-scale",
- type=float,
- default=0.25,
- help="The scale to smooth the loss with lm "
- "(output of prediction network) part.",
- )
-
- parser.add_argument(
- "--am-scale",
- type=float,
- default=0.0,
- help="The scale to smooth the loss with am (output of encoder network) part.",
- )
-
- parser.add_argument(
- "--simple-loss-scale",
- type=float,
- default=0.5,
- help="To get pruning ranges, we will calculate a simple version"
- "loss(joiner is just addition), this simple loss also uses for"
- "training (as a regularization item). We will scale the simple loss"
- "with this parameter before adding to the final loss.",
- )
-
- parser.add_argument(
- "--seed",
- type=int,
- default=42,
- help="The seed for random generators intended for reproducibility",
- )
-
- parser.add_argument(
- "--print-diagnostics",
- type=str2bool,
- default=False,
- help="Accumulate stats on activations, print them and exit.",
- )
-
- parser.add_argument(
- "--inf-check",
- type=str2bool,
- default=False,
- help="Add hooks to check for infinite module outputs and gradients.",
- )
-
- parser.add_argument(
- "--save-every-n",
- type=int,
- default=4000,
- help="""Save checkpoint after processing this number of batches"
- periodically. We save checkpoint to exp-dir/ whenever
- params.batch_idx_train % save_every_n == 0. The checkpoint filename
- has the form: f'exp-dir/checkpoint-{params.batch_idx_train}.pt'
- Note: It also saves checkpoint to `exp-dir/epoch-xxx.pt` at the
- end of each epoch where `xxx` is the epoch number counting from 0.
- """,
- )
-
- parser.add_argument(
- "--keep-last-k",
- type=int,
- default=30,
- help="""Only keep this number of checkpoints on disk.
- For instance, if it is 3, there are only 3 checkpoints
- in the exp-dir with filenames `checkpoint-xxx.pt`.
- It does not affect checkpoints with name `epoch-xxx.pt`.
- """,
- )
-
- parser.add_argument(
- "--average-period",
- type=int,
- default=200,
- help="""Update the averaged model, namely `model_avg`, after processing
- this number of batches. `model_avg` is a separate version of model,
- in which each floating-point parameter is the average of all the
- parameters from the start of training. Each time we take the average,
- we do: `model_avg = model * (average_period / batch_idx_train) +
- model_avg * ((batch_idx_train - average_period) / batch_idx_train)`.
- """,
- )
-
- parser.add_argument(
- "--use-fp16",
- type=str2bool,
- default=False,
- help="Whether to use half precision training.",
- )
-
- add_model_arguments(parser)
-
- return parser
-
-
-def get_params() -> AttributeDict:
- """Return a dict containing training parameters.
-
- All training related parameters that are not passed from the commandline
- are saved in the variable `params`.
-
- Commandline options are merged into `params` after they are parsed, so
- you can also access them via `params`.
-
- Explanation of options saved in `params`:
-
- - best_train_loss: Best training loss so far. It is used to select
- the model that has the lowest training loss. It is
- updated during the training.
-
- - best_valid_loss: Best validation loss so far. It is used to select
- the model that has the lowest validation loss. It is
- updated during the training.
-
- - best_train_epoch: It is the epoch that has the best training loss.
-
- - best_valid_epoch: It is the epoch that has the best validation loss.
-
- - batch_idx_train: Used to write statistics to tensorboard. It
- contains the number of batches trained so far across
- epochs.
-
- - log_interval: Print training loss if batch_idx % log_interval is 0
-
- - reset_interval: Reset statistics if batch_idx % reset_interval is 0
-
- - valid_interval: Run validation if batch_idx % valid_interval is 0
-
- - feature_dim: The model input dim. It has to match the one used
- in computing features.
-
- - subsampling_factor: The subsampling factor for the model.
-
- - encoder_dim: Hidden dim for multi-head attention model.
-
- - num_decoder_layers: Number of decoder layer of transformer decoder.
-
- - warm_step: The warmup period that dictates the decay of the
- scale on "simple" (un-pruned) loss.
- """
- params = AttributeDict(
- {
- "frame_shift_ms": 10.0,
- "allowed_excess_duration_ratio": 0.1,
- "best_train_loss": float("inf"),
- "best_valid_loss": float("inf"),
- "best_train_epoch": -1,
- "best_valid_epoch": -1,
- "batch_idx_train": 0,
- "log_interval": 50,
- "reset_interval": 200,
- "valid_interval": 3000, # For the 100h subset, use 800
- # parameters for zipformer
- "feature_dim": 80,
- "subsampling_factor": 4, # not passed in, this is fixed.
- "warm_step": 100,
- "env_info": get_env_info(),
- }
- )
-
- return params
-
-
-# def get_transducer_model(params: AttributeDict) -> nn.Module:
-# encoder = get_encoder_model(params)
-# decoder = get_decoder_model(params)
-# joiner = get_joiner_model(params)
-
-# model = Transducer(
-# encoder=encoder,
-# decoder=decoder,
-# joiner=joiner,
-# encoder_dim=int(params.encoder_dims.split(",")[-1]),
-# decoder_dim=params.decoder_dim,
-# joiner_dim=params.joiner_dim,
-# vocab_size=params.vocab_size,
-# )
-# return model
-
-
-def load_checkpoint_if_available(
- params: AttributeDict,
- model: nn.Module,
- model_avg: nn.Module = None,
- optimizer: Optional[torch.optim.Optimizer] = None,
- scheduler: Optional[LRSchedulerType] = None,
-) -> Optional[Dict[str, Any]]:
- """Load checkpoint from file.
-
- If params.start_batch is positive, it will load the checkpoint from
- `params.exp_dir/checkpoint-{params.start_batch}.pt`. Otherwise, if
- params.start_epoch is larger than 1, it will load the checkpoint from
- `params.start_epoch - 1`.
-
- Apart from loading state dict for `model` and `optimizer` it also updates
- `best_train_epoch`, `best_train_loss`, `best_valid_epoch`,
- and `best_valid_loss` in `params`.
-
- Args:
- params:
- The return value of :func:`get_params`.
- model:
- The training model.
- model_avg:
- The stored model averaged from the start of training.
- optimizer:
- The optimizer that we are using.
- scheduler:
- The scheduler that we are using.
- Returns:
- Return a dict containing previously saved training info.
- """
- if params.start_batch > 0:
- filename = params.exp_dir / f"checkpoint-{params.start_batch}.pt"
- elif params.start_epoch > 1:
- filename = params.exp_dir / f"epoch-{params.start_epoch-1}.pt"
- else:
- return None
-
- assert filename.is_file(), f"{filename} does not exist!"
-
- saved_params = load_checkpoint(
- filename,
- model=model,
- model_avg=model_avg,
- optimizer=optimizer,
- scheduler=scheduler,
- )
-
- keys = [
- "best_train_epoch",
- "best_valid_epoch",
- "batch_idx_train",
- "best_train_loss",
- "best_valid_loss",
- ]
- for k in keys:
- params[k] = saved_params[k]
-
- if params.start_batch > 0:
- if "cur_epoch" in saved_params:
- params["start_epoch"] = saved_params["cur_epoch"]
-
- return saved_params
-
-
-def save_checkpoint(
- params: AttributeDict,
- model: Union[nn.Module, DDP],
- model_avg: Optional[nn.Module] = None,
- optimizer: Optional[torch.optim.Optimizer] = None,
- scheduler: Optional[LRSchedulerType] = None,
- sampler: Optional[CutSampler] = None,
- scaler: Optional[GradScaler] = None,
- rank: int = 0,
-) -> None:
- """Save model, optimizer, scheduler and training stats to file.
-
- Args:
- params:
- It is returned by :func:`get_params`.
- model:
- The training model.
- model_avg:
- The stored model averaged from the start of training.
- optimizer:
- The optimizer used in the training.
- sampler:
- The sampler for the training dataset.
- scaler:
- The scaler used for mixed precision training.
- """
- if rank != 0:
- return
- filename = params.exp_dir / f"epoch-{params.cur_epoch}.pt"
- save_checkpoint_impl(
- filename=filename,
- model=model,
- model_avg=model_avg,
- params=params,
- optimizer=optimizer,
- scheduler=scheduler,
- sampler=sampler,
- scaler=scaler,
- rank=rank,
- )
-
- if params.best_train_epoch == params.cur_epoch:
- best_train_filename = params.exp_dir / "best-train-loss.pt"
- copyfile(src=filename, dst=best_train_filename)
-
- if params.best_valid_epoch == params.cur_epoch:
- best_valid_filename = params.exp_dir / "best-valid-loss.pt"
- copyfile(src=filename, dst=best_valid_filename)
-
-def compute_loss(
- params: AttributeDict,
- model: Union[nn.Module, DDP],
- text_tokenizer_encoder: CharTokenizer,
- batch: dict,
- is_training: bool,
-) -> Tuple[Tensor, MetricsTracker]:
- """
- Compute the label-smoothed cross-entropy loss of the text decoder
- given the model and its inputs.
-
- Args:
- params:
- Parameters for training. See :func:`get_params`.
- model:
- The model for training. In this script it is a seamlessM4T UnitY
- model (speech encoder + text decoder).
- batch:
- A batch of data. See `lhotse.dataset.K2SpeechRecognitionDataset()`
- for the content in it.
- is_training:
- True for training. False for validation. When it is True, this
- function enables autograd during computation; when it is False, it
- disables autograd.
- text_tokenizer_encoder:
- The tokenizer used to convert transcripts into token IDs.
- """
- # For the uneven-sized batch, the total duration after padding would possibly
- # cause OOM. Hence, for each batch, which is sorted descendingly by length,
- # we simply drop the last few shortest samples, so that the retained total frames
- # (after padding) would not exceed `allowed_max_frames`:
- # `allowed_max_frames = int(max_frames * (1.0 + allowed_excess_duration_ratio))`,
- # where `max_frames = max_duration * 1000 // frame_shift_ms`.
- # We set allowed_excess_duration_ratio=0.1.
- if isinstance(model, DDP):
- # get underlying nn.Module
- model = model.module
- def _batch_tensors(tensors: List[Tensor], pad_value: Any) -> Tensor:
- padding_size = max(tensor.shape[0] for tensor in tensors)
- dims = len(tensors[0].shape)
- padded_tensors = []
- for tensor in tensors:
- padding = [0] * 2 * dims
- padding[-1] = padding_size - tensor.shape[0]
- padded_tensors.append(pad_tensor(tensor, padding, "constant", pad_value))
- return torch.stack([tensor for tensor in padded_tensors], dim=0)
-
- max_frames = params.max_duration * 1000 // params.frame_shift_ms
- allowed_max_frames = int(max_frames * (1.0 + params.allowed_excess_duration_ratio))
- batch = filter_uneven_sized_batch(batch, allowed_max_frames)
-
- device = model.device if isinstance(model, DDP) else next(model.parameters()).device
- feature = batch["inputs"]
- # at entry, feature is (N, T, C)
- assert feature.ndim == 3
- feature = feature.to(device)
-
- supervisions = batch["supervisions"]
- feature_lens = supervisions["num_frames"].to(device)
-
- batch_idx_train = params.batch_idx_train
- warm_step = params.warm_step
-
- texts = batch["supervisions"]["text"]
- # remove spaces in the text
- texts = [text.replace(" ", "") for text in texts]
- text_tokens_list = [torch.tensor([params.eos_idx] + text_tokenizer_encoder.encode(text) + [params.eos_idx]) for text in texts]
- prev_outputs_tokens = _batch_tensors(
- [tokens[:-1] for tokens in text_tokens_list], pad_value=params.pad_idx
- )
- target_tokens = _batch_tensors(
- [tokens[1:] for tokens in text_tokens_list], pad_value=params.pad_idx
- )
- target_lengths = torch.LongTensor(
- [tokens.shape[0] - 1 for tokens in text_tokens_list]
- )
- decoder_criterion = LabelSmoothingLoss(ignore_index=params.pad_idx, label_smoothing=0.1, reduction="sum")
- ignore_prefix_size = 1 # ignore the prediction at the first (prefix) position
-
- with torch.set_grad_enabled(is_training):
- speech_encoder_out, speech_encoder_padding_mask = model.encode_speech(
- seqs=feature,
- seq_lens=feature_lens,
- )
- #assert batch.speech_to_text.prev_output_tokens is not None
- text_decoder_out, text_decoder_padding_mask = model.decode(
- seqs=prev_outputs_tokens.to(device),
- seq_lens=target_lengths.to(device),
- encoder_output=speech_encoder_out,
- encoder_padding_mask=speech_encoder_padding_mask,
- )
- text_logits = model.final_proj(text_decoder_out)
- text_logits = text_logits[:, ignore_prefix_size:, :]
- target_tokens = target_tokens[:, ignore_prefix_size:]
- loss = decoder_criterion(text_logits, target_tokens.to(device))
-
- assert loss.requires_grad == is_training
-
- info = MetricsTracker()
- with warnings.catch_warnings():
- warnings.simplefilter("ignore")
- info["frames"] = (feature_lens // params.subsampling_factor).sum().item()
-
- # Note: We use reduction=sum while computing the loss.
- info["loss"] = loss.detach().cpu().item()
-
- return loss, info
-
-
-def compute_validation_loss(
- params: AttributeDict,
- model: Union[nn.Module, DDP],
- text_tokenizer_encoder: CharTokenizer,
- valid_dl: torch.utils.data.DataLoader,
- world_size: int = 1,
-) -> MetricsTracker:
- """Run the validation process."""
- model.eval()
-
- tot_loss = MetricsTracker()
-
- for batch_idx, batch in enumerate(valid_dl):
- loss, loss_info = compute_loss(
- params=params,
- model=model,
- text_tokenizer_encoder=text_tokenizer_encoder,
- batch=batch,
- is_training=False,
- )
- assert loss.requires_grad is False
- tot_loss = tot_loss + loss_info
-
- if world_size > 1:
- tot_loss.reduce(loss.device)
-
- loss_value = tot_loss["loss"] / tot_loss["frames"]
- if loss_value < params.best_valid_loss:
- params.best_valid_epoch = params.cur_epoch
- params.best_valid_loss = loss_value
-
- return tot_loss
-
-
-def train_one_epoch(
- params: AttributeDict,
- model: Union[nn.Module, DDP],
- optimizer: torch.optim.Optimizer,
- scheduler: LRSchedulerType,
- text_tokenizer_encoder: CharTokenizer,
- train_dl: torch.utils.data.DataLoader,
- valid_dl: torch.utils.data.DataLoader,
- scaler: GradScaler,
- model_avg: Optional[nn.Module] = None,
- tb_writer: Optional[SummaryWriter] = None,
- world_size: int = 1,
- rank: int = 0,
-) -> None:
- """Train the model for one epoch.
-
- The training loss from the mean of all frames is saved in
- `params.train_loss`. It runs the validation process every
- `params.valid_interval` batches.
-
- Args:
- params:
- It is returned by :func:`get_params`.
- model:
- The model for training.
- optimizer:
- The optimizer we are using.
- scheduler:
- The learning rate scheduler; we call step_batch() after every batch.
- train_dl:
- Dataloader for the training dataset.
- valid_dl:
- Dataloader for the validation dataset.
- scaler:
- The scaler used for mixed precision training.
- model_avg:
- The stored model averaged from the start of training.
- tb_writer:
- Writer to write log messages to tensorboard.
- world_size:
- Number of processes participating in DDP training. If it is 1, DDP is disabled.
- rank:
- The rank of this process in DDP training. If no DDP is used, it should
- be set to 0.
- """
- model.train()
-
- tot_loss = MetricsTracker()
-
- for batch_idx, batch in enumerate(train_dl):
- params.batch_idx_train += 1
- batch_size = len(batch["supervisions"]["text"])
-
- try:
- with torch.cuda.amp.autocast(enabled=params.use_fp16):
- loss, loss_info = compute_loss(
- params=params,
- model=model,
- text_tokenizer_encoder=text_tokenizer_encoder,
- batch=batch,
- is_training=True,
- )
- # summary stats
- tot_loss = (tot_loss * (1 - 1 / params.reset_interval)) + loss_info
-
- # NOTE: We use reduction==sum and loss is computed over utterances
- # in the batch and there is no normalization to it so far.
- scaler.scale(loss).backward()
- set_batch_count(model, params.batch_idx_train)
- scheduler.step_batch(params.batch_idx_train)
-
- scaler.step(optimizer)
- scaler.update()
- optimizer.zero_grad()
- except: # noqa
- display_and_save_batch(batch, params=params)
- raise
-
- if params.print_diagnostics and batch_idx == 5:
- return
-
- if (
- rank == 0
- and params.batch_idx_train > 0
- and params.batch_idx_train % params.average_period == 0
- ):
- update_averaged_model(
- params=params,
- model_cur=model,
- model_avg=model_avg,
- )
-
- if (
- params.batch_idx_train > 0
- and params.batch_idx_train % params.save_every_n == 0
- ):
- save_checkpoint_with_global_batch_idx(
- out_dir=params.exp_dir,
- global_batch_idx=params.batch_idx_train,
- model=model,
- model_avg=model_avg,
- params=params,
- optimizer=optimizer,
- scheduler=scheduler,
- sampler=train_dl.sampler,
- scaler=scaler,
- rank=rank,
- )
- remove_checkpoints(
- out_dir=params.exp_dir,
- topk=params.keep_last_k,
- rank=rank,
- )
-
- if batch_idx % 100 == 0 and params.use_fp16:
- # If the grad scale was less than 1, try increasing it. The _growth_interval
- # of the grad scaler is configurable, but we can't configure it to have different
- # behavior depending on the current grad scale.
- cur_grad_scale = scaler._scale.item()
- if cur_grad_scale < 1.0 or (cur_grad_scale < 8.0 and batch_idx % 400 == 0):
- scaler.update(cur_grad_scale * 2.0)
- if cur_grad_scale < 0.01:
- logging.warning(f"Grad scale is small: {cur_grad_scale}")
- if cur_grad_scale < 1.0e-05:
- raise RuntimeError(
- f"grad_scale is too small, exiting: {cur_grad_scale}"
- )
- if batch_idx % params.log_interval == 0:
- cur_lr = scheduler.get_last_lr()[0]
- cur_grad_scale = scaler._scale.item() if params.use_fp16 else 1.0
-
- logging.info(
- f"Epoch {params.cur_epoch}, "
- f"batch {batch_idx}, loss[{loss_info}], "
- f"tot_loss[{tot_loss}], batch size: {batch_size}, "
- f"lr: {cur_lr:.2e}, "
- + (f"grad_scale: {scaler._scale.item()}" if params.use_fp16 else "")
- )
-
- if tb_writer is not None:
- tb_writer.add_scalar(
- "train/learning_rate", cur_lr, params.batch_idx_train
- )
-
- loss_info.write_summary(
- tb_writer, "train/current_", params.batch_idx_train
- )
- tot_loss.write_summary(tb_writer, "train/tot_", params.batch_idx_train)
- if params.use_fp16:
- tb_writer.add_scalar(
- "train/grad_scale",
- cur_grad_scale,
- params.batch_idx_train,
- )
-
- if batch_idx % params.valid_interval == 0 and not params.print_diagnostics:
- logging.info("Computing validation loss")
- valid_info = compute_validation_loss(
- params=params,
- model=model,
- text_tokenizer_encoder=text_tokenizer_encoder,
- valid_dl=valid_dl,
- world_size=world_size,
- )
- model.train()
- logging.info(f"Epoch {params.cur_epoch}, validation: {valid_info}")
- logging.info(
- f"Maximum memory allocated so far is {torch.cuda.max_memory_allocated()//1000000}MB"
- )
- if tb_writer is not None:
- valid_info.write_summary(
- tb_writer, "train/valid_", params.batch_idx_train
- )
-
- loss_value = tot_loss["loss"] / tot_loss["frames"]
- params.train_loss = loss_value
- if params.train_loss < params.best_train_loss:
- params.best_train_epoch = params.cur_epoch
- params.best_train_loss = params.train_loss
-
-def run(rank, world_size, args):
- """
- Args:
- rank:
- It is a value between 0 and `world_size-1`, which is
- read from the environment (e.g., when launched with torchrun)
- in :func:`main`. The process with rank 0 is responsible for
- saving checkpoints.
- world_size:
- Number of GPUs for DDP training.
- args:
- The return value of get_parser().parse_args()
- """
- params = get_params()
- params.update(vars(args))
-
- fix_random_seed(params.seed)
- # rank = get_rank()
- # world_size = get_world_size()
- # setup_dist(rank, world_size, use_ddp_launch=True)
- setup_dist(use_ddp_launch=True)
-
- setup_logger(f"{params.exp_dir}/log/log-train")
- logging.info("Training started")
-
- if args.tensorboard and rank == 0:
- tb_writer = SummaryWriter(log_dir=f"{params.exp_dir}/tensorboard")
- else:
- tb_writer = None
-
- device = torch.device("cpu")
- if torch.cuda.is_available():
- device = torch.device("cuda", rank)
- logging.info(f"Device: {device}")
-
- logging.info("About to create model")
- model_name_or_card = "seamlessM4T_medium"
- tokenizer_file = "./seamlessm4t/tokens.txt"
- lang = "cmn"
-
- # text_tokenizer = load_unity_text_tokenizer(model_name_or_card)
- # text_tokenizer_encoder = SentencePieceEncoder(
- # text_tokenizer.model,
- # prefix_tokens=["", f"__{lang}__"],
- # suffix_tokens=[""],
- # )
- # #params.eos_idx = text_tokenizer.model.eos_idx
- # params.pad_idx = text_tokenizer.model.pad_idx
- text_tokenizer_encoder = CharTokenizer(tokenizer_file)
- params.pad_idx, params.eos_idx = 0, 1
- logging.info(params)
-
- model = load_unity_model(model_name_or_card, device="cpu", dtype=torch.float32)
- del model.t2u_model
- del model.text_encoder
- del model.text_encoder_frontend
- model.text_decoder_frontend.embed = nn.Embedding(num_embeddings=text_tokenizer_encoder.vocab_size, embedding_dim=1024, padding_idx=0)
- #model.text_decoder_frontend.embed = Embedding(num_embeddings=text_tokenizer_encoder.vocab_size, embedding_dim=1024 ,pad_idx=0, scaled=True)
- #model.final_proj = TiedProjection(input_dim=1024, output_dim=text_tokenizer_encoder.vocab_size)
- model.final_proj = nn.Linear(1024, text_tokenizer_encoder.vocab_size, bias=False)
- # Optionally freeze everything except the new embedding and output
- # projection by uncommenting the requires_grad line below.
- for name, param in model.named_parameters():
- if name != 'text_decoder_frontend.embed.weight' and name != 'final_proj.weight':
- # param.requires_grad = False
- pass
- num_trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
- logging.info(f"Number of trainable model parameters: {num_trainable}")
-
- num_param = sum([p.numel() for p in model.parameters()])
- logging.info(f"Number of model parameters: {num_param}")
-
- assert params.save_every_n >= params.average_period
- model_avg: Optional[nn.Module] = None
- if rank == 0:
- # model_avg is only used with rank 0
- model_avg = copy.deepcopy(model).to(torch.float64)
-
- assert params.start_epoch > 0, params.start_epoch
- checkpoints = load_checkpoint_if_available(
- params=params, model=model, model_avg=model_avg
- )
-
- model.to(device)
- if world_size > 1:
- logging.info("Using DDP")
- model = DDP(model, device_ids=[rank], find_unused_parameters=True)
-
- #parameters_names = []
- #parameters_names.append(
- # [name_param_pair[0] for name_param_pair in model.named_parameters()]
- #)
- # optimizer = ScaledAdam(
- # model.parameters(),
- # lr=params.base_lr,
- # clipping_scale=2.0,
- # parameters_names=parameters_names,
- # )
- optimizer = ScaledAdam(
- model.parameters(),
- lr=params.base_lr,
- clipping_scale=2.0,
- )
- scheduler = Eden(optimizer, params.lr_batches, params.lr_epochs)
-
- if checkpoints and "optimizer" in checkpoints:
- logging.info("Loading optimizer state dict")
- optimizer.load_state_dict(checkpoints["optimizer"])
-
- if (
- checkpoints
- and "scheduler" in checkpoints
- and checkpoints["scheduler"] is not None
- ):
- logging.info("Loading scheduler state dict")
- scheduler.load_state_dict(checkpoints["scheduler"])
-
- if params.print_diagnostics:
- opts = diagnostics.TensorDiagnosticOptions(
- 2**22
- ) # allow 4 megabytes per sub-module
- diagnostic = diagnostics.attach_diagnostics(model, opts)
-
- if params.inf_check:
- register_inf_check_hooks(model)
-
- def remove_short_and_long_utt(c: Cut):
- # Keep only utterances with duration between 1 second and 12 seconds
- # (the check below uses 12.0 as the upper bound).
- #
- # You should use ../local/display_manifest_statistics.py to get
- # an utterance duration distribution for your dataset to select
- # the threshold.
- if c.duration < 1.0 or c.duration > 12.0:
- logging.warning(
- f"Exclude cut with ID {c.id} from training. Duration: {c.duration}"
- )
- return False
-
- # In pruned RNN-T, we require that T >= S
- # where T is the number of feature frames after subsampling
- # and S is the number of tokens in the utterance
-
- # In ./zipformer.py, the conv module uses the following expression
- # for subsampling
- # T = ((c.num_frames - 7) // 2 + 1) // 2
- # tokens = sp.encode(c.supervisions[0].text, out_type=str)
-
- # if T < len(tokens):
- # logging.warning(
- # f"Exclude cut with ID {c.id} from training. "
- # f"Number of frames (before subsampling): {c.num_frames}. "
- # f"Number of frames (after subsampling): {T}. "
- # f"Text: {c.supervisions[0].text}. "
- # f"Tokens: {tokens}. "
- # f"Number of tokens: {len(tokens)}"
- # )
- # return False
-
- return True
-
- #aishell = AIShell(manifest_dir=args.manifest_dir)
- #train_cuts = aishell.train_cuts()
- #asr_datamodule = AishellAsrDataModule(args)
-
- aishell = AishellAsrDataModule(args)
- # train_cuts = asr_datamodule.train_cuts()
- # train_cuts = train_cuts.filter(remove_short_and_long_utt)
-
- # if args.enable_musan:
- # cuts_musan = load_manifest(Path(args.manifest_dir) / "musan_cuts.jsonl.gz")
- # else:
- # cuts_musan = None
-
- if params.start_batch > 0 and checkpoints and "sampler" in checkpoints:
- # We only load the sampler's state dict when it loads a checkpoint
- # saved in the middle of an epoch
- sampler_state_dict = checkpoints["sampler"]
- else:
- sampler_state_dict = None
-
- # train_dl = asr_datamodule.train_dataloaders(
- # train_cuts,
- # on_the_fly_feats=False,
- # cuts_musan=cuts_musan,
- # sampler_state_dict=sampler_state_dict,
- # )
-
- # valid_cuts = aishell.valid_cuts()
- # valid_dl = asr_datamodule.valid_dataloaders(valid_cuts)
- train_dl = aishell.train_dataloaders(aishell.train_cuts())
- valid_dl = aishell.valid_dataloaders(aishell.valid_cuts())
- # if not params.print_diagnostics:
- # scan_pessimistic_batches_for_oom(
- # model=model,
- # train_dl=train_dl,
- # optimizer=optimizer,
- # graph_compiler=graph_compiler,
- # params=params,
- # )
-
- scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0)
- if checkpoints and "grad_scaler" in checkpoints:
- logging.info("Loading grad scaler state dict")
- scaler.load_state_dict(checkpoints["grad_scaler"])
-
- logging.info(f"start training from epoch {params.start_epoch}")
- for epoch in range(params.start_epoch, params.num_epochs + 1):
- scheduler.step_epoch(epoch - 1)
- fix_random_seed(params.seed + epoch - 1)
- train_dl.sampler.set_epoch(epoch - 1)
-
- if tb_writer is not None:
- tb_writer.add_scalar("train/epoch", epoch, params.batch_idx_train)
-
- params.cur_epoch = epoch
-
- train_one_epoch(
- params=params,
- model=model,
- model_avg=model_avg,
- optimizer=optimizer,
- scheduler=scheduler,
- text_tokenizer_encoder=text_tokenizer_encoder,
- train_dl=train_dl,
- valid_dl=valid_dl,
- scaler=scaler,
- tb_writer=tb_writer,
- world_size=world_size,
- rank=rank,
- )
-
- if params.print_diagnostics:
- diagnostic.print_diagnostics()
- break
-
- save_checkpoint(
- params=params,
- model=model,
- model_avg=model_avg,
- optimizer=optimizer,
- scheduler=scheduler,
- sampler=train_dl.sampler,
- scaler=scaler,
- rank=rank,
- )
-
- logging.info("Done!")
-
- if world_size > 1:
- torch.distributed.barrier()
- cleanup_dist()
-
-
-def display_and_save_batch(
- batch: dict,
- params: AttributeDict,
-) -> None:
- """Display the batch statistics and save the batch into disk.
-
- Args:
- batch:
- A batch of data. See `lhotse.dataset.K2SpeechRecognitionDataset()`
- for the content in it.
- params:
- Parameters for training. See :func:`get_params`.
- """
- from lhotse.utils import uuid4
-
- filename = f"{params.exp_dir}/batch-{uuid4()}.pt"
- logging.info(f"Saving batch to {filename}")
- torch.save(batch, filename)
-
- supervisions = batch["supervisions"]
- features = batch["inputs"]
-
- logging.info(f"features shape: {features.shape}")
-
- # y = graph_compiler.texts_to_ids(supervisions["text"])
- # num_tokens = sum(len(i) for i in y)
- # logging.info(f"num tokens: {num_tokens}")
-
-
-def scan_pessimistic_batches_for_oom(
- model: Union[nn.Module, DDP],
- train_dl: torch.utils.data.DataLoader,
- optimizer: torch.optim.Optimizer,
- params: AttributeDict,
- text_tokenizer_encoder: CharTokenizer,
-):
- from lhotse.dataset import find_pessimistic_batches
-
- logging.info(
- "Sanity check -- see if any of the batches in epoch 1 would cause OOM."
- )
- batches, crit_values = find_pessimistic_batches(train_dl.sampler)
- for criterion, cuts in batches.items():
- batch = train_dl.dataset[cuts]
- try:
- with torch.cuda.amp.autocast(enabled=params.use_fp16):
- loss, _ = compute_loss(
- params=params,
- model=model,
- text_tokenizer_encoder=text_tokenizer_encoder,
- batch=batch,
- is_training=True,
- )
- loss.backward()
- optimizer.zero_grad()
- except Exception as e:
- if "CUDA out of memory" in str(e):
- logging.error(
- "Your GPU ran out of memory with the current "
- "max_duration setting. We recommend decreasing "
- "max_duration and trying again.\n"
- f"Failing criterion: {criterion} "
- f"(={crit_values[criterion]}) ..."
- )
- display_and_save_batch(batch, params=params)
- raise
- logging.info(
- f"Maximum memory allocated so far is {torch.cuda.max_memory_allocated()//1000000}MB"
- )
-
-
-def main():
- parser = get_parser()
- AishellAsrDataModule.add_arguments(parser)
- args = parser.parse_args()
- args.exp_dir = Path(args.exp_dir)
-
- world_size = get_world_size()
- rank = get_rank()
- assert world_size >= 1
-
- run(rank=rank, world_size=world_size, args=args)
-
-
-torch.set_num_threads(1)
-torch.set_num_interop_threads(1)
-
-if __name__ == "__main__":
- main()
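
compute_loss() in the deleted scripts builds decoder inputs by wrapping each utterance in EOS markers, shifting by one position for teacher forcing, right-padding to the batch maximum, and applying a label-smoothed cross entropy that skips the first (prefix) position. A small self-contained sketch of that recipe follows; torch.nn.CrossEntropyLoss with label_smoothing stands in for icefall's LabelSmoothingLoss, and the helper names and token IDs are made up for illustration.

from typing import List

import torch
import torch.nn.functional as F

PAD_IDX, EOS_IDX = 0, 1  # matches params.pad_idx, params.eos_idx in train2.py


def pad_and_stack(tensors: List[torch.Tensor], pad_value: int) -> torch.Tensor:
    """Right-pad 1-D token tensors to a common length and stack them."""
    max_len = max(t.shape[0] for t in tensors)
    return torch.stack(
        [F.pad(t, (0, max_len - t.shape[0]), value=pad_value) for t in tensors]
    )


def make_decoder_targets(token_ids: List[List[int]]):
    """Wrap each utterance in EOS markers and shift by one for teacher forcing."""
    seqs = [torch.tensor([EOS_IDX] + ids + [EOS_IDX]) for ids in token_ids]
    prev_tokens = pad_and_stack([s[:-1] for s in seqs], PAD_IDX)  # decoder input
    targets = pad_and_stack([s[1:] for s in seqs], PAD_IDX)       # prediction target
    lengths = torch.tensor([s.shape[0] - 1 for s in seqs])
    return prev_tokens, targets, lengths


if __name__ == "__main__":
    prev_tokens, targets, lengths = make_decoder_targets([[5, 7, 9], [4, 6]])
    vocab_size = 10
    logits = torch.randn(targets.shape[0], targets.shape[1], vocab_size)
    criterion = torch.nn.CrossEntropyLoss(
        ignore_index=PAD_IDX, label_smoothing=0.1, reduction="sum"
    )
    # Skip the first position, mirroring ignore_prefix_size = 1 above.
    loss = criterion(logits[:, 1:, :].transpose(1, 2), targets[:, 1:])
    print(loss.item(), lengths.tolist())
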
diff --git a/egs/aishell/ASR/tdnn_lstm_ctc/asr_datamodule.py b/egs/aishell/ASR/tdnn_lstm_ctc/asr_datamodule.py
index df21a9508..efb32336a 100644
--- a/egs/aishell/ASR/tdnn_lstm_ctc/asr_datamodule.py
+++ b/egs/aishell/ASR/tdnn_lstm_ctc/asr_datamodule.py
@@ -30,7 +30,7 @@ from lhotse.dataset import (
DynamicBucketingSampler,
K2SpeechRecognitionDataset,
PrecomputedFeatures,
- SimpleCutSampler,
+ SingleCutSampler,
SpecAugment,
)
from lhotse.dataset.input_strategies import OnTheFlyFeatures
@@ -176,13 +176,13 @@ class AishellAsrDataModule:
group.add_argument(
"--enable-musan",
type=str2bool,
- default=False,
+ default=True,
help="When enabled, select noise from MUSAN and mix it"
"with training dataset. ",
)
def train_dataloaders(
- self, cuts_train: CutSet, sampler_state_dict: Optional[Dict[str, Any]] = None, rank = None, world_size = None
+ self, cuts_train: CutSet, sampler_state_dict: Optional[Dict[str, Any]] = None
) -> DataLoader:
"""
Args:
@@ -192,13 +192,13 @@ class AishellAsrDataModule:
The state dict for the training sampler.
"""
logging.info("About to get Musan cuts")
+ cuts_musan = load_manifest(self.args.manifest_dir / "musan_cuts.jsonl.gz")
transforms = []
if self.args.enable_musan:
logging.info("Enable MUSAN")
- cuts_musan = load_manifest(self.args.manifest_dir / "musan_cuts.jsonl.gz")
transforms.append(
- CutMix(cuts=cuts_musan, p=0.5, snr=(10, 20), preserve_id=True)
+ CutMix(cuts=cuts_musan, prob=0.5, snr=(10, 20), preserve_id=True)
)
else:
logging.info("Disable MUSAN")
@@ -276,12 +276,10 @@ class AishellAsrDataModule:
shuffle=self.args.shuffle,
num_buckets=self.args.num_buckets,
drop_last=self.args.drop_last,
- world_size=world_size,
- rank=rank,
)
else:
- logging.info("Using SimpleCutSampler.")
- train_sampler = SimpleCutSampler(
+ logging.info("Using SingleCutSampler.")
+ train_sampler = SingleCutSampler(
cuts_train,
max_duration=self.args.max_duration,
shuffle=self.args.shuffle,
@@ -302,7 +300,7 @@ class AishellAsrDataModule:
return train_dl
- def valid_dataloaders(self, cuts_valid: CutSet, rank = None, world_size = None) -> DataLoader:
+ def valid_dataloaders(self, cuts_valid: CutSet) -> DataLoader:
transforms = []
if self.args.concatenate_cuts:
transforms = [
@@ -327,8 +325,6 @@ class AishellAsrDataModule:
cuts_valid,
max_duration=self.args.max_duration,
shuffle=False,
- rank=rank,
- world_size=world_size,
)
logging.info("About to create dev dataloader")
valid_dl = DataLoader(
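
The hunk above pins asr_datamodule.py to the older lhotse API (SingleCutSampler and CutMix(..., prob=...)) instead of the newer names (SimpleCutSampler, p=...). If supporting both lhotse generations is preferable to pinning, a small compatibility shim along the following lines could be used; this is a sketch under the assumption that the installed lhotse exposes one of the two names, not part of the patch, and `CutSamplerCls`/`make_cutmix` are illustrative names.

import inspect

from lhotse.dataset import CutMix

try:
    # newer lhotse: the sampler is called SimpleCutSampler
    from lhotse.dataset import SimpleCutSampler as CutSamplerCls
except ImportError:
    # older lhotse: it is called SingleCutSampler
    from lhotse.dataset import SingleCutSampler as CutSamplerCls
# CutSamplerCls can then be used wherever Simple/SingleCutSampler was used.


def make_cutmix(cuts_musan, snr=(10, 20)):
    """Build CutMix with whichever probability kwarg this lhotse version expects."""
    kwargs = {"cuts": cuts_musan, "snr": snr, "preserve_id": True}
    if "p" in inspect.signature(CutMix.__init__).parameters:
        kwargs["p"] = 0.5
    else:
        kwargs["prob"] = 0.5
    return CutMix(**kwargs)
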
diff --git a/egs/aishell/ASR/whisper/decode.py b/egs/aishell/ASR/whisper/decode.py
index 34dae7a85..371350905 100644
--- a/egs/aishell/ASR/whisper/decode.py
+++ b/egs/aishell/ASR/whisper/decode.py
@@ -473,10 +473,11 @@ def main():
aishell = AishellAsrDataModule(args)
test_cuts = aishell.test_cuts()
test_dl = aishell.test_dataloaders(test_cuts)
-
- test_sets = ["test"]
- test_dls = [test_dl]
-
+ valid_dl = aishell.valid_dataloaders(aishell.valid_cuts())
+ #test_sets = ["test"]
+ #test_dls = [test_dl]
+ test_sets = ["valid"]
+ test_dls = [valid_dl]
for test_set, test_dl in zip(test_sets, test_dls):
results_dict = decode_dataset(
dl=test_dl,
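
The change above swaps the decoded partition from the test set to the dev set by editing the two lists in place. If both partitions should be decoded in one run, the lists can simply be extended; the sketch below shows the loop shape with placeholder dataloaders and a stub decode function, since the real `aishell` datamodule and `decode_dataset` live in whisper/decode.py.

def decode_dataset_stub(dl, partition):
    # Placeholder for decode_dataset(...) in whisper/decode.py.
    return {partition: f"decoded {len(dl)} batches"}


valid_dl = [0, 1, 2]  # placeholder for aishell.valid_dataloaders(aishell.valid_cuts())
test_dl = [0, 1]      # placeholder for aishell.test_dataloaders(test_cuts)

test_sets = ["valid", "test"]
test_dls = [valid_dl, test_dl]
for test_set, dl in zip(test_sets, test_dls):
    results_dict = decode_dataset_stub(dl, test_set)
    print(test_set, results_dict)
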
diff --git a/egs/aishell/ASR/whisper/ds_config_zero1.json b/egs/aishell/ASR/whisper/ds_config_zero1.json
index cd8cbac8e..b95b1cee4 100644
--- a/egs/aishell/ASR/whisper/ds_config_zero1.json
+++ b/egs/aishell/ASR/whisper/ds_config_zero1.json
@@ -27,7 +27,7 @@
"params": {
"warmup_min_lr": 0,
"warmup_max_lr": 1e-5,
- "warmup_num_steps": 1000
+ "warmup_num_steps": 100
}
},
"gradient_accumulation_steps": 1,
diff --git a/egs/aishell/ASR/whisper/train.py b/egs/aishell/ASR/whisper/train.py
index 932242ddb..6c76d3cff 100644
--- a/egs/aishell/ASR/whisper/train.py
+++ b/egs/aishell/ASR/whisper/train.py
@@ -126,7 +126,7 @@ def get_parser():
parser.add_argument(
"--num-epochs",
type=int,
- default=5,
+ default=10,
help="Number of epochs to train.",
)