diff --git a/.flake8 b/.flake8
index 229cf1d6c..dd9239b2d 100644
--- a/.flake8
+++ b/.flake8
@@ -13,4 +13,5 @@ per-file-ignores =
 exclude =
   .git,
   **/data/**,
-  icefall/shared/make_kn_lm.py
+  icefall/shared/make_kn_lm.py,
+  icefall/__init__.py
diff --git a/egs/aishell/ASR/conformer_ctc/label_smoothing.py b/egs/aishell/ASR/conformer_ctc/label_smoothing.py
deleted file mode 100644
index cdc85ce9a..000000000
--- a/egs/aishell/ASR/conformer_ctc/label_smoothing.py
+++ /dev/null
@@ -1,98 +0,0 @@
-# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-
-
-class LabelSmoothingLoss(torch.nn.Module):
-    """
-    Implement the LabelSmoothingLoss proposed in the following paper
-    https://arxiv.org/pdf/1512.00567.pdf
-    (Rethinking the Inception Architecture for Computer Vision)
-
-    """
-
-    def __init__(
-        self,
-        ignore_index: int = -1,
-        label_smoothing: float = 0.1,
-        reduction: str = "sum",
-    ) -> None:
-        """
-        Args:
-          ignore_index:
-            ignored class id
-          label_smoothing:
-            smoothing rate (0.0 means the conventional cross entropy loss)
-          reduction:
-            It has the same meaning as the reduction in
-            `torch.nn.CrossEntropyLoss`. It can be one of the following three
-            values: (1) "none": No reduction will be applied. (2) "mean": the
-            mean of the output is taken. (3) "sum": the output will be summed.
-        """
-        super().__init__()
-        assert 0.0 <= label_smoothing < 1.0
-        self.ignore_index = ignore_index
-        self.label_smoothing = label_smoothing
-        self.reduction = reduction
-
-    def forward(self, x: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
-        """
-        Compute loss between x and target.
-
-        Args:
-          x:
-            prediction of dimension
-            (batch_size, input_length, number_of_classes).
-          target:
-            target masked with self.ignore_index of
-            dimension (batch_size, input_length).
-
-        Returns:
-          A scalar tensor containing the loss without normalization.
-        """
-        assert x.ndim == 3
-        assert target.ndim == 2
-        assert x.shape[:2] == target.shape
-        num_classes = x.size(-1)
-        x = x.reshape(-1, num_classes)
-        # Now x is of shape (N*T, C)
-
-        # We don't want to change target in-place below,
-        # so we make a copy of it here
-        target = target.clone().reshape(-1)
-
-        ignored = target == self.ignore_index
-        target[ignored] = 0
-
-        true_dist = torch.nn.functional.one_hot(
-            target, num_classes=num_classes
-        ).to(x)
-
-        true_dist = (
-            true_dist * (1 - self.label_smoothing)
-            + self.label_smoothing / num_classes
-        )
-        # Set the value of ignored indexes to 0
-        true_dist[ignored] = 0
-
-        loss = -1 * (torch.log_softmax(x, dim=1) * true_dist)
-        if self.reduction == "sum":
-            return loss.sum()
-        elif self.reduction == "mean":
-            return loss.sum() / (~ignored).sum()
-        else:
-            return loss.sum(dim=-1)
diff --git a/egs/aishell/ASR/conformer_ctc/label_smoothing.py b/egs/aishell/ASR/conformer_ctc/label_smoothing.py
new file mode 120000
index 000000000..e9d239fff
--- /dev/null
+++ b/egs/aishell/ASR/conformer_ctc/label_smoothing.py
@@ -0,0 +1 @@
+../../../librispeech/ASR/conformer_ctc/label_smoothing.py
\ No newline at end of file
diff --git a/egs/aishell/ASR/conformer_mmi/label_smoothing.py b/egs/aishell/ASR/conformer_mmi/label_smoothing.py
deleted file mode 100644
index cdc85ce9a..000000000
--- a/egs/aishell/ASR/conformer_mmi/label_smoothing.py
+++ /dev/null
@@ -1,98 +0,0 @@
-# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-
-
-class LabelSmoothingLoss(torch.nn.Module):
-    """
-    Implement the LabelSmoothingLoss proposed in the following paper
-    https://arxiv.org/pdf/1512.00567.pdf
-    (Rethinking the Inception Architecture for Computer Vision)
-
-    """
-
-    def __init__(
-        self,
-        ignore_index: int = -1,
-        label_smoothing: float = 0.1,
-        reduction: str = "sum",
-    ) -> None:
-        """
-        Args:
-          ignore_index:
-            ignored class id
-          label_smoothing:
-            smoothing rate (0.0 means the conventional cross entropy loss)
-          reduction:
-            It has the same meaning as the reduction in
-            `torch.nn.CrossEntropyLoss`. It can be one of the following three
-            values: (1) "none": No reduction will be applied. (2) "mean": the
-            mean of the output is taken. (3) "sum": the output will be summed.
-        """
-        super().__init__()
-        assert 0.0 <= label_smoothing < 1.0
-        self.ignore_index = ignore_index
-        self.label_smoothing = label_smoothing
-        self.reduction = reduction
-
-    def forward(self, x: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
-        """
-        Compute loss between x and target.
-
-        Args:
-          x:
-            prediction of dimension
-            (batch_size, input_length, number_of_classes).
-          target:
-            target masked with self.ignore_index of
-            dimension (batch_size, input_length).
-
-        Returns:
-          A scalar tensor containing the loss without normalization.
-        """
-        assert x.ndim == 3
-        assert target.ndim == 2
-        assert x.shape[:2] == target.shape
-        num_classes = x.size(-1)
-        x = x.reshape(-1, num_classes)
-        # Now x is of shape (N*T, C)
-
-        # We don't want to change target in-place below,
-        # so we make a copy of it here
-        target = target.clone().reshape(-1)
-
-        ignored = target == self.ignore_index
-        target[ignored] = 0
-
-        true_dist = torch.nn.functional.one_hot(
-            target, num_classes=num_classes
-        ).to(x)
-
-        true_dist = (
-            true_dist * (1 - self.label_smoothing)
-            + self.label_smoothing / num_classes
-        )
-        # Set the value of ignored indexes to 0
-        true_dist[ignored] = 0
-
-        loss = -1 * (torch.log_softmax(x, dim=1) * true_dist)
-        if self.reduction == "sum":
-            return loss.sum()
-        elif self.reduction == "mean":
-            return loss.sum() / (~ignored).sum()
-        else:
-            return loss.sum(dim=-1)
diff --git a/egs/aishell/ASR/conformer_mmi/label_smoothing.py b/egs/aishell/ASR/conformer_mmi/label_smoothing.py
new file mode 120000
index 000000000..08734abd7
--- /dev/null
+++ b/egs/aishell/ASR/conformer_mmi/label_smoothing.py
@@ -0,0 +1 @@
+../conformer_ctc/label_smoothing.py
\ No newline at end of file
- """ - assert x.ndim == 3 - assert target.ndim == 2 - assert x.shape[:2] == target.shape - num_classes = x.size(-1) - x = x.reshape(-1, num_classes) - # Now x is of shape (N*T, C) - - # We don't want to change target in-place below, - # so we make a copy of it here - target = target.clone().reshape(-1) - - ignored = target == self.ignore_index - target[ignored] = 0 - - true_dist = torch.nn.functional.one_hot( - target, num_classes=num_classes - ).to(x) - - true_dist = ( - true_dist * (1 - self.label_smoothing) - + self.label_smoothing / num_classes - ) - # Set the value of ignored indexes to 0 - true_dist[ignored] = 0 - - loss = -1 * (torch.log_softmax(x, dim=1) * true_dist) - if self.reduction == "sum": - return loss.sum() - elif self.reduction == "mean": - return loss.sum() / (~ignored).sum() - else: - return loss.sum(dim=-1) diff --git a/egs/aishell/ASR/conformer_mmi/label_smoothing.py b/egs/aishell/ASR/conformer_mmi/label_smoothing.py new file mode 120000 index 000000000..08734abd7 --- /dev/null +++ b/egs/aishell/ASR/conformer_mmi/label_smoothing.py @@ -0,0 +1 @@ +../conformer_ctc/label_smoothing.py \ No newline at end of file diff --git a/egs/aishell/ASR/prepare.sh b/egs/aishell/ASR/prepare.sh index 68f5c54d3..26324b0af 100755 --- a/egs/aishell/ASR/prepare.sh +++ b/egs/aishell/ASR/prepare.sh @@ -70,7 +70,7 @@ if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then # |-- lexicon.txt # `-- speaker.info - if [ ! -d $dl_dir/aishell/data_aishell/wav ]; then + if [ ! -d $dl_dir/aishell/data_aishell/wav/train ]; then lhotse download aishell $dl_dir fi diff --git a/egs/librispeech/ASR/conformer_ctc/label_smoothing.py b/egs/librispeech/ASR/conformer_ctc/label_smoothing.py index cdc85ce9a..1f2f3b137 100644 --- a/egs/librispeech/ASR/conformer_ctc/label_smoothing.py +++ b/egs/librispeech/ASR/conformer_ctc/label_smoothing.py @@ -76,7 +76,11 @@ class LabelSmoothingLoss(torch.nn.Module): target = target.clone().reshape(-1) ignored = target == self.ignore_index - target[ignored] = 0 + + # See https://github.com/k2-fsa/icefall/issues/240 + # and https://github.com/k2-fsa/icefall/issues/297 + # for why we don't use target[ignored] = 0 here + target = torch.where(ignored, torch.zeros_like(target), target) true_dist = torch.nn.functional.one_hot( target, num_classes=num_classes @@ -86,8 +90,17 @@ class LabelSmoothingLoss(torch.nn.Module): true_dist * (1 - self.label_smoothing) + self.label_smoothing / num_classes ) + # Set the value of ignored indexes to 0 - true_dist[ignored] = 0 + # + # See https://github.com/k2-fsa/icefall/issues/240 + # and https://github.com/k2-fsa/icefall/issues/297 + # for why we don't use true_dist[ignored] = 0 here + true_dist = torch.where( + ignored.unsqueeze(1).repeat(1, true_dist.shape[1]), + torch.zeros_like(true_dist), + true_dist, + ) loss = -1 * (torch.log_softmax(x, dim=1) * true_dist) if self.reduction == "sum": diff --git a/egs/librispeech/ASR/pruned_transducer_stateless/decode.py b/egs/librispeech/ASR/pruned_transducer_stateless/decode.py index 8e924bf96..49b1308b0 100755 --- a/egs/librispeech/ASR/pruned_transducer_stateless/decode.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless/decode.py @@ -98,27 +98,28 @@ def get_parser(): "--epoch", type=int, default=28, - help="It specifies the checkpoint to use for decoding." - "Note: Epoch counts from 0.", + help="""It specifies the checkpoint to use for decoding. + Note: Epoch counts from 0. 
diff --git a/egs/librispeech/ASR/pruned_transducer_stateless/decode.py b/egs/librispeech/ASR/pruned_transducer_stateless/decode.py
index 8e924bf96..49b1308b0 100755
--- a/egs/librispeech/ASR/pruned_transducer_stateless/decode.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless/decode.py
@@ -98,27 +98,28 @@ def get_parser():
         "--epoch",
         type=int,
         default=28,
-        help="It specifies the checkpoint to use for decoding."
-        "Note: Epoch counts from 0.",
+        help="""It specifies the checkpoint to use for decoding.
+        Note: Epoch counts from 0.
+        You can specify --avg to use more checkpoints for model averaging.""",
     )
+
+    parser.add_argument(
+        "--iter",
+        type=int,
+        default=0,
+        help="""If positive, --epoch is ignored and it
+        will use the checkpoint exp_dir/checkpoint-iter.pt.
+        You can specify --avg to use more checkpoints for model averaging.
+        """,
+    )
+
     parser.add_argument(
         "--avg",
         type=int,
        default=15,
         help="Number of checkpoints to average. Automatically select "
         "consecutive checkpoints before the checkpoint specified by "
-        "'--epoch'. ",
-    )
-
-    parser.add_argument(
-        "--avg-last-n",
-        type=int,
-        default=0,
-        help="""If positive, --epoch and --avg are ignored and it
-        will use the last n checkpoints exp_dir/checkpoint-xxx.pt
-        where xxx is the number of processed batches while
-        saving that checkpoint.
-        """,
+        "'--epoch' and '--iter'",
     )
 
     parser.add_argument(
@@ -453,13 +454,19 @@ def main():
     )
 
     params.res_dir = params.exp_dir / params.decoding_method
-    params.suffix = f"epoch-{params.epoch}-avg-{params.avg}"
+    if params.iter > 0:
+        params.suffix = f"iter-{params.iter}-avg-{params.avg}"
+    else:
+        params.suffix = f"epoch-{params.epoch}-avg-{params.avg}"
+
     if "fast_beam_search" in params.decoding_method:
         params.suffix += f"-beam-{params.beam}"
         params.suffix += f"-max-contexts-{params.max_contexts}"
         params.suffix += f"-max-states-{params.max_states}"
     elif "beam_search" in params.decoding_method:
-        params.suffix += f"-beam-{params.beam_size}"
+        params.suffix += (
+            f"-{params.decoding_method}-beam-size-{params.beam_size}"
+        )
     else:
         params.suffix += f"-context-{params.context_size}"
         params.suffix += f"-max-sym-per-frame-{params.max_sym_per_frame}"
@@ -485,8 +492,20 @@ def main():
     logging.info("About to create model")
     model = get_transducer_model(params)
 
-    if params.avg_last_n > 0:
-        filenames = find_checkpoints(params.exp_dir)[: params.avg_last_n]
+    if params.iter > 0:
+        filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
+            : params.avg
+        ]
+        if len(filenames) == 0:
+            raise ValueError(
+                f"No checkpoints found for"
+                f" --iter {params.iter}, --avg {params.avg}"
+            )
+        elif len(filenames) < params.avg:
+            raise ValueError(
+                f"Not enough checkpoints ({len(filenames)}) found for"
+                f" --iter {params.iter}, --avg {params.avg}"
+            )
         logging.info(f"averaging {filenames}")
         model.to(device)
         model.load_state_dict(average_checkpoints(filenames, device=device))
diff --git a/icefall/__init__.py b/icefall/__init__.py
index e69de29bb..f466d6a62 100644
--- a/icefall/__init__.py
+++ b/icefall/__init__.py
@@ -0,0 +1,55 @@
+from .checkpoint import (
+    average_checkpoints,
+    find_checkpoints,
+    load_checkpoint,
+    remove_checkpoints,
+    save_checkpoint,
+    save_checkpoint_with_global_batch_idx,
+)
+
+from .decode import (
+    get_lattice,
+    nbest_decoding,
+    nbest_oracle,
+    one_best_decoding,
+    rescore_with_attention_decoder,
+    rescore_with_n_best_list,
+    rescore_with_whole_lattice,
+)
+
+from .dist import (
+    cleanup_dist,
+    setup_dist,
+)
+
+from .env import (
+    get_env_info,
+    get_git_branch_name,
+    get_git_date,
+    get_git_sha1,
+)
+
+from .utils import (
+    AttributeDict,
+    MetricsTracker,
+    add_eos,
+    add_sos,
+    concat,
+    encode_supervisions,
+    get_alignments,
+    get_executor,
+    get_texts,
+    l1_norm,
+    l2_norm,
+    linf_norm,
+    load_alignments,
+    make_pad_mask,
+    measure_gradient_norms,
+    measure_weight_norms,
+    optim_step_and_measure_param_change,
+    save_alignments,
+    setup_logger,
+    store_transcripts,
+    str2bool,
+    write_error_stats,
+)
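With the new icefall/__init__.py, the helpers re-exported above become importable from the package root rather than from their defining submodules. A small usage sketch, assuming icefall is installed:

```python
# Usage sketch: the re-exports above make these imports possible from
# the package root instead of icefall.utils, icefall.checkpoint, etc.
from icefall import AttributeDict, str2bool

params = AttributeDict({"exp_dir": "exp", "avg": 15})
print(params.exp_dir)    # attribute-style access to dict entries
print(str2bool("true"))  # True
```

Since flake8 and isort would both flag a module that only re-exports names, the file is excluded in .flake8 and skipped in pyproject.toml (see the first and last hunks of this patch).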
diff --git a/icefall/checkpoint.py b/icefall/checkpoint.py
index 251456c95..1ef05d964 100644
--- a/icefall/checkpoint.py
+++ b/icefall/checkpoint.py
@@ -216,27 +216,62 @@ def save_checkpoint_with_global_batch_idx(
     )
 
 
-def find_checkpoints(out_dir: Path) -> List[str]:
+def find_checkpoints(out_dir: Path, iteration: int = 0) -> List[str]:
     """Find all available checkpoints in a directory.
 
     The checkpoint filenames have the form: `checkpoint-xxx.pt`
     where xxx is a numerical value.
 
+    Assume you have the following checkpoints in the folder `foo`:
+
+        - checkpoint-1.pt
+        - checkpoint-20.pt
+        - checkpoint-300.pt
+        - checkpoint-4000.pt
+
+    Case 1 (Return all checkpoints)::
+
+        find_checkpoints(out_dir='foo')
+
+    Case 2 (Return checkpoints newer than checkpoint-20.pt, i.e.,
+    checkpoint-4000.pt, checkpoint-300.pt, and checkpoint-20.pt)::
+
+        find_checkpoints(out_dir='foo', iteration=20)
+
+    Case 3 (Return checkpoints older than checkpoint-20.pt, i.e.,
+    checkpoint-20.pt, checkpoint-1.pt)::
+
+        find_checkpoints(out_dir='foo', iteration=-20)
+
     Args:
       out_dir:
         The directory where to search for checkpoints.
+      iteration:
+        If it is 0, return all available checkpoints.
+        If it is positive, return the checkpoints whose iteration number is
+        greater than or equal to `iteration`.
+        If it is negative, return the checkpoints whose iteration number is
+        less than or equal to `-iteration`.
     Returns:
       Return a list of checkpoint filenames, sorted in descending
      order by the numerical value in the filename.
    """
     checkpoints = list(glob.glob(f"{out_dir}/checkpoint-[0-9]*.pt"))
     pattern = re.compile(r"checkpoint-([0-9]+).pt")
-    idx_checkpoints = [
+    iter_checkpoints = [
         (int(pattern.search(c).group(1)), c) for c in checkpoints
     ]
+    # iter_checkpoints is a list of tuples. Each tuple contains
+    # two elements: (iteration_number, checkpoint-iteration_number.pt)
+
+    iter_checkpoints = sorted(
+        iter_checkpoints, reverse=True, key=lambda x: x[0]
+    )
+    if iteration >= 0:
+        ans = [ic[1] for ic in iter_checkpoints if ic[0] >= iteration]
+    else:
+        ans = [ic[1] for ic in iter_checkpoints if ic[0] <= -iteration]
 
-    idx_checkpoints = sorted(idx_checkpoints, reverse=True, key=lambda x: x[0])
-    ans = [ic[1] for ic in idx_checkpoints]
     return ans
diff --git a/icefall/env.py b/icefall/env.py
index 0684c4bf1..c29cbb078 100644
--- a/icefall/env.py
+++ b/icefall/env.py
@@ -95,6 +95,7 @@ def get_env_info() -> Dict[str, Any]:
         "k2-git-sha1": k2.version.__git_sha1__,
         "k2-git-date": k2.version.__git_date__,
         "lhotse-version": lhotse.__version__,
+        "torch-version": torch.__version__,
         "torch-cuda-available": torch.cuda.is_available(),
         "torch-cuda-version": torch.version.cuda,
         "python-version": sys.version[:3],
diff --git a/pyproject.toml b/pyproject.toml
index 01ff869db..ec5623f90 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,5 +1,6 @@
 [tool.isort]
 profile = "black"
+skip = ["icefall/__init__.py"]
 
 [tool.black]
 line-length = 80
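To make the find_checkpoints change above concrete, here is a self-contained sketch mirroring the docstring's three cases and the selection decode.py now performs for --iter/--avg. The filenames are hypothetical; zero-byte stand-in files suffice because find_checkpoints only inspects names:

```python
# Self-contained sketch of the new filtering logic in find_checkpoints.
# Zero-byte stand-in files are enough, since only filenames are inspected.
import tempfile
from pathlib import Path

from icefall.checkpoint import find_checkpoints

with tempfile.TemporaryDirectory() as d:
    for i in (1, 20, 300, 4000):
        (Path(d) / f"checkpoint-{i}.pt").touch()

    print(find_checkpoints(d))                 # all four, newest first
    print(find_checkpoints(d, iteration=20))   # checkpoint-4000/300/20
    print(find_checkpoints(d, iteration=-20))  # checkpoint-20/1

    # decode.py with `--iter 20 --avg 2` selects the checkpoint saved at
    # iteration 20 plus the one before it, then averages them:
    print(find_checkpoints(d, iteration=-20)[:2])  # checkpoint-20, checkpoint-1
```

Because the list is sorted in descending order, slicing with `[: params.avg]` after passing `iteration=-params.iter` yields the requested checkpoint and its most recent predecessors, which is what the decode.py hunk above relies on.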