Mirror of https://github.com/k2-fsa/icefall.git
New label smoothing (#109)
* Modify label smoothing to match the one implemented in PyTorch.
* Enable CI for torch 1.10.
* Fix CI errors.
* Fix CI installation errors.
* Fix CI installation errors.
* Minor fixes.
* Minor fixes.
* Minor fixes.
* Minor fixes.
* Minor fixes.
* Fix CI errors.
This commit is contained in:
parent 10e46f3e1d
commit 336283f872
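For context: torch 1.10 added a label_smoothing argument to torch.nn.CrossEntropyLoss, and the new loss introduced below is written (and tested) to be numerically interchangeable with it. A minimal sketch of the built-in form; the shapes here are illustrative, not taken from the recipe:

    import torch

    # Requires torch >= 1.10: `label_smoothing` was added in that release.
    criterion = torch.nn.CrossEntropyLoss(
        ignore_index=-1, reduction="sum", label_smoothing=0.1
    )
    logits = torch.randn(8, 500)            # (num_positions, num_classes)
    targets = torch.randint(-1, 500, (8,))  # -1 marks positions to ignore
    loss = criterion(logits, targets)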
.github/workflows/run-pretrained.yml (vendored, 9 changed lines)
@@ -31,8 +31,9 @@ jobs:
       matrix:
         os: [ubuntu-18.04]
         python-version: [3.6, 3.7, 3.8, 3.9]
-        torch: ["1.8.1"]
-        k2-version: ["1.9.dev20210919"]
+        torch: ["1.10.0"]
+        torchaudio: ["0.10.0"]
+        k2-version: ["1.9.dev20211101"]

       fail-fast: false

@@ -49,7 +50,9 @@ jobs:
     - name: Install Python dependencies
       run: |
         python3 -m pip install --upgrade pip pytest
-        pip install torch==${{ matrix.torch }}+cpu -f https://download.pytorch.org/whl/torch_stable.html
+        # numpy 1.20.x does not support python 3.6
+        pip install numpy==1.19
+        pip install torch==${{ matrix.torch }}+cpu torchaudio==${{ matrix.torchaudio }}+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html
         pip install k2==${{ matrix.k2-version }}+cpu.torch${{ matrix.torch }} -f https://k2-fsa.org/nightly/

         python3 -m pip install git+https://github.com/lhotse-speech/lhotse
.github/workflows/run-yesno-recipe.yml (vendored, 6 changed lines)
@@ -33,8 +33,9 @@ jobs:
       # TODO: enable macOS for CPU testing
       os: [ubuntu-18.04]
       python-version: [3.8]
-      torch: ["1.8.1"]
-      k2-version: ["1.9.dev20210919"]
+      torch: ["1.10.0"]
+      torchaudio: ["0.10.0"]
+      k2-version: ["1.9.dev20211101"]
       fail-fast: false

     steps:

@@ -57,6 +58,7 @@ jobs:
     - name: Install Python dependencies
       run: |
         python3 -m pip install -U pip
+        pip install torch==${{ matrix.torch }}+cpu torchaudio==${{ matrix.torchaudio }}+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html
         pip install k2==${{ matrix.k2-version }}+cpu.torch${{ matrix.torch }} -f https://k2-fsa.org/nightly/
         python3 -m pip install git+https://github.com/lhotse-speech/lhotse
.github/workflows/test.yml (vendored, 28 changed lines)
@@ -33,8 +33,14 @@ jobs:
       # disable macOS test for now.
       os: [ubuntu-18.04]
       python-version: [3.6, 3.7, 3.8, 3.9]
-      torch: ["1.8.1"]
-      k2-version: ["1.9.dev20210919"]
+      torch: ["1.8.0", "1.10.0"]
+      torchaudio: ["0.8.0", "0.10.0"]
+      k2-version: ["1.9.dev20211101"]
+      exclude:
+        - torch: "1.8.0"
+          torchaudio: "0.10.0"
+        - torch: "1.10.0"
+          torchaudio: "0.8.0"

      fail-fast: false

@@ -58,6 +64,15 @@ jobs:
     - name: Install Python dependencies
       run: |
         python3 -m pip install --upgrade pip pytest
+        # numpy 1.20.x does not support python 3.6
+        pip install numpy==1.19
+        pip install torch==${{ matrix.torch }}+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html
+        if [[ ${{ matrix.torchaudio }} == "0.10.0" ]]; then
+          pip install torchaudio==${{ matrix.torchaudio }}+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html
+        else
+          pip install torchaudio==${{ matrix.torchaudio }}
+        fi
+
         pip install k2==${{ matrix.k2-version }}+cpu.torch${{ matrix.torch }} -f https://k2-fsa.org/nightly/
         pip install git+https://github.com/lhotse-speech/lhotse
         # icefall requirements
@@ -83,7 +98,10 @@ jobs:
         ls -lh
         export PYTHONPATH=$PWD:$PWD/lhotse:$PYTHONPATH
         echo $PYTHONPATH
-        pytest ./test
+        pytest -v -s ./test
+        # run tests for conformer ctc
+        cd egs/librispeech/ASR/conformer_ctc
+        pytest -v -s

     - name: Run tests
       if: startsWith(matrix.os, 'macos')
@@ -93,8 +111,8 @@ jobs:
         lib_path=$(python -c "from distutils.sysconfig import get_python_lib; print(get_python_lib())")
         echo "lib_path: $lib_path"
         export DYLD_LIBRARY_PATH=$lib_path:$DYLD_LIBRARY_PATH
-        pytest ./test
+        pytest -v -s ./test

         # run tests for conformer ctc
         cd egs/librispeech/ASR/conformer_ctc
-        pytest
+        pytest -v -s
egs/librispeech/ASR/conformer_ctc/label_smoothing.py (new file, 98 lines)
@@ -0,0 +1,98 @@
+# Copyright  2021  Xiaomi Corp.  (authors: Fangjun Kuang)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+
+
+class LabelSmoothingLoss(torch.nn.Module):
+    """
+    Implement the LabelSmoothingLoss proposed in the following paper:
+    https://arxiv.org/pdf/1512.00567.pdf
+    (Rethinking the Inception Architecture for Computer Vision)
+    """
+
+    def __init__(
+        self,
+        ignore_index: int = -1,
+        label_smoothing: float = 0.1,
+        reduction: str = "sum",
+    ) -> None:
+        """
+        Args:
+          ignore_index:
+            ignored class id
+          label_smoothing:
+            smoothing rate (0.0 means the conventional cross entropy loss)
+          reduction:
+            It has the same meaning as the reduction in
+            `torch.nn.CrossEntropyLoss`. It can be one of the following three
+            values: (1) "none": no reduction will be applied. (2) "mean": the
+            mean of the output is taken. (3) "sum": the output will be summed.
+        """
+        super().__init__()
+        assert 0.0 <= label_smoothing < 1.0
+        self.ignore_index = ignore_index
+        self.label_smoothing = label_smoothing
+        self.reduction = reduction
+
+    def forward(self, x: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
+        """
+        Compute the loss between x and target.
+
+        Args:
+          x:
+            prediction of dimension
+            (batch_size, input_length, number_of_classes).
+          target:
+            target masked with self.ignore_index of
+            dimension (batch_size, input_length).
+
+        Returns:
+          The loss reduced according to self.reduction: a scalar tensor for
+          "sum" and "mean", or a tensor of shape (batch_size * input_length,)
+          for "none".
+        """
+        assert x.ndim == 3
+        assert target.ndim == 2
+        assert x.shape[:2] == target.shape
+        num_classes = x.size(-1)
+        x = x.reshape(-1, num_classes)
+        # Now x is of shape (N*T, C)
+
+        # We don't want to change target in-place below,
+        # so we make a copy of it here
+        target = target.clone().reshape(-1)
+
+        ignored = target == self.ignore_index
+        target[ignored] = 0
+
+        true_dist = torch.nn.functional.one_hot(
+            target, num_classes=num_classes
+        ).to(x)
+
+        true_dist = (
+            true_dist * (1 - self.label_smoothing)
+            + self.label_smoothing / num_classes
+        )
+        # Set the value of ignored indexes to 0
+        true_dist[ignored] = 0
+
+        loss = -1 * (torch.log_softmax(x, dim=1) * true_dist)
+        if self.reduction == "sum":
+            return loss.sum()
+        elif self.reduction == "mean":
+            return loss.sum() / (~ignored).sum()
+        else:
+            return loss.sum(dim=-1)
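A minimal usage sketch of the new module; the shapes are illustrative, and the import assumes it is run from this directory:

    import torch
    from label_smoothing import LabelSmoothingLoss

    loss_func = LabelSmoothingLoss(
        ignore_index=-1, label_smoothing=0.1, reduction="sum"
    )
    x = torch.randn(2, 5, 100)            # (batch_size, input_length, num_classes) logits
    tgt = torch.randint(-1, 100, (2, 5))  # -1 marks positions to ignore
    loss = loss_func(x, tgt)              # scalar: summed over non-ignored positions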
egs/librispeech/ASR/conformer_ctc/test_label_smoothing.py (new executable file, 52 lines)
@@ -0,0 +1,52 @@
+#!/usr/bin/env python3
+# Copyright  2021  Xiaomi Corp.  (authors: Fangjun Kuang)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from distutils.version import LooseVersion
+
+import torch
+from label_smoothing import LabelSmoothingLoss
+
+torch_ver = LooseVersion(torch.__version__)
+
+
+def test_with_torch_label_smoothing_loss():
+    if torch_ver < LooseVersion("1.10.0"):
+        print(f"Current torch version: {torch_ver}")
+        print("Please use torch >= 1.10 to run this test - skipping")
+        return
+    torch.manual_seed(20211105)
+    x = torch.rand(20, 30, 5000)
+    tgt = torch.randint(low=-1, high=x.size(-1), size=x.shape[:2])
+    for reduction in ["none", "sum", "mean"]:
+        custom_loss_func = LabelSmoothingLoss(
+            ignore_index=-1, label_smoothing=0.1, reduction=reduction
+        )
+        custom_loss = custom_loss_func(x, tgt)
+
+        torch_loss_func = torch.nn.CrossEntropyLoss(
+            ignore_index=-1, reduction=reduction, label_smoothing=0.1
+        )
+        torch_loss = torch_loss_func(x.reshape(-1, x.size(-1)), tgt.reshape(-1))
+        assert torch.allclose(custom_loss, torch_loss)
+
+
+def main():
+    test_with_torch_label_smoothing_loss()
+
+
+if __name__ == "__main__":
+    main()
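The test exercises all three reductions against torch.nn.CrossEntropyLoss, whose label_smoothing argument is new in torch 1.10 (hence the version guard). The logits are flattened to (batch * time, num_classes) for the built-in loss, which expects 2-D input in that form, while the custom loss consumes the 3-D layout directly.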
egs/librispeech/ASR/conformer_ctc/transformer.py
@@ -20,6 +20,7 @@ from typing import Dict, List, Optional, Tuple

 import torch
 import torch.nn as nn
+from label_smoothing import LabelSmoothingLoss
 from subsampling import Conv2dSubsampling, VggSubsampling
 from torch.nn.utils.rnn import pad_sequence

@@ -152,7 +153,7 @@ class Transformer(nn.Module):
                 d_model, self.decoder_num_class
             )

-            self.decoder_criterion = LabelSmoothingLoss(self.decoder_num_class)
+            self.decoder_criterion = LabelSmoothingLoss()
         else:
             self.decoder_criterion = None

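Note that the new loss no longer takes the class count at construction: LabelSmoothingLoss infers num_classes from x.size(-1) at call time, so decoder_num_class is not needed here.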
@@ -799,73 +800,6 @@ class Noam(object):
             setattr(self, key, value)


-class LabelSmoothingLoss(nn.Module):
-    """
-    Label-smoothing loss. KL-divergence between
-    q_{smoothed ground truth prob.}(w)
-    and p_{prob. computed by model}(w) is minimized.
-    Modified from
-    https://github.com/espnet/espnet/blob/master/espnet/nets/pytorch_backend/transformer/label_smoothing_loss.py  # noqa
-
-    Args:
-        size: the number of classes
-        padding_idx: ignored class id
-        smoothing: smoothing rate (0.0 means the conventional CE)
-        normalize_length: normalize loss by sequence length if True
-        criterion: loss function to be smoothed
-    """
-
-    def __init__(
-        self,
-        size: int,
-        padding_idx: int = -1,
-        smoothing: float = 0.1,
-        normalize_length: bool = False,
-        criterion: nn.Module = nn.KLDivLoss(reduction="none"),
-    ) -> None:
-        """Construct a LabelSmoothingLoss object."""
-        super(LabelSmoothingLoss, self).__init__()
-        self.criterion = criterion
-        self.padding_idx = padding_idx
-        assert 0.0 < smoothing <= 1.0
-        self.confidence = 1.0 - smoothing
-        self.smoothing = smoothing
-        self.size = size
-        self.true_dist = None
-        self.normalize_length = normalize_length
-
-    def forward(self, x: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
-        """
-        Compute loss between x and target.
-
-        Args:
-            x:
-                prediction of dimension
-                (batch_size, input_length, number_of_classes).
-            target:
-                target masked with self.padding_idx of
-                dimension (batch_size, input_length).
-
-        Returns:
-            A scalar tensor containing the loss without normalization.
-        """
-        assert x.size(2) == self.size
-        # batch_size = x.size(0)
-        x = x.view(-1, self.size)
-        target = target.view(-1)
-        with torch.no_grad():
-            true_dist = x.clone()
-            true_dist.fill_(self.smoothing / (self.size - 1))
-            ignore = target == self.padding_idx  # (B,)
-            total = len(target) - ignore.sum().item()
-            target = target.masked_fill(ignore, 0)  # avoid -1 index
-            true_dist.scatter_(1, target.unsqueeze(1), self.confidence)
-        kl = self.criterion(torch.log_softmax(x, dim=1), true_dist)
-        # denom = total if self.normalize_length else batch_size
-        denom = total if self.normalize_length else 1
-        return kl.masked_fill(ignore.unsqueeze(1), 0).sum() / denom
-
-
 def encoder_padding_mask(
     max_len: int, supervisions: Optional[Supervisions] = None
 ) -> Optional[torch.Tensor]:
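For comparison: the removed ESPnet-style class puts confidence = 1 - smoothing on the target class and spreads smoothing / (size - 1) over the remaining classes only, while the new label_smoothing.py (matching torch.nn.CrossEntropyLoss with label_smoothing set) mixes the one-hot target with a uniform distribution over all classes, i.e. (1 - label_smoothing) * one_hot + label_smoothing / num_classes. The two target distributions therefore differ slightly, including in the mass assigned to the true class, which is presumably why the old class is removed outright rather than reparameterized.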