Apply delay penalty on transducer (#654)

* add delay penalty
* fix CI
* fix CI

parent 65b85b732c
commit 3600ce1b5f
.github/workflows/test.yml (vendored) | 3
@@ -79,6 +79,9 @@ jobs:
          pip uninstall -y protobuf
          pip install --no-binary protobuf protobuf

          pip install kaldifst
          pip install onnxruntime

          pip install -r requirements.txt

      - name: Install graphviz
@@ -81,6 +81,7 @@ class Transducer(nn.Module):
        lm_scale: float = 0.0,
        warmup: float = 1.0,
        reduction: str = "sum",
        delay_penalty: float = 0.0,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Args:
@@ -108,6 +109,11 @@ class Transducer(nn.Module):
            "sum" to sum the losses over all utterances in the batch.
            "none" to return the loss in a 1-D tensor for each utterance
            in the batch.
          delay_penalty:
            A constant value used to penalize symbol delay, to encourage
            streaming models to emit symbols earlier.
            See https://github.com/k2-fsa/k2/issues/955 and
            https://arxiv.org/pdf/2211.00490.pdf for more details.
        Returns:
          Return the transducer loss.
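Aside (not part of the diff): read loosely from the paper cited in the docstring, the penalty turns the training objective into a trade-off between transcription accuracy and emission time, with lambda standing for delay_penalty:

    \mathcal{L}_{\text{total}} \;\approx\; \mathcal{L}_{\text{rnnt}} \;+\; \lambda \cdot \mathbb{E}[\text{total symbol delay}]

With the default lambda = 0.0 training is unchanged; larger values push symbol emissions toward earlier frames, typically at a small accuracy cost.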
@@ -164,6 +170,7 @@ class Transducer(nn.Module):
            am_only_scale=am_scale,
            boundary=boundary,
            reduction=reduction,
            delay_penalty=delay_penalty,
            return_grad=True,
        )

@@ -196,6 +203,7 @@ class Transducer(nn.Module):
            ranges=ranges,
            termination_symbol=blank_id,
            boundary=boundary,
            delay_penalty=delay_penalty,
            reduction=reduction,
        )
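For orientation (not part of the diff): a minimal sketch of where delay_penalty ends up, assuming a k2 build new enough to accept it in rnnt_loss_smoothed (see the k2 issue linked above). The toy shapes B, T, U, V and all values here are made up:

    import k2
    import torch

    B, T, U, V = 2, 10, 4, 6              # batch, frames, symbols, vocab (toy)
    lm = torch.randn(B, U + 1, V)         # decoder ("LM") output
    am = torch.randn(B, T, V)             # encoder ("AM") output
    symbols = torch.randint(1, V, (B, U))
    # boundary rows are [0, 0, num_symbols, num_frames], as in icefall
    boundary = torch.tensor([[0, 0, U, T]] * B, dtype=torch.int64)

    simple_loss, (px_grad, py_grad) = k2.rnnt_loss_smoothed(
        lm=lm,
        am=am,
        symbols=symbols,
        termination_symbol=0,             # blank_id
        lm_only_scale=0.25,
        am_only_scale=0.0,
        boundary=boundary,
        reduction="sum",
        delay_penalty=0.1,                # the new knob; 0.0 disables it
        return_grad=True,
    )
    print(simple_loss)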
@@ -318,6 +318,16 @@ def get_parser():
        help="Whether to use half precision training.",
    )

    parser.add_argument(
        "--delay-penalty",
        type=float,
        default=0.0,
        help="""A constant value used to penalize symbol delay,
        to encourage streaming models to emit symbols earlier.
        See https://github.com/k2-fsa/k2/issues/955 and
        https://arxiv.org/pdf/2211.00490.pdf for more details.""",
    )

    add_model_arguments(parser)

    return parser
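As a sanity check on the new flag (a toy parser for illustration, not the recipe's full get_parser):

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--delay-penalty",
        type=float,
        default=0.0,
        help="Penalize symbol delay; 0.0 disables the penalty.",
    )

    # argparse maps --delay-penalty to args.delay_penalty
    args = parser.parse_args(["--delay-penalty", "0.1"])
    assert args.delay_penalty == 0.1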
@@ -611,6 +621,7 @@ def compute_loss(
            lm_scale=params.lm_scale,
            warmup=warmup,
            reduction="none",
            delay_penalty=params.delay_penalty if warmup >= 2.0 else 0,
        )
        simple_loss_is_finite = torch.isfinite(simple_loss)
        pruned_loss_is_finite = torch.isfinite(pruned_loss)
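Note the warmup >= 2.0 guard: the penalty stays off until the model is past its warm-up phase, so early training is not distorted. A standalone restatement of that gating (the function name is ours, for illustration):

    def effective_delay_penalty(delay_penalty: float, warmup: float) -> float:
        """Return 0 until warm-up is done, mirroring the guard above."""
        return delay_penalty if warmup >= 2.0 else 0.0

    assert effective_delay_penalty(0.1, warmup=1.0) == 0.0  # still warming up
    assert effective_delay_penalty(0.1, warmup=2.5) == 0.1  # penalty active

The same change then lands in the other recipes' model.py and train.py files below.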
@@ -106,6 +106,7 @@ class Transducer(nn.Module):
        lm_scale: float = 0.0,
        warmup: float = 1.0,
        reduction: str = "sum",
        delay_penalty: float = 0.0,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Args:
@@ -136,6 +137,11 @@ class Transducer(nn.Module):
            "sum" to sum the losses over all utterances in the batch.
            "none" to return the loss in a 1-D tensor for each utterance
            in the batch.
          delay_penalty:
            A constant value used to penalize symbol delay, to encourage
            streaming models to emit symbols earlier.
            See https://github.com/k2-fsa/k2/issues/955 and
            https://arxiv.org/pdf/2211.00490.pdf for more details.
        Returns:
          Return the transducer loss.

@@ -203,6 +209,7 @@ class Transducer(nn.Module):
            am_only_scale=am_scale,
            boundary=boundary,
            reduction=reduction,
            delay_penalty=delay_penalty,
            return_grad=True,
        )

@@ -235,6 +242,7 @@ class Transducer(nn.Module):
            ranges=ranges,
            termination_symbol=blank_id,
            boundary=boundary,
            delay_penalty=delay_penalty,
            reduction=reduction,
        )
@@ -341,6 +341,16 @@ def get_parser():
        help="The probability to select a batch from the GigaSpeech dataset",
    )

    parser.add_argument(
        "--delay-penalty",
        type=float,
        default=0.0,
        help="""A constant value used to penalize symbol delay,
        to encourage streaming models to emit symbols earlier.
        See https://github.com/k2-fsa/k2/issues/955 and
        https://arxiv.org/pdf/2211.00490.pdf for more details.""",
    )

    add_model_arguments(parser)

    return parser

@@ -665,6 +675,7 @@ def compute_loss(
            lm_scale=params.lm_scale,
            warmup=warmup,
            reduction="none",
            delay_penalty=params.delay_penalty if warmup >= 2.0 else 0,
        )
        simple_loss_is_finite = torch.isfinite(simple_loss)
        pruned_loss_is_finite = torch.isfinite(pruned_loss)
@@ -328,6 +328,16 @@ def get_parser():
        help="Whether to use half precision training.",
    )

    parser.add_argument(
        "--delay-penalty",
        type=float,
        default=0.0,
        help="""A constant value used to penalize symbol delay,
        to encourage streaming models to emit symbols earlier.
        See https://github.com/k2-fsa/k2/issues/955 and
        https://arxiv.org/pdf/2211.00490.pdf for more details.""",
    )

    add_model_arguments(parser)

    return parser

@@ -623,6 +633,7 @@ def compute_loss(
            lm_scale=params.lm_scale,
            warmup=warmup,
            reduction="none",
            delay_penalty=params.delay_penalty if warmup >= 2.0 else 0,
        )
        simple_loss_is_finite = torch.isfinite(simple_loss)
        pruned_loss_is_finite = torch.isfinite(pruned_loss)
@@ -23,7 +23,6 @@ To run this file, do:
    python ./pruned_transducer_stateless/test_model.py
"""

import torch
from train import get_params, get_transducer_model

@@ -43,8 +42,6 @@ def test_model():
    num_param = sum([p.numel() for p in model.parameters()])
    print(f"Number of model parameters: {num_param}")
    model.__class__.forward = torch.jit.ignore(model.__class__.forward)
    torch.jit.script(model)


def test_model_streaming():
@@ -63,8 +60,6 @@ def test_model_streaming():
    num_param = sum([p.numel() for p in model.parameters()])
    print(f"Number of model parameters: {num_param}")
    model.__class__.forward = torch.jit.ignore(model.__class__.forward)
    torch.jit.script(model)


def main():
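These hunks drop the torch.jit.ignore patch on forward before scripting, presumably because the model is now scriptable directly. A toy, self-contained illustration (TinyModel is hypothetical, not the icefall model) that a typed float argument like delay_penalty poses no problem for TorchScript:

    import torch
    import torch.nn as nn

    class TinyModel(nn.Module):
        def forward(self, x: torch.Tensor, delay_penalty: float = 0.0) -> torch.Tensor:
            # TorchScript handles a typed float default like this without
            # needing torch.jit.ignore on forward.
            return x + delay_penalty

    scripted = torch.jit.script(TinyModel())
    print(scripted(torch.zeros(2), 0.1))  # tensor([0.1000, 0.1000])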
@@ -81,6 +81,7 @@ class Transducer(nn.Module):
        lm_scale: float = 0.0,
        warmup: float = 1.0,
        reduction: str = "sum",
        delay_penalty: float = 0.0,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Args:
@@ -108,6 +109,12 @@ class Transducer(nn.Module):
            "sum" to sum the losses over all utterances in the batch.
            "none" to return the loss in a 1-D tensor for each utterance
            in the batch.
          delay_penalty:
            A constant value used to penalize symbol delay, to encourage
            streaming models to emit symbols earlier.
            See https://github.com/k2-fsa/k2/issues/955 and
            https://arxiv.org/pdf/2211.00490.pdf for more details.
        Returns:
          Return the transducer loss.

@@ -164,6 +171,7 @@ class Transducer(nn.Module):
            am_only_scale=am_scale,
            boundary=boundary,
            reduction=reduction,
            delay_penalty=delay_penalty,
            return_grad=True,
        )

@@ -196,6 +204,7 @@ class Transducer(nn.Module):
            ranges=ranges,
            termination_symbol=blank_id,
            boundary=boundary,
            delay_penalty=delay_penalty,
            reduction=reduction,
        )
@@ -317,6 +317,16 @@ def get_parser():
        help="Whether to use half precision training.",
    )

    parser.add_argument(
        "--delay-penalty",
        type=float,
        default=0.0,
        help="""A constant value used to penalize symbol delay,
        to encourage streaming models to emit symbols earlier.
        See https://github.com/k2-fsa/k2/issues/955 and
        https://arxiv.org/pdf/2211.00490.pdf for more details.""",
    )

    add_model_arguments(parser)

    return parser

@@ -607,6 +617,7 @@ def compute_loss(
            lm_scale=params.lm_scale,
            warmup=warmup,
            reduction="none",
            delay_penalty=params.delay_penalty if warmup >= 2.0 else 0,
        )
        simple_loss_is_finite = torch.isfinite(simple_loss)
        pruned_loss_is_finite = torch.isfinite(pruned_loss)
@@ -106,6 +106,7 @@ class Transducer(nn.Module):
        lm_scale: float = 0.0,
        warmup: float = 1.0,
        reduction: str = "sum",
        delay_penalty: float = 0.0,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Args:
@@ -136,6 +137,11 @@ class Transducer(nn.Module):
            "sum" to sum the losses over all utterances in the batch.
            "none" to return the loss in a 1-D tensor for each utterance
            in the batch.
          delay_penalty:
            A constant value used to penalize symbol delay, to encourage
            streaming models to emit symbols earlier.
            See https://github.com/k2-fsa/k2/issues/955 and
            https://arxiv.org/pdf/2211.00490.pdf for more details.
        Returns:
          Return the transducer loss.

@@ -203,6 +209,7 @@ class Transducer(nn.Module):
            am_only_scale=am_scale,
            boundary=boundary,
            reduction=reduction,
            delay_penalty=delay_penalty,
            return_grad=True,
        )

@@ -235,6 +242,7 @@ class Transducer(nn.Module):
            ranges=ranges,
            termination_symbol=blank_id,
            boundary=boundary,
            delay_penalty=delay_penalty,
            reduction=reduction,
        )
@@ -328,6 +328,16 @@ def get_parser():
        help="The probability to select a batch from the GigaSpeech dataset",
    )

    parser.add_argument(
        "--delay-penalty",
        type=float,
        default=0.0,
        help="""A constant value used to penalize symbol delay,
        to encourage streaming models to emit symbols earlier.
        See https://github.com/k2-fsa/k2/issues/955 and
        https://arxiv.org/pdf/2211.00490.pdf for more details.""",
    )

    add_model_arguments(parser)
    return parser

@@ -645,6 +655,7 @@ def compute_loss(
            lm_scale=params.lm_scale,
            warmup=warmup,
            reduction="none",
            delay_penalty=params.delay_penalty if warmup >= 2.0 else 0,
        )
        simple_loss_is_finite = torch.isfinite(simple_loss)
        pruned_loss_is_finite = torch.isfinite(pruned_loss)
@@ -335,6 +335,16 @@ def get_parser():
        help="Whether to use half precision training.",
    )

    parser.add_argument(
        "--delay-penalty",
        type=float,
        default=0.0,
        help="""A constant value used to penalize symbol delay,
        to encourage streaming models to emit symbols earlier.
        See https://github.com/k2-fsa/k2/issues/955 and
        https://arxiv.org/pdf/2211.00490.pdf for more details.""",
    )

    add_model_arguments(parser)

    return parser

@@ -638,6 +648,7 @@ def compute_loss(
            lm_scale=params.lm_scale,
            warmup=warmup,
            reduction="none",
            delay_penalty=params.delay_penalty if warmup >= 2.0 else 0,
        )
        simple_loss_is_finite = torch.isfinite(simple_loss)
        pruned_loss_is_finite = torch.isfinite(pruned_loss)
@@ -368,6 +368,16 @@ def get_parser():
        help="Whether to use half precision training.",
    )

    parser.add_argument(
        "--delay-penalty",
        type=float,
        default=0.0,
        help="""A constant value used to penalize symbol delay,
        to encourage streaming models to emit symbols earlier.
        See https://github.com/k2-fsa/k2/issues/955 and
        https://arxiv.org/pdf/2211.00490.pdf for more details.""",
    )

    add_model_arguments(parser)

    return parser

@@ -662,6 +672,7 @@ def compute_loss(
            lm_scale=params.lm_scale,
            warmup=warmup,
            reduction="none",
            delay_penalty=params.delay_penalty if warmup >= 2.0 else 0,
        )
        simple_loss_is_finite = torch.isfinite(simple_loss)
        pruned_loss_is_finite = torch.isfinite(pruned_loss)