Mirror of https://github.com/k2-fsa/icefall.git, synced 2025-08-08 09:32:20 +00:00
Parent: b25c234c51
Commit: 0470bbae66

.github/workflows/test.yml (vendored)
@@ -113,6 +113,9 @@ jobs:
       cd ../pruned_transducer_stateless4
       pytest -v -s

+      cd ../pruned_transducer_stateless7
+      pytest -v -s
+
       cd ../transducer_stateless
       pytest -v -s
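The workflow change simply adds the pruned_transducer_stateless7 directory to the list of recipe directories whose tests are run in CI. For reference, a minimal local sketch of the same loop, assuming an icefall checkout with the egs/librispeech/ASR layout used above; invoking pytest.main() instead of the shell command `pytest -v -s` is illustrative only.

import os

import pytest

recipes = [
    "pruned_transducer_stateless4",
    "pruned_transducer_stateless7",  # newly covered by the workflow change
    "transducer_stateless",
]

for recipe in recipes:
    cwd = os.getcwd()
    # The test files import train.py etc. from the recipe directory, hence the cd.
    os.chdir(os.path.join("egs", "librispeech", "ASR", recipe))
    pytest.main(["-v", "-s"])
    os.chdir(cwd)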
@@ -294,7 +294,6 @@ def main():

     if params.jit is True:
         convert_scaled_to_non_scaled(model, inplace=True)
         logging.info("Using torch.jit.script()")
         # We won't use the forward() method of the model in C++, so just ignore
         # it here.
         # Otherwise, one of its arguments is a ragged tensor and is not
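The hunk above touches the torch.jit export path of the recipe's export script. Below is a minimal sketch of that path, assuming `model` is built by the surrounding script; the function name and the output filename are illustrative, not taken from the commit.

import logging

import torch
from scaling_converter import convert_scaled_to_non_scaled


def export_jit(model: torch.nn.Module, filename: str = "cpu_jit.pt") -> None:
    # Replace the Scaled* layers with plain torch modules so the model can be scripted.
    convert_scaled_to_non_scaled(model, inplace=True)
    logging.info("Using torch.jit.script()")
    # We won't use forward() from C++; one of its arguments is a ragged tensor,
    # which torch.jit cannot script, so ignore the method instead of compiling it.
    model.__class__.forward = torch.jit.ignore(model.__class__.forward)
    scripted = torch.jit.script(model)
    scripted.save(filename)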
@@ -20,19 +20,21 @@
 To run this file, do:

     cd icefall/egs/librispeech/ASR
-    python ./pruned_transducer_stateless4/test_model.py
+    python ./pruned_transducer_stateless7/test_model.py
 """

 import torch

+from scaling_converter import convert_scaled_to_non_scaled
 from train import get_params, get_transducer_model


-def test_model_1():
+def test_model():
     params = get_params()
     params.vocab_size = 500
     params.blank_id = 0
     params.context_size = 2
     params.num_encoder_layers = "2,4,3,2,4"
-    # params.feedforward_dims = "1024,1024,1536,1536,1024"
+    params.feedforward_dims = "1024,1024,2048,2048,1024"
     params.nhead = "8,8,8,8,8"
     params.encoder_dims = "384,384,384,384,384"
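The stack-wise hyper-parameters set above are comma-separated strings, one entry per Zipformer encoder stack. A small sketch of how such a string is presumably parsed before the model is built; the helper name is an assumption, not taken from this diff.

def to_int_tuple(s: str) -> tuple:
    # "2,4,3,2,4" -> (2, 4, 3, 2, 4), one value per encoder stack
    return tuple(int(x) for x in s.split(","))


assert to_int_tuple("2,4,3,2,4") == (2, 4, 3, 2, 4)
assert to_int_tuple("1024,1024,2048,2048,1024") == (1024, 1024, 2048, 2048, 1024)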
@@ -47,9 +49,19 @@ def test_model_1():
     num_param = sum([p.numel() for p in model.parameters()])
     print(f"Number of model parameters: {num_param}")

+    # Test jit script
+    convert_scaled_to_non_scaled(model, inplace=True)
+    # We won't use the forward() method of the model in C++, so just ignore
+    # it here.
+    # Otherwise, one of its arguments is a ragged tensor and is not
+    # torch scriptable.
+    model.__class__.forward = torch.jit.ignore(model.__class__.forward)
+    print("Using torch.jit.script")
+    model = torch.jit.script(model)


 def main():
-    test_model_1()
+    test_model()


 if __name__ == "__main__":
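The added block scripts the test model the same way the export script does. Below is a self-contained toy, not taken from the commit, showing why forward() is marked ignored before scripting: the ignored method is left as plain Python and is not compiled, so an argument type that torch.jit cannot handle no longer blocks torch.jit.script.

import torch


class Toy(torch.nn.Module):
    def forward(self, x, extra=None):
        # Imagine `extra` being something torch.jit cannot script (e.g. a ragged tensor).
        return x * 2


# Same pattern as in the diff: mark forward as ignored, then script the module.
Toy.forward = torch.jit.ignore(Toy.forward)
scripted = torch.jit.script(Toy())
print(type(scripted))  # a ScriptModule; forward stays as eager Python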
@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-# Copyright (c) 2021 University of Chinese Academy of Sciences (author: Han Zhu)
+# Copyright 2022 Xiaomi Corp. (authors: Daniel Povey)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
@@ -454,7 +454,7 @@ class ZipformerEncoderLayer(nn.Module):
         # pooling module
         if torch.jit.is_scripting():
             src = src + self.pooling(src, key_padding_mask=src_key_padding_mask)
-        elif random.random() > dynamic_dropout:
+        elif random.random() >= dynamic_dropout:
             src = src + self.pooling(src, key_padding_mask=src_key_padding_mask)

         if torch.jit.is_scripting():
@@ -478,7 +478,7 @@ class ZipformerEncoderLayer(nn.Module):
                 src, src_key_padding_mask=src_key_padding_mask
             )
         else:
-            use_self_attn = random.random() > dynamic_dropout
+            use_self_attn = random.random() >= dynamic_dropout
             if use_self_attn:
                 src_att, attn_weights = self.self_attn(
                     src,
@@ -488,7 +488,7 @@ class ZipformerEncoderLayer(nn.Module):
             )
             src = src + src_att

-        if random.random() > dynamic_dropout:
+        if random.random() >= dynamic_dropout:
            src = src + self.conv_module1(
                src, src_key_padding_mask=src_key_padding_mask
            )
@@ -497,7 +497,7 @@ class ZipformerEncoderLayer(nn.Module):
         if use_self_attn:
             src = src + self.self_attn.forward2(src, attn_weights)

-        if random.random() > dynamic_dropout:
+        if random.random() >= dynamic_dropout:
             src = src + self.conv_module2(
                 src, src_key_padding_mask=src_key_padding_mask
             )
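All four hunks above make the same change: the comparison against dynamic_dropout goes from strict `>` to `>=`. The sketch below only illustrates the boundary difference; the motivation is an inference, not stated in the commit. Since random.random() returns a float in [0.0, 1.0), `>= 0.0` is always true while `> 0.0` can occasionally be false, so with a dropout rate of 0.0 the sub-module is now applied unconditionally, matching the torch.jit.is_scripting() branch.

import random


def keep_module(dynamic_dropout: float) -> bool:
    # Apply the sub-module unless it is "dropped" for this forward pass.
    return random.random() >= dynamic_dropout


assert keep_module(0.0) is True   # rate 0.0: never skipped, random() >= 0.0 always holds
assert keep_module(1.0) is False  # rate 1.0: always skipped, random() < 1.0 always holds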
@@ -1289,12 +1289,6 @@ class RelPositionMultiheadAttention(nn.Module):
             bsz * num_heads, seq_len, seq_len
         )

-        assert list(attn_output_weights.size()) == [
-            bsz * num_heads,
-            seq_len,
-            seq_len,
-        ]
-
         if attn_mask is not None:
             if attn_mask.dtype == torch.bool:
                 attn_output_weights.masked_fill_(attn_mask, float("-inf"))
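The removed assert re-checked the shape produced by the `.view(bsz * num_heads, seq_len, seq_len)` call immediately above it. Dropping it looks safe because `.view()` either returns a tensor of exactly the requested shape or raises; the reasoning below is an inference, not taken from the commit.

import torch

bsz, num_heads, seq_len = 2, 4, 5
flat = torch.randn(bsz * num_heads * seq_len * seq_len)

w = flat.view(bsz * num_heads, seq_len, seq_len)
# The check the commit removes can never fail here: view() guarantees the shape.
assert list(w.size()) == [bsz * num_heads, seq_len, seq_len]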