From 0470bbae66d2c9ebc91ee5d0dfa37dfb4df3a9cb Mon Sep 17 00:00:00 2001
From: Zengwei Yao
Date: Tue, 13 Dec 2022 15:47:30 +0800
Subject: [PATCH] minor fix for zipformer recipe (#758)

* minor fix

* add CI test
---
 .github/workflows/test.yml                     |  3 +++
 .../pruned_transducer_stateless7/export.py     |  1 -
 .../test_model.py                              | 20 ++++++++++++++++----
 .../pruned_transducer_stateless7/zipformer.py  | 16 +++++-----------
 4 files changed, 24 insertions(+), 16 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 4dbe99827..c062a2a3d 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -113,6 +113,9 @@ jobs:
         cd ../pruned_transducer_stateless4
         pytest -v -s
 
+        cd ../pruned_transducer_stateless7
+        pytest -v -s
+
         cd ../transducer_stateless
         pytest -v -s
 
diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7/export.py b/egs/librispeech/ASR/pruned_transducer_stateless7/export.py
index 9a6f3ed37..3e3160e7e 100755
--- a/egs/librispeech/ASR/pruned_transducer_stateless7/export.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7/export.py
@@ -294,7 +294,6 @@ def main():
 
     if params.jit is True:
         convert_scaled_to_non_scaled(model, inplace=True)
-        logging.info("Using torch.jit.script()")
         # We won't use the forward() method of the model in C++, so just ignore
         # it here.
         # Otherwise, one of its arguments is a ragged tensor and is not
diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7/test_model.py b/egs/librispeech/ASR/pruned_transducer_stateless7/test_model.py
index db7fb7b3e..cdf914df3 100755
--- a/egs/librispeech/ASR/pruned_transducer_stateless7/test_model.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7/test_model.py
@@ -20,19 +20,21 @@
 To run this file, do:
 
     cd icefall/egs/librispeech/ASR
-    python ./pruned_transducer_stateless4/test_model.py
+    python ./pruned_transducer_stateless7/test_model.py
 """
 
+import torch
+
+from scaling_converter import convert_scaled_to_non_scaled
 from train import get_params, get_transducer_model
 
 
-def test_model_1():
+def test_model():
     params = get_params()
     params.vocab_size = 500
     params.blank_id = 0
     params.context_size = 2
     params.num_encoder_layers = "2,4,3,2,4"
-    # params.feedforward_dims = "1024,1024,1536,1536,1024"
     params.feedforward_dims = "1024,1024,2048,2048,1024"
     params.nhead = "8,8,8,8,8"
     params.encoder_dims = "384,384,384,384,384"
@@ -47,9 +49,19 @@
     num_param = sum([p.numel() for p in model.parameters()])
     print(f"Number of model parameters: {num_param}")
 
+    # Test jit script
+    convert_scaled_to_non_scaled(model, inplace=True)
+    # We won't use the forward() method of the model in C++, so just ignore
+    # it here.
+    # Otherwise, one of its arguments is a ragged tensor and is not
+    # torch scriptable.
+    model.__class__.forward = torch.jit.ignore(model.__class__.forward)
+    print("Using torch.jit.script")
+    model = torch.jit.script(model)
+
 
 def main():
-    test_model_1()
+    test_model()
 
 
 if __name__ == "__main__":
diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7/zipformer.py b/egs/librispeech/ASR/pruned_transducer_stateless7/zipformer.py
index e8fd89abd..ed1e2efa2 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless7/zipformer.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7/zipformer.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-# Copyright (c) 2021 University of Chinese Academy of Sciences (author: Han Zhu)
+# Copyright 2022 Xiaomi Corp. (authors: Daniel Povey)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
@@ -454,7 +454,7 @@ class ZipformerEncoderLayer(nn.Module):
         # pooling module
         if torch.jit.is_scripting():
             src = src + self.pooling(src, key_padding_mask=src_key_padding_mask)
-        elif random.random() > dynamic_dropout:
+        elif random.random() >= dynamic_dropout:
             src = src + self.pooling(src, key_padding_mask=src_key_padding_mask)
 
         if torch.jit.is_scripting():
@@ -478,7 +478,7 @@ class ZipformerEncoderLayer(nn.Module):
                 src, src_key_padding_mask=src_key_padding_mask
             )
         else:
-            use_self_attn = random.random() > dynamic_dropout
+            use_self_attn = random.random() >= dynamic_dropout
             if use_self_attn:
                 src_att, attn_weights = self.self_attn(
                     src,
@@ -488,7 +488,7 @@ class ZipformerEncoderLayer(nn.Module):
                 )
                 src = src + src_att
 
-        if random.random() > dynamic_dropout:
+        if random.random() >= dynamic_dropout:
             src = src + self.conv_module1(
                 src, src_key_padding_mask=src_key_padding_mask
             )
@@ -497,7 +497,7 @@ class ZipformerEncoderLayer(nn.Module):
         if use_self_attn:
             src = src + self.self_attn.forward2(src, attn_weights)
 
-        if random.random() > dynamic_dropout:
+        if random.random() >= dynamic_dropout:
             src = src + self.conv_module2(
                 src, src_key_padding_mask=src_key_padding_mask
             )
@@ -1289,12 +1289,6 @@ class RelPositionMultiheadAttention(nn.Module):
             bsz * num_heads, seq_len, seq_len
         )
 
-        assert list(attn_output_weights.size()) == [
-            bsz * num_heads,
-            seq_len,
-            seq_len,
-        ]
-
        if attn_mask is not None:
            if attn_mask.dtype == torch.bool:
                attn_output_weights.masked_fill_(attn_mask, float("-inf"))
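
Note on the jit-script test added to test_model.py: torch.jit.ignore() can
be applied to an existing method to drop it from scripting, which is how the
recipe compiles the model even though forward() takes a ragged tensor that
TorchScript cannot handle. A minimal standalone sketch of the same pattern,
using a hypothetical toy module in place of the transducer model:

    import torch
    import torch.nn as nn


    class Toy(nn.Module):
        def forward(self, x):
            # Stands in for the real forward(), whose ragged-tensor
            # argument is not torch scriptable.
            return self.encode(x)

        @torch.jit.export
        def encode(self, x: torch.Tensor) -> torch.Tensor:
            return x * 2


    # Same trick as in the patch: drop forward() from scripting and keep
    # the exported methods that will actually be called (e.g. from C++).
    Toy.forward = torch.jit.ignore(Toy.forward)
    scripted = torch.jit.script(Toy())
    print(scripted.encode(torch.ones(2)))  # tensor([2., 2.])

The CI step added to test.yml exercises exactly this: it runs pytest -v -s
from egs/librispeech/ASR/pruned_transducer_stateless7.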
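
Note on the > to >= changes in zipformer.py: random.random() draws from the
half-open interval [0.0, 1.0), so a draw can be exactly 0.0 but never 1.0.
With >=, a dynamic dropout rate of 0.0 therefore always keeps a module
(previously a draw of exactly 0.0 would, with tiny probability, skip it),
while a rate of 1.0 still always skips it; the rate-0.0 case then agrees
with the torch.jit.is_scripting() branch, which applies every module
unconditionally. A small sketch of just the comparison semantics
(keep_module is an illustrative name, not from the recipe):

    import random


    def keep_module(dynamic_dropout: float) -> bool:
        # random.random() is uniform on [0.0, 1.0), so:
        #   rate 0.0 -> draw >= 0.0 is always True  (module always runs)
        #   rate 1.0 -> draw >= 1.0 is always False (module always skipped)
        return random.random() >= dynamic_dropout


    # Boundary rates are deterministic:
    assert all(keep_module(0.0) for _ in range(1000))
    assert not any(keep_module(1.0) for _ in range(1000))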