Have nonlin_attention and attention_squeeze operate only on every other layer.

Daniel Povey 2022-11-28 16:24:24 +08:00
parent 87ef4078d3
commit 9cf5d92f39


@@ -450,6 +450,12 @@ class ZipformerEncoderLayer(nn.Module):
     def remove_attention_weights(self):
         self.self_attn_weights = None
 
+    def remove_nonlin_attention(self):
+        self.nonlin_attention_module = None
+
+    def remove_attention_squeeze(self):
+        self.attention_squeeze = None
+
     def get_bypass_scale(self):
         if torch.jit.is_scripting() or not self.training:
             return self.bypass_scale
@@ -520,14 +526,14 @@ class ZipformerEncoderLayer(nn.Module):
             first_attn_weights = first_attn_weights * (1.0 / first_attn_weights.sum(dim=-1, keepdim=True))
             first_attn_weights = first_attn_weights.expand(3, -1, -1, -1)
 
-        if torch.jit.is_scripting() or use_self_attn:
+        if (torch.jit.is_scripting() or use_self_attn) and self.nonlin_attention_module is not None:
             src = src + self.nonlin_attention_module(src,
                                                      first_attn_weights[0:1])
 
         src = src + self.feed_forward1(src)
 
         # pooling module
-        if torch.jit.is_scripting() or use_self_attn:
+        if (torch.jit.is_scripting() or use_self_attn) and self.attention_squeeze is not None:
             src = src + self.attention_squeeze(src, first_attn_weights[1:2])
 
         if torch.jit.is_scripting() or use_self_attn:
@@ -598,6 +604,10 @@ class ZipformerEncoder(nn.Module):
             cur_begin = cur_end
             if i % attention_share_layers != 0:
                 self.layers[i].remove_attention_weights()
+            if i % attention_share_layers == 0:
+                self.layers[i].remove_nonlin_attention()
+            else:
+                self.layers[i].remove_attention_squeeze()
 
     def forward(
         self,
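
To illustrate the effect of the alternation above: assuming attention_share_layers == 2 (an assumption; the diff does not show its configured value), the construction loop leaves even-indexed layers with the shared attention weights plus attention_squeeze, and odd-indexed layers with nonlin_attention only. Below is a minimal standalone sketch of that bookkeeping, not the icefall implementation: DummyLayer and its attributes are placeholders, and only the method names and the modulo logic mirror the diff.

# Minimal sketch of which optional modules each layer keeps after pruning,
# assuming attention_share_layers == 2.  DummyLayer is a stand-in for
# ZipformerEncoderLayer; object() marks "module present".
class DummyLayer:
    def __init__(self):
        self.self_attn_weights = object()
        self.nonlin_attention_module = object()
        self.attention_squeeze = object()

    def remove_attention_weights(self):
        self.self_attn_weights = None

    def remove_nonlin_attention(self):
        self.nonlin_attention_module = None

    def remove_attention_squeeze(self):
        self.attention_squeeze = None


attention_share_layers = 2
layers = [DummyLayer() for _ in range(6)]

for i, layer in enumerate(layers):
    if i % attention_share_layers != 0:
        layer.remove_attention_weights()   # reuse weights from the previous layer
    if i % attention_share_layers == 0:
        layer.remove_nonlin_attention()    # even layers: attention_squeeze only
    else:
        layer.remove_attention_squeeze()   # odd layers: nonlin_attention only

for i, layer in enumerate(layers):
    print(i,
          "attn_weights" if layer.self_attn_weights is not None else "-",
          "nonlin_attention" if layer.nonlin_attention_module is not None else "-",
          "attention_squeeze" if layer.attention_squeeze is not None else "-")
# Expected: layers 0, 2, 4 keep attn_weights + attention_squeeze;
#           layers 1, 3, 5 keep nonlin_attention only.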