Introduce nonlin_skip_rate

2022-12-08 20:35:38 +08:00 · 2022-12-08 20:35:38 +08:00 · 75a1e05e49
commit 75a1e05e49
parent 1718b2de44 a9798b3b75
1 changed files with 7 additions and 2 deletions
--- a/egs/librispeech/ASR/pruned_transducer_stateless7/zipformer.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7/zipformer.py
@ -397,6 +397,7 @@ class ZipformerEncoderLayer(nn.Module):
            # to work correctly.
            layer_skip_rate: FloatLike = ScheduledFloat((0.0, 0.5), (4000.0, 0.05), default=0),
            dynamic_skip_rate: FloatLike = ScheduledFloat((0.0, 0.2), (4000.0, 0.0), default=0),
+            nonlin_skip_rate: FloatLike = ScheduledFloat((0.0, 0.2), (40000, 0.0), default=0),
            const_attention_rate: FloatLike = ScheduledFloat((0.0, 0.25), (4000.0, 0.025), default=0),
            bypass_min: FloatLike = ScheduledFloat((0.0, 0.75), (20000.0, 0.2), default=0),
            bypass_max: FloatLike = 1.0,
@ -406,8 +407,12 @@ class ZipformerEncoderLayer(nn.Module):

        # probability of skipping the entire layer.
        self.layer_skip_rate = copy.deepcopy(layer_skip_rate)
-        # skip probability for dynamic modules (meaning: anything but feedforward)
+        # skip probability for dynamic modules (meaning: anything but feedforward).
        self.dynamic_skip_rate = copy.deepcopy(dynamic_skip_rate)
+        # an additional skip probability that applies to NonlinAttentionModule to stop it from
+        # contributing too much early on.
+        self.nonlin_skip_rate = copy.deepcopy(nonlin_skip_rate)
+
        # min and max for self.bypass_scale, applied with probability 0.5 to avoid grads
        # ever becoming zero.
        self.bypass_min = copy.deepcopy(bypass_min)
@ -534,7 +539,7 @@ class ZipformerEncoderLayer(nn.Module):
                selected_attn_weights = selected_attn_weights * (1.0 / selected_attn_weights.sum(dim=-1, keepdim=True))
                selected_attn_weights = selected_attn_weights.expand(2, -1, -1, -1)

-        if torch.jit.is_scripting() or use_self_attn:
+        if torch.jit.is_scripting() or (use_self_attn and random.random() >= float(self.nonlin_skip_rate)):
            src = src + self.nonlin_attention_module(src,
                                                     selected_attn_weights[0:1])