Reduce layer-drop prob

Daniel Povey 2022-03-26 19:36:33 +08:00
parent 0e694739f2
commit b43468bb67


@@ -222,21 +222,20 @@ class ConformerEncoderLayer(nn.Module):
         src_orig = src

         # when warmup == 0.0, alpha is always 0.1, but it gradually changes to
         # always being 1.0 when warmup equals 1.0. The reason for using 0.1 and not
-        # 0.0 is that it gives us a gradient so we can learn something when we are not
-        # being very useful. The occasional 1.0 will ensure, via self.balancer, that
-        # the outputs of our modules don't get scaled up too much.
+        # 0.0 is that it gives us a gradient so we can learn something when we are turned
+        # off.
         #
         # min(0.1, warmup)
         # is used in place of warmup to ensure that even at the start of the warm-up
         # period we sometimes use scale 1.0; this ensures that the modules do not
         # compensate for the small scale by just producing larger output.
         warmup = max(warmup, 0.1)
         if self.training:
-            warmup = min(warmup, 0.95)  # effectively, layer-drop.
+            warmup = min(warmup, 0.98)  # effectively, layer-drop with 1-in-50 prob.
         alpha = 1.0 if torch.rand(()).item() <= warmup else 0.1

         # macaron style feed forward module
-        src = torch.add(src, self.dropout(self.feed_forward_macaron(src)))
+        src = src + self.dropout(self.feed_forward_macaron(src))

         # multi-headed self-attention module
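Note on the change above: during training the warm-up value is now clamped to at most 0.98, so once warm-up has finished the layer is down-weighted (alpha = 0.1) on roughly 2% of forward passes, about 1 in 50, instead of the previous 5% (1 in 20) with the 0.95 cap. Below is a minimal standalone sketch of the scheme; the function name is ours, run_modules stands in for the feed-forward / self-attention / convolution stack, and the final blend with src_orig is an assumption about code outside this hunk (the hunk itself only shows how alpha is drawn).

import torch

def conformer_layer_sketch(run_modules, src, warmup, training=True):
    # Standalone sketch of the warm-up / layer-drop scheme shown in the hunk
    # above; not the actual ConformerEncoderLayer.forward().
    src_orig = src
    warmup = max(warmup, 0.1)       # even at the start of warm-up, full scale ~10% of the time
    if training:
        warmup = min(warmup, 0.98)  # after warm-up: alpha == 0.1 with prob 0.02 (was 0.05)
    alpha = 1.0 if torch.rand(()).item() <= warmup else 0.1
    src = run_modules(src)          # feed-forward, self-attention, convolution, final norm
    # Assumed combination with the layer input (not shown in this hunk): a small
    # alpha mostly bypasses the layer while still giving it a gradient.
    return alpha * src + (1.0 - alpha) * src_orig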
@@ -248,13 +247,13 @@ class ConformerEncoderLayer(nn.Module):
             attn_mask=src_mask,
             key_padding_mask=src_key_padding_mask,
         )[0]
-        src = torch.add(src, self.dropout(src_att))
+        src = src + self.dropout(src_att)

         # convolution module
-        src = torch.add(src, self.dropout(self.conv_module(src)))
+        src = src + self.dropout(self.conv_module(src))

         # feed forward module
-        src = torch.add(src, self.dropout(self.feed_forward(src)))
+        src = src + self.dropout(self.feed_forward(src))

         src = self.norm_final(self.balancer(src))
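The remaining edits in this hunk are purely cosmetic: for tensors, torch.add(a, b) and a + b compute the same elementwise sum, so the rewrites do not change the layer's behaviour. A quick standalone check (not part of the commit):

import torch

# Sanity check that the torch.add -> `+` rewrite above is behaviour-preserving.
a = torch.randn(4, 8)
b = torch.randn(4, 8)
assert torch.equal(a + b, torch.add(a, b))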