Mirror of https://github.com/k2-fsa/icefall.git (synced 2025-12-11 06:55:27 +00:00)
Revert warmup_batches change; make code change to avoid nan in attn_weights
parent b0c87a93d2
commit f9f546968c
@@ -1119,8 +1119,7 @@ def run(rank, world_size, args):
         clipping_scale=2.0,
     )
 
-    scheduler = Eden(optimizer, params.lr_batches, params.lr_epochs,
-                     warmup_batches=1000.0)
+    scheduler = Eden(optimizer, params.lr_batches, params.lr_epochs)
 
     if checkpoints and "optimizer" in checkpoints:
         logging.info("Loading optimizer state dict")
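The hunk above drops the explicit warmup_batches=1000.0 argument and falls back to Eden's default behaviour. As a rough, standalone sketch (the helper name and the 0.5-to-1.0 linear ramp below are illustrative assumptions, not icefall's actual Eden code), a warmup_batches-style argument typically scales the learning rate up over the first batches:

# Illustrative assumption only: a linear warmup multiplier of the kind a
# warmup_batches argument usually controls; not the actual Eden scheduler.
def warmup_factor(batch_count: float, warmup_batches: float = 1000.0) -> float:
    """Ramp a learning-rate multiplier from 0.5 up to 1.0 over warmup_batches."""
    if batch_count >= warmup_batches:
        return 1.0
    return 0.5 + 0.5 * (batch_count / warmup_batches)

# e.g. warmup_factor(0) == 0.5, warmup_factor(500) == 0.75, warmup_factor(1000) == 1.0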
@@ -1412,13 +1412,17 @@ class RelPositionMultiheadAttentionWeights(nn.Module):
 
         if attn_mask is not None:
             assert attn_mask.dtype == torch.bool
-            attn_scores = attn_scores.masked_fill(attn_mask, float("-inf"))
+            # Use -1000 instead of float("-inf") to avoid nan's where attn_mask
+            # and key_padding_mask together mask out every score in a row.
+            # It matters that exp(-1000) is exactly zero, for reasons related to
+            # const_attention_rate, which compares the final weights with zero.
+            attn_scores = attn_scores.masked_fill(attn_mask, -1000)
 
         if key_padding_mask is not None:
             assert key_padding_mask.shape == (batch_size, seq_len), key_padding_mask.shape
             attn_scores = attn_scores.masked_fill(
                 key_padding_mask.unsqueeze(1),
-                float("-inf"),
+                -1000,
             )
 
         # We use our own version of softmax, defined in scaling.py, which should
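As a quick, standalone check of the reasoning in the comment above (this sketch uses torch's built-in softmax rather than the custom softmax from scaling.py that the module actually calls), a fully masked row of scores turns into nan's when filled with float("-inf") but stays finite when filled with -1000:

import torch

scores = torch.zeros(1, 4)                 # one attention row, 4 keys
mask = torch.ones(1, 4, dtype=torch.bool)  # every position masked out

# Filling with -inf: the max-subtraction inside softmax gives -inf - (-inf) = nan.
print(scores.masked_fill(mask, float("-inf")).softmax(dim=-1))
# tensor([[nan, nan, nan, nan]])

# Filling with -1000: the row stays finite (uniform weights here).
print(scores.masked_fill(mask, -1000.0).softmax(dim=-1))
# tensor([[0.2500, 0.2500, 0.2500, 0.2500]])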