Configuration changes: scores limit 5 -> 10, min_prob 0.05 -> 0.1, more aggressive cur_grad_scale increase

commit 84580ec022
parent 9672dffac2
Author: Daniel Povey
Date: 2022-10-22 14:09:53 +08:00

3 changed files with 3 additions and 3 deletions

@@ -649,7 +649,7 @@ class AttentionDownsample(torch.nn.Module):
         scores = (src * self.query).sum(dim=-1, keepdim=True)
         scores = penalize_abs_values_gt(scores,
-                                        limit=5.0,
+                                        limit=10.0,
                                         penalty=1.0e-04)
         weights = scores.softmax(dim=1)
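
For context on this hunk: penalize_abs_values_gt is an identity in the forward pass but, in the backward pass, adds a small extra gradient to elements whose absolute value exceeds `limit`, nudging them back toward the allowed range; raising the limit from 5.0 to 10.0 therefore loosens the soft clamp on the attention scores. A minimal sketch of that mechanism (illustrative only, not the actual icefall implementation):

import torch

class _AbsValuePenalty(torch.autograd.Function):
    # Identity in forward; backward adds an extra gradient of magnitude
    # `penalty` on elements with |x| > limit, so that gradient descent
    # shrinks the offending values.
    @staticmethod
    def forward(ctx, x, limit, penalty):
        ctx.save_for_backward(x)
        ctx.limit, ctx.penalty = limit, penalty
        return x

    @staticmethod
    def backward(ctx, grad_output):
        (x,) = ctx.saved_tensors
        over = (x.abs() > ctx.limit).to(x.dtype)
        return grad_output + ctx.penalty * x.sign() * over, None, None

def penalize_abs_values_gt(x, limit, penalty):
    return _AbsValuePenalty.apply(x, limit, penalty)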

@@ -482,7 +482,7 @@ class ActivationBalancer(torch.nn.Module):
         scale_gain_factor: float = 0.02,
         min_abs: float = 0.2,
         max_abs: float = 100.0,
-        min_prob: float = 0.05,
+        min_prob: float = 0.1,
     ):
         super(ActivationBalancer, self).__init__()
         self.num_channels = num_channels
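
For context on this hunk: ActivationBalancer applies its activation-statistics correction only on a random fraction of batches to keep overhead low, and min_prob is the floor on that probability; raising it from 0.05 to 0.1 keeps the correction firing at least 10% of the time late in training. A rough sketch of such a probability schedule (the decay constants and helper names are illustrative, not the icefall code):

import random

def balancer_apply_prob(batch_count: int, min_prob: float = 0.1) -> float:
    # apply often early in training, then decay toward the min_prob floor
    return max(min_prob, 0.5 ** (1.0 + batch_count / 4000.0))

def should_apply_balancer(batch_count: int) -> bool:
    return random.random() < balancer_apply_prob(batch_count)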

@@ -850,7 +850,7 @@ def train_one_epoch(
         # of the grad scaler is configurable, but we can't configure it to have different
         # behavior depending on the current grad scale.
         cur_grad_scale = scaler._scale.item()
-        if cur_grad_scale < 1.0:
+        if cur_grad_scale < 1.0 or (cur_grad_scale < 8.0 and batch_idx % 400 == 0):
             scaler.update(cur_grad_scale * 2.0)
         if cur_grad_scale < 0.01:
             logging.warn(f"Grad scale is small: {cur_grad_scale}")
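
For context on this hunk: torch.cuda.amp.GradScaler only grows its scale after growth_interval consecutive overflow-free steps, so a scale that has collapsed recovers slowly; besides doubling any scale below 1.0 every step, the new condition also doubles a scale below 8.0 once every 400 batches. A simplified sketch of where this check sits in the AMP training step (the loop scaffolding is illustrative; scaler._scale is a private attribute, used here exactly as in the diff):

import logging
import torch

def train_one_epoch_sketch(model, optimizer, train_dl):
    scaler = torch.cuda.amp.GradScaler(enabled=True)
    for batch_idx, batch in enumerate(train_dl):
        optimizer.zero_grad()
        with torch.cuda.amp.autocast():
            loss = model(batch)  # stand-in for the real loss computation
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        cur_grad_scale = scaler._scale.item()
        if cur_grad_scale < 1.0 or (cur_grad_scale < 8.0 and batch_idx % 400 == 0):
            # grow the scale faster than the built-in growth_interval allows
            scaler.update(cur_grad_scale * 2.0)
        if cur_grad_scale < 0.01:
            logging.warning(f"Grad scale is small: {cur_grad_scale}")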