Fast code seems to pass tests.

2025-09-19 05:54:20 +00:00 · 2022-06-18 12:56:16 +08:00 · 2022-06-18 12:56:16 +08:00 · a6bf32c3e0
commit a6bf32c3e0
parent ea5cd69e3b
1 changed files with 143 additions and 8 deletions
--- a/egs/librispeech/ASR/pruned_transducer_stateless7/optim.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7/optim.py
@ -67,11 +67,19 @@ class NeutralGradient(Optimizer):
                 to the parameter rms) and 0.0 means no such normalization at all (only a scalar
                 per tensor).  In any case we fully normalize the parameter scale at the
                 whole-tensor level, for non-scalar tensors.
      lr_for_speedup: A learning rate that determines how much speedup we aim for by
                 accumulating gradients over multiple batches (the update is done after
                 different delays for different parameters, based on an estimate of their
                 importance).  The target speedup factor will be (lr_for_speedup / lr).
  speedup_first_step: The first step that we start doing the speedup.
   speedup_recalibrate_period: The number of steps between recalibrating the speedup
                 (i.e. choosing the periodicity for updating each parameter tensor)
    """
    def __init__(
            self,
            params,
-            lr=1e-2,
+            lr=3e-02,
            betas=(0.9, 0.98),
            scale_speed=0.1,
            grad_eps=1e-8,
@ -82,6 +90,9 @@ class NeutralGradient(Optimizer):
            estimate_period=2000,
            stats_steps=200,
            param_pow=1.0,
            lr_for_speedup=0.1,
            speedup_first_step=500,  # TEMP.  Make this 3500? must be > warmup.
            speedup_recalibrate_period=100,
    ):
        if not 0.0 <= lr:
@ -119,7 +130,11 @@ class NeutralGradient(Optimizer):
            max_fullcov_size=max_fullcov_size,
            estimate_period=estimate_period,
            stats_steps=stats_steps,
-            param_pow=param_pow
+            param_pow=param_pow,
            lr_for_speedup=lr_for_speedup,
            speedup_first_step=speedup_first_step,
            speedup_recalibrate_period=speedup_recalibrate_period,
            group_step=0,
        )
        super(NeutralGradient, self).__init__(params, defaults)
@ -152,6 +167,24 @@ class NeutralGradient(Optimizer):
            estimate_period = group["estimate_period"]
            stats_steps = group["stats_steps"]
            param_pow = group["param_pow"]
            lr_for_speedup = group["lr_for_speedup"]
            speedup_first_step = group["speedup_first_step"]
            speedup_recalibrate_period = group["speedup_recalibrate_period"]
            group_step = group["group_step"]
            group["group_step"] = group_step + 1
            recalibrate = (group_step >= speedup_first_step and
                           (group_step - speedup_first_step) % speedup_recalibrate_period == 0)
            if recalibrate:
                # param_prods will be an array of products, one per parameter tensor, equal to:
                #   E[grad . step],
                # where "step" is the individual step, before momentum (i.e. computed as if beta1==0).
                # We compute these in such a way as to reflect the expectation if there is no speedup
                # (i.e. assuming we are doing the update on every step with no gradient accumulation).
                # The idea is that this product reflects the importance of each parameter matrix,
                # so that parameter matrices with larger values will need more frequent updates.
                self._recalibrate_speedup(group)
            for p in group["params"]:
                if p.grad is None:
@ -161,13 +194,26 @@ class NeutralGradient(Optimizer):
                grad = p.grad
                if grad.is_sparse:
                    raise RuntimeError(
-                        "Cain does not support sparse gradients"
+                        "NeutralGradient optimizer does not support sparse gradients"
                    )
                state = self.state[p]
                # State initialization
                if len(state) == 0:
                    # For speed, we will actually take a step every
                    # `step_period` times step() is called; we will store the
                    # gradient in state["grad"] until state["n_cached_grads"]
                    # reaches step_period, at which point we'll really do the
                    # update.
                    state["step_period"] = 1
                    state["n_cached_grads"] = 0
                    # Allocate these cached_grad members immediately even though
                    # we may not need them immediately, so that any OOMs will
                    # occur quickly.
                    state["cached_grad"] = torch.zeros_like(
                        p, memory_format=torch.preserve_format
                    )
                    state["step"] = 0
                    # Parameter step (used to implement momentum).
                    state["delta"] = torch.zeros_like(
@ -222,6 +268,33 @@ class NeutralGradient(Optimizer):
                            # but we'll allocate this and deallocate it as needed, for individual
                            # parameters, to save memory.
                step_period = state["step_period"]
                n_cached_grads = state["n_cached_grads"]
                if step_period > 1:
                    cached_grad = state["cached_grad"]
                    cached_grad += grad
                    n_cached_grads += 1
                    state["n_cached_grads"] = n_cached_grads
                    # the point of random_step_prob is to inject a little randomness
                    # into the timing of the updates, so they don't stay synchronized.
                    random_step_prob = 0.05 / step_period
                    if n_cached_grads == step_period or random.random() < random_step_prob:
                        # We will continue to the code below, and do a step
                        grad = cached_grad
                    else:
                        # Don't do a step this time.
                        continue
                else:
                    # We are doing a step on every iteration.
                    n_cached_grads = 1  # will be used to scale gradients below.
                # sqrt_n_grads will be used to scale down estimates of grad variances
                # (in situations where the scale matters), to reflect the sizes of
                # grads of individual
                inv_sqrt_n_grads = n_cached_grads ** 0.5
                delta = state["delta"]
                step = state["step"]
@ -240,7 +313,7 @@ class NeutralGradient(Optimizer):
                        scale_deriv = (p * grad).sum()
                        scale_exp_avg_sq = state["scale_exp_avg_sq"]
                        scale_exp_avg_sq.mul_(beta2).addcmul_(scale_deriv, scale_deriv,
-                                                              value=1 - beta2)
+                                                              value=(1 - beta2)/n_cached_grads)
                        # should actually be step + 1, so on 1st minibatch we are not learning
                        # anything here.  May fix this at some point.
@ -267,7 +340,7 @@ class NeutralGradient(Optimizer):
                        if step % 10 == 0:
                            scale.copy_((p**2).mean().sqrt().add_(param_eps))
                        # Decay the second moment running average coefficient
-                        exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
+                        exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=(1 - beta2)/n_cached_grads)
                        grad_rms = (exp_avg_sq.sqrt()).add_(grad_eps)
                        this_delta = grad / grad_rms
                        alpha = (-(1-beta1) * lr * (bias_correction2 ** 0.5)) * scale
@ -288,7 +361,7 @@ class NeutralGradient(Optimizer):
                        cur_grad = self._change_coordinates(cur_grad, state, forward=True)
                        # Decay the second moment running average coefficient
-                        exp_avg_sq.mul_(beta2).addcmul_(cur_grad, cur_grad, value=1 - beta2)
+                        exp_avg_sq.mul_(beta2).addcmul_(cur_grad, cur_grad, value=(1 - beta2)/n_cached_grads)
                        # _get_step accounts for the fact that we may have reset these
                        # stats when we changed the co-ordinates.
@ -301,6 +374,8 @@ class NeutralGradient(Optimizer):
                        cur_grad = self._change_coordinates(cur_grad, state, forward=False)
                        if random.random() < 0.0002:
                            # This is only for debug.  The logic below would not be valid for n_cache_grads>0,
                            # anyway we will delete this code at some point.
                            # in principle, the cur_grad is supposed to have the same rms as params, on average.
                            cur_grad_rms = (cur_grad**2).mean().sqrt()
                            # _corrected corrects for the overall size of the grad, making cur_grad_rms more similar
@ -326,7 +401,7 @@ class NeutralGradient(Optimizer):
                    # p.numel() == 1.
                    # Decay the second moment running average coefficient
                    exp_avg_sq = state["exp_avg_sq"]
-                    exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
+                    exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=(1 - beta2)/n_cached_grads)
                    bias_correction2 = 1 - beta2 ** (step + 1)
                    denom = (exp_avg_sq.sqrt()).add_(grad_eps)
                    this_delta = grad / denom
@ -714,6 +789,66 @@ class NeutralGradient(Optimizer):
        return P
    def _recalibrate_speedup(self, group: dict):
        param_prods = []
        speedup = group["lr_for_speedup"] / group["lr"]
        if speedup > 10.0:
            speedup = 10.0
        if speedup < 2.0:
            speedup = 2.0
        _, beta2 = group["betas"]
        for p in group["params"]:
            if p.grad is None:
                continue
            # We ignore the part of the parameter change that comes from the scale
            # (see 'scale_deriv' in the code)
            state = self.state[p]
            step = state["step"]
            prod = state["exp_avg_sq"].sqrt().sum()
            is_one_axis = (p.numel() in list(p.shape))  # e.g. a bias
            if is_one_axis or p.numel() == 1:
                if is_one_axis:
                    prod *= state["scale"]
                bias_correction2 = 1 - beta2 ** (step + 1)
            else:
                step_within_period = state["step_within_period"]
                bias_correction2 = 1 - beta2 ** (min(step, step_within_period) + 1)
            prod /= bias_correction2 ** 0.5
            param_prods.append(prod)
        param_prods = torch.stack(param_prods).to('cpu')
        avg_update_freq = 1.0 / speedup
        param_freqs = param_prods * (avg_update_freq / param_prods.mean())
        # Don't delay more than 40 steps for any parameter.  Note: with beta=0.1,
        # this would mean an average delay of about 500 steps before the gradient
        # makes it to the parameter.
        min_freq = 1.0 / 40
        # get the mean after applying limits of [min_freq..1] for the frequencies of
        # update
        param_freqs_mean = param_freqs.clamp(min=min_freq, max=1.0).mean()
        # renormalize after applying min and max limits
        param_freqs = param_freqs * (avg_update_freq / param_freqs_mean)
        # ...and apply the limits again.
        param_freqs = param_freqs.clamp(min=min_freq, max=1.0)
        param_periods = (1.0 / param_freqs).to(torch.int32)  # .to(torch.int32) rounds down
        param_prods = param_prods.tolist()
        param_freqs = param_freqs.tolist()
        param_periods = param_periods.tolist()
        print("speedup = ", speedup)
        for i,p in enumerate(group["params"]):
            if p.grad is None:
                continue
            print(f"Shape = {p.shape}, prod = {param_prods[i]}, freq = {param_freqs[i]}, period = {param_periods[i]}.")
        # Do nothing more, as yet.