From 3110138ab578eb18c465b2f5ee76cd0428921e19 Mon Sep 17 00:00:00 2001
From: Daniel Povey
Date: Sat, 30 Jul 2022 07:30:32 +0800
Subject: [PATCH 1/2] Smooth grad_cov with eps; add a 4th stage of smoothing,
 this time on Z_inv.

---
 .../ASR/pruned_transducer_stateless7/optim.py | 27 ++++++++++++++++---
 1 file changed, 23 insertions(+), 4 deletions(-)

diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7/optim.py b/egs/librispeech/ASR/pruned_transducer_stateless7/optim.py
index 945e3ab19..5ebe88b4d 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless7/optim.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7/optim.py
@@ -114,7 +114,7 @@ param_rms_smooth1: Smoothing proportion for parameter matrix, if assumed rank of
                      param covariance equals the dimension of the covariance matrix.
                      param_rms_smooth{0,1} determine the smoothing proportions for other
                      conditions.
-    cov_min,cov_max: [IMPORTANT] 4-tuples of minimums and maximums of the diagonal values of
+    cov_min,cov_max: [IMPORTANT] 5-tuples of minimums and maximums of the diagonal values of
                      covariance matrices, after normalizing to unit-mean. The first 3 are for
                      smoothing the parameter covariance, normalized in 3 different ways:
                        (1) relative to its own diagonal (in a basis that diagonalizes the grad
@@ -123,6 +123,11 @@ param_rms_smooth1: Smoothing proportion for parameter matrix, if assumed rank of
                        (3) in the canonical basis.
 
                      (4) is for smoothing the grad covariance used for (2)
+
+                 (5) is for smoothing the final learning-rate matrix Z relative to
+                     its own diagonal. Only the cov_max is actually used.
+                     The cov_min ends up not affecting the result, so we set it
+                     to 0.0.
     cov_pow:         This was mainly added for development and experimentation purposes;
                      it allows you to smooth the parameter covariance matrices at the
                      stages (1), (2), (3) of smoothing mentioned above, and also
@@ -162,8 +167,8 @@ param_rms_smooth1: Smoothing proportion for parameter matrix, if assumed rank of
             lr=3e-02,
             betas=(0.9, 0.98),
             size_lr_scale=0.1,
-            cov_min=(0.025, 0.0025, 0.02, 0.0001),
-            cov_max=(10.0, 80.0, 5.0, 400.0),
+            cov_min=(0.025, 0.0025, 0.02, 0.0001, 0.0),
+            cov_max=(10.0, 80.0, 5.0, 400.0, 10.0),
             cov_pow=(1.0, 1.0, 1.0, 1.0),
             param_rms_smooth0=0.4,
             param_rms_smooth1=0.2,
@@ -877,6 +882,9 @@ param_rms_smooth1: Smoothing proportion for parameter matrix, if assumed rank of
         """
         ndim = len(p_shape)
         P_proj = [None] * ndim
+        denom_rel_eps = group["denom_rel_eps"]
+        eps = group["eps"]
+
         for dim in range(1, ndim):
             size = p_shape[dim]
             try:
@@ -899,6 +907,11 @@ param_rms_smooth1: Smoothing proportion for parameter matrix, if assumed rank of
             # dimensions with zero grad separate from those with nonzero grad.
             G_prime = _diag(torch.matmul(U_g.transpose(2,3), torch.matmul(grad_cov, U_g)))
 
+            # Use the form of the diagonalized gradient matrix that we get after
+            # we add the Adam-type smoothing with epsilon.
+            G_prime += (_mean(G_prime, exclude_dims=[0], keepdim=True) * (denom_rel_eps * denom_rel_eps)
+                        + (eps * eps))
+
             # P_prime is P' above, which represents param_cov in the basis that diagonalizes G_prime.
             # It is not smoothed yet.
             P_prime = torch.matmul(U_g.transpose(2, 3), torch.matmul(param_cov, U_g))
@@ -957,12 +970,18 @@ param_rms_smooth1: Smoothing proportion for parameter matrix, if assumed rank of
 
                 _check_similar(G_prime, G_prime_check, "G_prime")
 
+            Z_prime_inv_diag = _diag(Z_prime_inv)  # aliased with Z_prime_inv
+            # this is smoothing Z relative to its own diagonal. This is z_inv,
+            # so by applying a minimum here, we are applying a maximum of the
+            # eigs of Z after normalizing so the diagonal is 1.
+            Z_prime_inv_diag *= (1. + 1. / group["cov_max"][4])
 
             # We really want the SVD on Z, which will be used for the learning-rate matrix
             # Q, but Z_prime is better, numerically, to work on because it's closer to
             # being diagonalized.
             U_z_prime, S_z_prime_inv, _ = _svd(Z_prime_inv)
+
             U_z = torch.matmul(U_g, U_z_prime)
 
             # We could obtain Z in two possible ways.
             # Commenting the check below.
@@ -2203,7 +2222,7 @@ def _test_eve_cain():
         fix_random_seed(42)
         Linear = torch.nn.Linear if iter == 0 else ScaledLinear
 
-        hidden_dim = 400
+        hidden_dim = 768
         m = torch.nn.Sequential(Linear(E, hidden_dim),
                                 torch.nn.PReLU(),
                                 Linear(hidden_dim, hidden_dim),

From 17bc002e6e1439a7ad01e6828cfbc7d780418bca Mon Sep 17 00:00:00 2001
From: Daniel Povey
Date: Sat, 30 Jul 2022 07:45:29 +0800
Subject: [PATCH 2/2] Refactoring that does not affect results.

---
 .../ASR/pruned_transducer_stateless7/optim.py | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7/optim.py b/egs/librispeech/ASR/pruned_transducer_stateless7/optim.py
index 5ebe88b4d..4146ec4b4 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless7/optim.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7/optim.py
@@ -124,10 +124,9 @@ param_rms_smooth1: Smoothing proportion for parameter matrix, if assumed rank of
 
                      (4) is for smoothing the grad covariance used for (2)
 
-                 (5) is for smoothing the final learning-rate matrix Z relative to
-                     its own diagonal. Only the cov_max is actually used.
-                     The cov_min ends up not affecting the result, so we set it
-                     to 0.0.
+                 (5) is for smoothing the inverse Z^{-1} of the final learning-rate matrix Z
+                     relative to its own diagonal. Only cov_min[4] is actually used; we
+                     ignore cov_max[4].
     cov_pow:         This was mainly added for development and experimentation purposes;
                      it allows you to smooth the parameter covariance matrices at the
                      stages (1), (2), (3) of smoothing mentioned above, and also
@@ -167,8 +166,8 @@ param_rms_smooth1: Smoothing proportion for parameter matrix, if assumed rank of
             lr=3e-02,
             betas=(0.9, 0.98),
             size_lr_scale=0.1,
-            cov_min=(0.025, 0.0025, 0.02, 0.0001, 0.0),
-            cov_max=(10.0, 80.0, 5.0, 400.0, 10.0),
+            cov_min=(0.025, 0.0025, 0.02, 0.0001, 0.1),
+            cov_max=(10.0, 80.0, 5.0, 400.0, 100.0),
             cov_pow=(1.0, 1.0, 1.0, 1.0),
             param_rms_smooth0=0.4,
             param_rms_smooth1=0.2,
@@ -974,7 +973,7 @@ param_rms_smooth1: Smoothing proportion for parameter matrix, if assumed rank of
             # this is smoothing Z relative to its own diagonal. This is z_inv,
             # so by applying a minimum here, we are applying a maximum of the
             # eigs of Z after normalizing so the diagonal is 1.
-            Z_prime_inv_diag *= (1. + 1. / group["cov_max"][4])
+            Z_prime_inv_diag *= (1. + group["cov_min"][4])
 
             # We really want the SVD on Z, which will be used for the learning-rate matrix
             # Q, but Z_prime is better, numerically, to work on because it's closer to
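Note on the stage-(5) step that both patches touch: the sketch below is a standalone illustration, not code from optim.py. The helper name smooth_z_inv, the single square (unbatched) Z^{-1} matrix, and the use of Tensor.diagonal() in place of the repository's _diag() helper are assumptions made for the example; it only mirrors the in-place diagonal scaling performed by Z_prime_inv_diag *= (...), and the comments show why the patch-2 defaults keep the scale factor at 1.1, matching patch 1.

# Standalone sketch of the stage-(5) smoothing (illustration only, not code from
# optim.py). Assumptions: a single square Z^{-1} matrix rather than the batched
# matrices the optimizer really works on, Tensor.diagonal() in place of the
# repository's _diag() helper, and an invented function name `smooth_z_inv`.
import torch


def smooth_z_inv(Z_inv: torch.Tensor, cov_min_4: float) -> torch.Tensor:
    """Return a copy of Z^{-1} whose diagonal is scaled by (1 + cov_min_4).

    Boosting the diagonal of Z^{-1} puts a floor on its eigenvalues relative to
    its own diagonal, i.e. it caps the eigenvalues of Z after normalizing Z so
    that its diagonal is 1 (the intent stated in the comments in the patch).
    """
    Z_inv = Z_inv.clone()
    diag = Z_inv.diagonal(dim1=-2, dim2=-1)  # a view aliased with Z_inv, like _diag()
    diag *= (1.0 + cov_min_4)  # mirrors: Z_prime_inv_diag *= (1. + group["cov_min"][4])
    return Z_inv


if __name__ == "__main__":
    Z_inv = torch.eye(4) + 0.01 * torch.randn(4, 4)
    # With the defaults in these patches the scale factor is unchanged:
    #   patch 1:  1 + 1 / cov_max[4] = 1 + 1 / 10.0 = 1.1
    #   patch 2:  1 + cov_min[4]     = 1 + 0.1      = 1.1
    print(smooth_z_inv(Z_inv, cov_min_4=0.1))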