From c0e652ea87f93fcaf70631cdbe9e56a0d105527f Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sat, 30 Jul 2022 09:01:11 +0800 Subject: [PATCH 1/2] Fix code, was smoothing in wrong basis. --- .../ASR/pruned_transducer_stateless7/optim.py | 22 +++++++------------ 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7/optim.py b/egs/librispeech/ASR/pruned_transducer_stateless7/optim.py index 4146ec4b4..83ce90a92 100644 --- a/egs/librispeech/ASR/pruned_transducer_stateless7/optim.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless7/optim.py @@ -166,7 +166,7 @@ param_rms_smooth1: Smoothing proportion for parameter matrix, if assumed rank of lr=3e-02, betas=(0.9, 0.98), size_lr_scale=0.1, - cov_min=(0.025, 0.0025, 0.02, 0.0001, 0.1), + cov_min=(0.025, 0.0025, 0.02, 0.0001, 0.025), cov_max=(10.0, 80.0, 5.0, 400.0, 100.0), cov_pow=(1.0, 1.0, 1.0, 1.0), param_rms_smooth0=0.4, @@ -969,30 +969,24 @@ param_rms_smooth1: Smoothing proportion for parameter matrix, if assumed rank of _check_similar(G_prime, G_prime_check, "G_prime") - Z_prime_inv_diag = _diag(Z_prime_inv) # aliased with Z_prime_inv + Z_inv = torch.matmul(U_g, torch.matmul(Z_prime_inv, U_g.transpose(2, 3))) + Z_inv = 0.5 * (Z_inv + Z_inv.transpose(2, 3)) # make sure exactly symmetric + + Z_inv_diag = _diag(Z_inv) # aliased with Z_inv # this is smoothing Z relative to its own diagonal. This is z_inv, # so by applying a minimum here, we are applying a maximum of the # eigs of Z after normalizing so the diagonal is 1. - Z_prime_inv_diag *= (1. + group["cov_min"][4]) + Z_inv_diag *= (1. + group["cov_min"][4]) # We really want the SVD on Z, which will be used for the learning-rate matrix # Q, but Z_prime is better, numerically, to work on because it's closer to # being diagonalized. - U_z_prime, S_z_prime_inv, _ = _svd(Z_prime_inv) + U_z, S_z_inv, _ = _svd(Z_inv) - U_z = torch.matmul(U_g, U_z_prime) - # We could obtain Z in two possible ways. - # Commenting the check below. - # Z_a = torch.matmul(U_z * S.unsqueeze(-2), U_z.transpose(2, 3)) - # Z_b = torch.matmul(U_g, torch.matmul(Z_prime, U_g.transpose(2, 3))) - # _check_similar(Z_a, Z_b, "Z") - ## OK, Z is the SPD transform that maps G to P, as in Z G Z = P. - ## We just need the basis U_z that diagonalizes this. - ## U_z, S, _ = _svd(Z) if True: skip = 10 if S.shape[-1] > 40 else 1 - logging.info(f"dim={dim}, G_prime is {G_prime[0,0,::skip]}, Eigs of Z_inv are: {S_z_prime_inv[0,0,::skip]}") + logging.info(f"dim={dim}, G_prime is {G_prime[0,0,::skip]}, Eigs of Z_inv are: {S_z_inv[0,0,::skip]}") # state[f"Q_{dim}"] is indexed: [batch_idx, block_idx, diagonalized_coordinate, canonical_coordinate]. # so we need to transpose U_z as U_z is indexed From 4d5323d6a3f36e6bcc35f99ca40ffc05bcc7cf40 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sat, 30 Jul 2022 09:06:23 +0800 Subject: [PATCH 2/2] Change cov_min to intermediate value, not seeing clear diffs from changing it --- egs/librispeech/ASR/pruned_transducer_stateless7/optim.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7/optim.py b/egs/librispeech/ASR/pruned_transducer_stateless7/optim.py index 83ce90a92..c89a2bd3e 100644 --- a/egs/librispeech/ASR/pruned_transducer_stateless7/optim.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless7/optim.py @@ -166,7 +166,7 @@ param_rms_smooth1: Smoothing proportion for parameter matrix, if assumed rank of lr=3e-02, betas=(0.9, 0.98), size_lr_scale=0.1, - cov_min=(0.025, 0.0025, 0.02, 0.0001, 0.025), + cov_min=(0.025, 0.0025, 0.02, 0.0001, 0.05), cov_max=(10.0, 80.0, 5.0, 400.0, 100.0), cov_pow=(1.0, 1.0, 1.0, 1.0), param_rms_smooth0=0.4, @@ -2215,7 +2215,7 @@ def _test_eve_cain(): fix_random_seed(42) Linear = torch.nn.Linear if iter == 0 else ScaledLinear - hidden_dim = 768 + hidden_dim = 200 m = torch.nn.Sequential(Linear(E, hidden_dim), torch.nn.PReLU(), Linear(hidden_dim, hidden_dim),