mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-09-19 05:54:20 +00:00
Small numerical improvements: config change of eps; G_diag scaling factor changed from 1.01 to 1.005; an eps decreased from 1e-10 to 1e-20
This commit is contained in:
parent
cb67540cdc
commit
0666789cb8
@ -124,9 +124,6 @@ param_rms_smooth1: Smoothing proportion for parameter matrix, if assumed rank of
|
||||
|
||||
(4) is for smoothing the grad covariance used for (2)
|
||||
|
||||
(5) is for smoothing the inverse Z^{-1} of the final learning-rate matrix Z relative to
|
||||
its own diagonal. Only cov_min[4] is actually used; we ignore
|
||||
cov_max[4]
|
||||
cov_pow: This was mainly added for development and experimentation purposes;
|
||||
it allows you to smooth the parameter covariance matrices at the
|
||||
stages (1), (2), (3) of smoothing mentioned above, and also
|
||||
@ -166,8 +163,8 @@ param_rms_smooth1: Smoothing proportion for parameter matrix, if assumed rank of
|
||||
lr=3e-02,
|
||||
betas=(0.9, 0.98),
|
||||
size_lr_scale=0.1,
|
||||
cov_min=(0.025, 0.0025, 0.02, 0.0001, 0.05),
|
||||
cov_max=(10.0, 80.0, 5.0, 400.0, 100.0),
|
||||
cov_min=(0.025, 0.0025, 0.02, 0.0001),
|
||||
cov_max=(10.0, 80.0, 5.0, 400.0),
|
||||
cov_pow=(1.0, 1.0, 1.0, 1.0),
|
||||
param_rms_smooth0=0.4,
|
||||
param_rms_smooth1=0.2,
|
||||
@ -720,10 +717,9 @@ param_rms_smooth1: Smoothing proportion for parameter matrix, if assumed rank of
|
||||
p_shape: The shape of the parameter we are optimizing
|
||||
P: a Tensor of shape (batch_size, num_blocks, block_size, block_size),
|
||||
containing the parameter covariance
|
||||
G: the gradient covariance, of shape (batch_size, num_blocks,
|
||||
G: the gradient covariance, of shape (batch_size, num_blocks,
|
||||
block_size, block_size)
|
||||
|
||||
|
||||
state[f"param_cov_{dim}"], which is an estimate of the covariance of the parameter
|
||||
p, averaged over time, and taken over dimension `dim` of the tensor.
|
||||
|
||||
@ -765,7 +761,7 @@ param_rms_smooth1: Smoothing proportion for parameter matrix, if assumed rank of
|
||||
|
||||
G = G.clone()
|
||||
G_diag = _diag(G) # aliased
|
||||
G_diag *= 1.01 # improve its condition, for numerical reasons.
|
||||
G_diag *= 1.005 # improve its condition, for numerical reasons.
|
||||
G = self._smooth_cov(G,
|
||||
group["cov_min"][3],
|
||||
group["cov_max"][3],
|
||||
@ -774,6 +770,8 @@ param_rms_smooth1: Smoothing proportion for parameter matrix, if assumed rank of
|
||||
# C C^T == G.
|
||||
C = G.cholesky()
|
||||
|
||||
P_orig = P.clone()
|
||||
|
||||
# treat the last dim of C as being in an arbitrary space, its next-to-last dim
|
||||
# is the "canonical" one that we need to sum with the dims of P.
|
||||
P_gnorm = torch.matmul(C.transpose(2, 3),
|
||||
@ -841,7 +839,7 @@ param_rms_smooth1: Smoothing proportion for parameter matrix, if assumed rank of
|
||||
X: the batch of symmetric positive definite tensors we are smoothing;
|
||||
of shape (batch_size, num_blocks, block_size, block_size)
|
||||
"""
|
||||
eps = 1.0e-10
|
||||
eps = 1.0e-20
|
||||
if power != 1.0:
|
||||
U, S, _ = _svd(X)
|
||||
S_mean = _mean(S, exclude_dims=[0], keepdim=True)
|
||||
@ -866,6 +864,7 @@ param_rms_smooth1: Smoothing proportion for parameter matrix, if assumed rank of
|
||||
_diag(X_inv).add_(1. / (max_eig * cur_diag_mean))
|
||||
X = X_inv.inverse()
|
||||
X /= _mean(_diag(X), exclude_dims=[0], keepdim=True).unsqueeze(-1)
|
||||
X = 0.5 * (X + X.transpose(-2, -1)) # make sure exactly symmetric.
|
||||
return X
|
||||
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user