Changes that should not really affect the results, just cleanup.

Commit 790e8c4ba9 (parent 5184ac570d)
Author: Daniel Povey
Date: 2022-07-30 19:20:36 -07:00


@@ -737,22 +737,9 @@ param_rms_smooth1: Smoothing proportion for parameter matrix, if assumed rank of
         if the parameter covariance is just the gradient covariance to some power, this
         function does no smoothing; but if it is highly off-diagonal we do more smoothing.
         """
-        P_prime_diag = _diag(P_prime)  # (batch_size, num_blocks, block_size)
-        eps = 1.0e-10
-        P_prime_diag = (P_prime_diag + eps) / P_prime_diag.mean()
-        # make sure no diagonal element is close to zero.. we don't expect this
-        # would happen. this is likely not important. Note, this just used for
-        # normalizing P prior to smoothing.
-        P_prime_diag.clamp_(min=0.01)
-        P_prime_rms = P_prime_diag.sqrt()
-        P_prime_scale = P_prime_rms.unsqueeze(-1) * P_prime_rms.unsqueeze(-2)
-        # P_norm will have diagonal elements close to 1. We do some smoothing
-        # in this space.
-        P_norm = P_prime / P_prime_scale
-        # Now P is as normalized as we can make it... do smoothing baserd on 'rank',
+        # do smoothing based on 'rank',
         # that is intended to compensate for bad estimates of P.
-        (batch_size, num_blocks, block_size, block_size) = P_norm.shape
+        (batch_size, num_blocks, block_size, block_size) = P_prime.shape
         # `rank_per_block` is the rank of each block of P_prime if we were to estimate it from just one
         # parameter tensor. We average it over time, but actually it won't be changing
         # too much, so `rank` does tell us something.
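
For context, the block deleted in this hunk rescaled P_prime so each covariance block had a diagonal close to 1 (producing P_norm) before any smoothing was applied. Below is a minimal, self-contained sketch of that normalization; the name normalize_unit_diag is hypothetical, and torch.Tensor.diagonal stands in for the module's _diag() helper:

```python
import torch

def normalize_unit_diag(P_prime: torch.Tensor, eps: float = 1.0e-10):
    """Rescale covariance blocks so each has a diagonal close to 1.

    P_prime has shape (..., block_size, block_size); returns (P_norm, scale),
    where multiplying P_norm by scale undoes the normalization."""
    diag = P_prime.diagonal(dim1=-2, dim2=-1)  # (..., block_size)
    diag = (diag + eps) / diag.mean()
    # Make sure no diagonal element is close to zero; this is only used
    # to normalize P prior to smoothing, so the clamp is harmless.
    diag = diag.clamp(min=0.01)
    rms = diag.sqrt()
    # Outer product of per-dimension RMS values; dividing by it makes
    # the diagonal elements of P_norm close to 1.
    scale = rms.unsqueeze(-1) * rms.unsqueeze(-2)
    return P_prime / scale, scale

```

The commit drops this normalize/denormalize round trip and instead smooths P_prime directly, scaling the added diagonal term by the mean diagonal value, as the second hunk below shows.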
@@ -771,21 +758,11 @@ param_rms_smooth1: Smoothing proportion for parameter matrix, if assumed rank of
         smooth = smooth0 * block_size / ((smooth0/smooth1 - 1) * rank + block_size)
         if True:
             logging.info(f"block size={block_size}, rank={rank}, smooth={smooth}")
         # add rank-dependent smoothing amount to diagonal of P_prime. _diag() returns an aliased tensor.
         # we don't need to multiply `smooth` by anything, because at this point, P_prime should have
         # diagonal elements close to 1.
-        #_diag(P_norm).add_(smooth)
-        #P_norm = self._smooth_cov(P_norm,
-        #                          group["cov_min"][0],
-        #                          group["cov_max"][0],
-        #                          group["cov_pow"][0])
-        # Remove the diagonal preconditioning on P_norm, giving us stage-1-smoothed
-        # version of P_prime.
-        P_prime = P_norm * P_prime_scale
         P_prime_diag = _diag(P_prime)
         P_prime_diag_mean = _mean(P_prime_diag, exclude_dims=[0], keepdim=True)
         P_prime_diag += smooth * P_prime_diag_mean
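
The retained smoothing path is short enough to sketch standalone. Note the formula's limiting behavior: rank = 0 gives smooth = smooth0, and rank = block_size gives smooth = smooth1, so the smoothing amount interpolates between the two as the covariance estimate improves. A hedged sketch follows; the function name and the smooth0/smooth1 defaults are illustrative (the optimizer takes them from its param group), and torch.Tensor.diagonal / Tensor.mean stand in for the module's _diag() and _mean() helpers:

```python
import torch

def rank_smoothed(P_prime: torch.Tensor, rank: float,
                  smooth0: float = 0.4, smooth1: float = 0.2) -> torch.Tensor:
    """Add rank-dependent smoothing to the diagonal of P_prime, in place.

    P_prime has shape (batch_size, num_blocks, block_size, block_size)."""
    block_size = P_prime.shape[-1]
    # Interpolates from smooth0 (rank == 0) down to smooth1 (rank == block_size).
    smooth = smooth0 * block_size / ((smooth0 / smooth1 - 1) * rank + block_size)
    # torch.diagonal returns a view, so the in-place add below writes through
    # to P_prime, mirroring the aliased tensor returned by _diag().
    diag = P_prime.diagonal(dim1=-2, dim2=-1)  # (batch_size, num_blocks, block_size)
    # Mean over all dims except batch, kept for broadcasting; plays the role
    # of _mean(P_prime_diag, exclude_dims=[0], keepdim=True).
    diag_mean = diag.mean(dim=(1, 2), keepdim=True)
    diag += smooth * diag_mean
    return P_prime

# Example: smooth a batch of identity covariance blocks.
P = torch.eye(4).repeat(2, 3, 1, 1)  # (batch_size=2, num_blocks=3, 4, 4)
P = rank_smoothed(P, rank=2.0)
```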