mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-09-19 05:54:20 +00:00
Remove some print statements; convert others to logging.info calls
This commit is contained in:
parent
7f756b2910
commit
09282ca28c
@ -418,7 +418,7 @@ class NeutralGradient(Optimizer):
|
|||||||
cur_grad_rms_corrected = cur_grad_rms * ((exp_avg_sq/bias_correction2).mean().sqrt() /
|
cur_grad_rms_corrected = cur_grad_rms * ((exp_avg_sq/bias_correction2).mean().sqrt() /
|
||||||
(grad**2).mean().sqrt())
|
(grad**2).mean().sqrt())
|
||||||
param_rms = (p**2).mean().sqrt()
|
param_rms = (p**2).mean().sqrt()
|
||||||
print(f"cur_grad_rms={cur_grad_rms.item():.3e}, corrected_grad_rms={cur_grad_rms_corrected.item():.3e}, param_rms={param_rms.item():.3e}")
|
logging.info(f"cur_grad_rms={cur_grad_rms.item():.3e}, corrected_grad_rms={cur_grad_rms_corrected.item():.3e}, param_rms={param_rms.item():.3e}")
|
||||||
|
|
||||||
if random.random() < 0.0005:
|
if random.random() < 0.0005:
|
||||||
# check the cosine angle between cur_grad and grad, to see how different this update
|
# check the cosine angle between cur_grad and grad, to see how different this update
|
||||||
@ -426,7 +426,7 @@ class NeutralGradient(Optimizer):
|
|||||||
prod = (grad*cur_grad).mean()
|
prod = (grad*cur_grad).mean()
|
||||||
cos_angle = prod / ((grad**2).mean() * (cur_grad**2).mean()).sqrt()
|
cos_angle = prod / ((grad**2).mean() * (cur_grad**2).mean()).sqrt()
|
||||||
if random.random() < 0.04 or cos_angle < 0.01:
|
if random.random() < 0.04 or cos_angle < 0.01:
|
||||||
print(f"cos_angle = {cos_angle}, shape={grad.shape}")
|
logging.info(f"cos_angle = {cos_angle}, shape={grad.shape}")
|
||||||
|
|
||||||
alpha = -lr * (1-beta1)
|
alpha = -lr * (1-beta1)
|
||||||
if param_pow != 1.0 or grad_pow != 1.0:
|
if param_pow != 1.0 or grad_pow != 1.0:
|
||||||
@ -450,7 +450,7 @@ class NeutralGradient(Optimizer):
|
|||||||
delta.add_(this_delta, alpha=alpha)
|
delta.add_(this_delta, alpha=alpha)
|
||||||
|
|
||||||
if random.random() < 0.0001:
|
if random.random() < 0.0001:
|
||||||
print(f"Delta rms = {(delta**2).mean().item()}, shape = {delta.shape}")
|
logging.info(f"Delta rms = {(delta**2).mean().item()}, shape = {delta.shape}")
|
||||||
p.add_(delta)
|
p.add_(delta)
|
||||||
|
|
||||||
state["step"] += 1
|
state["step"] += 1
|
||||||
@ -622,7 +622,6 @@ class NeutralGradient(Optimizer):
|
|||||||
this_scale = (this_var ** (param_pow * 0.5)).reshape(size)
|
this_scale = (this_var ** (param_pow * 0.5)).reshape(size)
|
||||||
proj = state[f"proj_{dim}"]
|
proj = state[f"proj_{dim}"]
|
||||||
|
|
||||||
#print(f"iter={_}, dim={dim}, this_scale = {this_scale}")
|
|
||||||
if proj.ndim == 1:
|
if proj.ndim == 1:
|
||||||
proj *= this_scale
|
proj *= this_scale
|
||||||
else:
|
else:
|
||||||
@ -633,7 +632,6 @@ class NeutralGradient(Optimizer):
|
|||||||
if param_pow != 1.0:
|
if param_pow != 1.0:
|
||||||
# need to get the overall scale correct, as if we had param_pow == 1.0
|
# need to get the overall scale correct, as if we had param_pow == 1.0
|
||||||
scale = (params_sq_partnorm.mean() ** 0.5)
|
scale = (params_sq_partnorm.mean() ** 0.5)
|
||||||
print("scale = ", scale)
|
|
||||||
for dim in range(p.ndim):
|
for dim in range(p.ndim):
|
||||||
size = p.shape[dim]
|
size = p.shape[dim]
|
||||||
if size == 1:
|
if size == 1:
|
||||||
@ -1177,8 +1175,6 @@ class Cain(Optimizer):
|
|||||||
this_delta = grad / denom
|
this_delta = grad / denom
|
||||||
alpha = -lr*(1-beta1)*(bias_correction2 ** 0.5)
|
alpha = -lr*(1-beta1)*(bias_correction2 ** 0.5)
|
||||||
delta.add_(this_delta, alpha=alpha)
|
delta.add_(this_delta, alpha=alpha)
|
||||||
if random.random() < 0.0001:
|
|
||||||
print(f"Delta rms = {(delta**2).mean().item()}, shape = {delta.shape}")
|
|
||||||
|
|
||||||
p.add_(delta)
|
p.add_(delta)
|
||||||
if step % 10 == 0:
|
if step % 10 == 0:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user