mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-09 01:52:41 +00:00
Change initialization to 0.25
This commit is contained in:
parent
d1f2f93460
commit
179d0605ea
@ -140,7 +140,8 @@ class Eve(Optimizer):
|
|||||||
|
|
||||||
# Suppose we are going to shrink the parameters by a small value epsilon (not the
|
# Suppose we are going to shrink the parameters by a small value epsilon (not the
|
||||||
# same as the eps above!), i.e. param *= (1-epsilon). Then
|
# same as the eps above!), i.e. param *= (1-epsilon). Then
|
||||||
# if E[param_elem^2] == target_rms^2,
|
# if E[param_elem^2] == target_rms^2 (because we desire equilibrium when
|
||||||
|
# the RMS of the parameters equals target_rms), it follows that
|
||||||
# E[(param_elem*(1-epsilon))^2] == target_rms^2 (1 - 2epsilon + epsilon^2),
|
# E[(param_elem*(1-epsilon))^2] == target_rms^2 (1 - 2epsilon + epsilon^2),
|
||||||
# which we can put as:
|
# which we can put as:
|
||||||
# delta_var_from_shrinkage \simeq -2 epsilon target_rms^2.
|
# delta_var_from_shrinkage \simeq -2 epsilon target_rms^2.
|
||||||
|
@ -161,7 +161,7 @@ class ScaledLinear(nn.Linear):
|
|||||||
# we plan to use Eve as the optimizer, which will eventually make the stddev approach
|
# we plan to use Eve as the optimizer, which will eventually make the stddev approach
|
||||||
# 0.1 as that's the target_rms we set, but we initialize with a larger stddev
|
# 0.1 as that's the target_rms we set, but we initialize with a larger stddev
|
||||||
# to have the same effect as a warm-up period.
|
# to have the same effect as a warm-up period.
|
||||||
std = 0.5 / initial_speed
|
std = 0.25 / initial_speed
|
||||||
a = (3 ** 0.5) * std
|
a = (3 ** 0.5) * std
|
||||||
nn.init.uniform_(self.weight, -a, a)
|
nn.init.uniform_(self.weight, -a, a)
|
||||||
if self.bias is not None:
|
if self.bias is not None:
|
||||||
@ -199,7 +199,7 @@ class ScaledConv1d(nn.Conv1d):
|
|||||||
self._reset_parameters(initial_speed) # Overrides the reset_parameters in base class
|
self._reset_parameters(initial_speed) # Overrides the reset_parameters in base class
|
||||||
|
|
||||||
def _reset_parameters(self, initial_speed: float):
|
def _reset_parameters(self, initial_speed: float):
|
||||||
std = 0.5 / initial_speed
|
std = 0.25 / initial_speed
|
||||||
a = (3 ** 0.5) * std
|
a = (3 ** 0.5) * std
|
||||||
nn.init.uniform_(self.weight, -a, a)
|
nn.init.uniform_(self.weight, -a, a)
|
||||||
if self.bias is not None:
|
if self.bias is not None:
|
||||||
@ -244,7 +244,7 @@ class ScaledConv2d(nn.Conv2d):
|
|||||||
self._reset_parameters(initial_speed) # Overrides the reset_parameters in base class
|
self._reset_parameters(initial_speed) # Overrides the reset_parameters in base class
|
||||||
|
|
||||||
def _reset_parameters(self, initial_speed: float):
|
def _reset_parameters(self, initial_speed: float):
|
||||||
std = 0.5 / initial_speed
|
std = 0.25 / initial_speed
|
||||||
a = (3 ** 0.5) * std
|
a = (3 ** 0.5) * std
|
||||||
nn.init.uniform_(self.weight, -a, a)
|
nn.init.uniform_(self.weight, -a, a)
|
||||||
if self.bias is not None:
|
if self.bias is not None:
|
||||||
@ -480,7 +480,7 @@ class ScaledEmbedding(nn.Module):
|
|||||||
|
|
||||||
|
|
||||||
def reset_parameters(self, initial_speed: float = 1.0) -> None:
|
def reset_parameters(self, initial_speed: float = 1.0) -> None:
|
||||||
std = 0.5 / initial_speed
|
std = 0.25 / initial_speed
|
||||||
nn.init.normal_(self.weight, std=std)
|
nn.init.normal_(self.weight, std=std)
|
||||||
nn.init.constant_(self.scale, torch.tensor(1.0/std).log())
|
nn.init.constant_(self.scale, torch.tensor(1.0/std).log())
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user