Increase initial-lr from 0.04 to 0.05, plus changes for diagnostics

This commit is contained in:
Daniel Povey 2022-10-18 11:45:24 +08:00
parent 2675944f01
commit b988bc0e33
2 changed files with 8 additions and 2 deletions

View File

@ -863,6 +863,10 @@ class RelPositionMultiheadAttention(nn.Module):
self.linear_pos = ScaledLinear(embed_dim, attention_dim // 2, bias=False, self.linear_pos = ScaledLinear(embed_dim, attention_dim // 2, bias=False,
initial_scale=0.05) initial_scale=0.05)
# the following are for diagnostics only, see --print-diagnostics option
self.copy_pos_query = nn.Identity()
self.copy_query = nn.Identity()
self.in_balancer = ActivationBalancer(3 * attention_dim, self.in_balancer = ActivationBalancer(3 * attention_dim,
channel_dim=-1, max_abs=5.0) channel_dim=-1, max_abs=5.0)
self.out_proj = ScaledLinear( self.out_proj = ScaledLinear(
@ -1008,9 +1012,11 @@ class RelPositionMultiheadAttention(nn.Module):
q, k, pv = x.chunk(3, dim=-1) q, k, pv = x.chunk(3, dim=-1)
p, v = pv.chunk(2, dim=-1) p, v = pv.chunk(2, dim=-1)
k = self.whiten_keys(k) # does nothing in the forward pass. k = self.whiten_keys(k) # does nothing in the forward pass.
v = self.whiten_values(v) # does nothing in the forward pass. v = self.whiten_values(v) # does nothing in the forward pass.
q = self.copy_query(q) # for diagnostics only, does nothing.
p = self.copy_pos_query(p) # for diagnostics only, does nothing.
if attn_mask is not None: if attn_mask is not None:
assert ( assert (

View File

@ -230,7 +230,7 @@ def get_parser():
parser.add_argument( parser.add_argument(
"--initial-lr", "--initial-lr",
type=float, type=float,
default=0.04, default=0.05,
help="The initial learning rate. This value should not need " help="The initial learning rate. This value should not need "
"to be changed.", "to be changed.",
) )