mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-10 18:42:19 +00:00
Remove initial_speed
This commit is contained in:
parent
a41e93437c
commit
61486a0f76
@ -956,30 +956,22 @@ class Conv2dSubsampling(nn.Module):
|
|||||||
assert in_channels >= 7
|
assert in_channels >= 7
|
||||||
super().__init__()
|
super().__init__()
|
||||||
|
|
||||||
# This initial_speed is to slightly slow down the relative speed of
|
|
||||||
# training during the warmup phase by increasing the magnitude of the
|
|
||||||
# initial parameter values. The intention is to allow us to
|
|
||||||
# use a higher lr_factor.
|
|
||||||
initial_speed = 0.5
|
|
||||||
self.conv = nn.Sequential(
|
self.conv = nn.Sequential(
|
||||||
ScaledConv2d(
|
ScaledConv2d(
|
||||||
in_channels=1, out_channels=layer1_channels,
|
in_channels=1, out_channels=layer1_channels,
|
||||||
kernel_size=3, padding=1,
|
kernel_size=3, padding=1,
|
||||||
initial_speed=initial_speed,
|
|
||||||
),
|
),
|
||||||
ActivationBalancer(channel_dim=1),
|
ActivationBalancer(channel_dim=1),
|
||||||
DoubleSwish(),
|
DoubleSwish(),
|
||||||
ScaledConv2d(
|
ScaledConv2d(
|
||||||
in_channels=layer1_channels, out_channels=layer2_channels,
|
in_channels=layer1_channels, out_channels=layer2_channels,
|
||||||
kernel_size=3, stride=2,
|
kernel_size=3, stride=2,
|
||||||
initial_speed=initial_speed,
|
|
||||||
),
|
),
|
||||||
ActivationBalancer(channel_dim=1),
|
ActivationBalancer(channel_dim=1),
|
||||||
DoubleSwish(),
|
DoubleSwish(),
|
||||||
ScaledConv2d(
|
ScaledConv2d(
|
||||||
in_channels=layer2_channels, out_channels=layer3_channels,
|
in_channels=layer2_channels, out_channels=layer3_channels,
|
||||||
kernel_size=3, stride=2,
|
kernel_size=3, stride=2,
|
||||||
initial_speed=initial_speed,
|
|
||||||
),
|
),
|
||||||
ActivationBalancer(channel_dim=1),
|
ActivationBalancer(channel_dim=1),
|
||||||
DoubleSwish(),
|
DoubleSwish(),
|
||||||
|
@ -56,16 +56,10 @@ class Decoder(nn.Module):
|
|||||||
"""
|
"""
|
||||||
super().__init__()
|
super().__init__()
|
||||||
|
|
||||||
# This initial_speed is to slightly slow down the relative speed of
|
|
||||||
# training during the warmup phase by increasing the magnitude of the
|
|
||||||
# initial parameter values. The intention is to allow us to
|
|
||||||
# use a higher lr_factor.
|
|
||||||
initial_speed = 0.5
|
|
||||||
self.embedding = ScaledEmbedding(
|
self.embedding = ScaledEmbedding(
|
||||||
num_embeddings=vocab_size,
|
num_embeddings=vocab_size,
|
||||||
embedding_dim=decoder_dim,
|
embedding_dim=decoder_dim,
|
||||||
padding_idx=blank_id,
|
padding_idx=blank_id,
|
||||||
initial_speed=initial_speed
|
|
||||||
)
|
)
|
||||||
self.blank_id = blank_id
|
self.blank_id = blank_id
|
||||||
|
|
||||||
|
@ -27,9 +27,6 @@ class Joiner(nn.Module):
|
|||||||
vocab_size: int):
|
vocab_size: int):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
|
|
||||||
# We don't bother giving the 'initial_speed' arg to the decoder
|
|
||||||
# submodules, because it does not affect the initial convergence of the
|
|
||||||
# system (only the simple joiner is involved in that).
|
|
||||||
self.encoder_proj = ScaledLinear(encoder_dim, joiner_dim)
|
self.encoder_proj = ScaledLinear(encoder_dim, joiner_dim)
|
||||||
self.decoder_proj = ScaledLinear(decoder_dim, joiner_dim)
|
self.decoder_proj = ScaledLinear(decoder_dim, joiner_dim)
|
||||||
self.output_linear = ScaledLinear(joiner_dim, vocab_size)
|
self.output_linear = ScaledLinear(joiner_dim, vocab_size)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user