Implement higher layerdrop for central stacks

Daniel Povey 2023-01-08 13:16:10 +08:00
parent ef48019d6e
commit 117db124d0


@@ -175,6 +175,13 @@ class Zipformer(EncoderInterface):
         num_encoders = len(downsampling_factor)
         for i in range(num_encoders):
+            # give more-downsampled layers a higher final layer_skip_rate, to discourage
+            # the model from relying too much on them at the end.  (They tend to train
+            # slowly, so don't increase their skip-rate at the beginning.)
+            layer_skip_rate = (ScheduledFloat((0.0, 0.5), (4000.0, 0.0), default=0,) +
+                               (0.025 * (downsampling_factor[i] ** 0.5)))
             encoder_layer = ZipformerEncoderLayer(
                 embed_dim=encoder_dim[i],
                 pos_dim=pos_dim,
@@ -185,6 +192,7 @@ class Zipformer(EncoderInterface):
                 feedforward_dim=feedforward_dim[i],
                 dropout=dropout,
                 cnn_module_kernel=cnn_module_kernel[i],
+                layer_skip_rate=layer_skip_rate,
             )
             # For the segment of the warmup period, we let the Conv2dSubsampling
@@ -209,7 +217,8 @@ class Zipformer(EncoderInterface):
             )
             # we are adding a new attribute here.
             # this will be interpreted by get_named_parameter_groups_with_lrs().
-            encoder.lr_scale = downsampling_factor[i] ** -0.2
+            encoder.lr_scale = downsampling_factor[i] ** -0.25
             encoders.append(encoder)
         self.encoders = nn.ModuleList(encoders)
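
For intuition, here is a minimal sketch of the schedule computed in the first hunk. `schedule_value` is a hypothetical stand-in for icefall's `ScheduledFloat` (a piecewise-linear function of the global batch count): the warmup term starts at 0.5 and decays to 0 by batch 4000, while the `0.025 * sqrt(downsampling_factor)` floor persists, so more-downsampled stacks keep a higher final skip rate.

# Sketch only: schedule_value is a simplified stand-in for icefall's ScheduledFloat.
def schedule_value(batch_count, points):
    """Piecewise-linear interpolation over (batch_count, value) breakpoints."""
    (x0, y0), (x1, y1) = points
    if batch_count <= x0:
        return y0
    if batch_count >= x1:
        return y1
    return y0 + (y1 - y0) * (batch_count - x0) / (x1 - x0)

def layer_skip_rate(batch_count, downsampling_factor):
    # warmup term shared by all stacks: 0.5 at batch 0, down to 0.0 from batch 4000 on
    warmup = schedule_value(batch_count, [(0.0, 0.5), (4000.0, 0.0)])
    # permanent floor that grows with the stack's downsampling factor
    floor = 0.025 * (downsampling_factor ** 0.5)
    return warmup + floor

for ds in (1, 2, 4, 8):
    print(ds, layer_skip_rate(0, ds), layer_skip_rate(10000, ds))
# e.g. downsampling_factor=8: ~0.571 at batch 0, ~0.071 once warmup is over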
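
The new `layer_skip_rate` value is then passed into each `ZipformerEncoderLayer` in the second hunk. The snippet below is only a generic illustration of the layerdrop idea (bypassing a layer with some probability during training); the actual skipping logic inside `ZipformerEncoderLayer` is not shown in this diff.

import torch
import torch.nn as nn

class LayerDropWrapper(nn.Module):
    # Hypothetical wrapper, not the real ZipformerEncoderLayer: during training
    # the wrapped layer is bypassed (identity) with probability `skip_rate`;
    # at eval time the layer always runs.
    def __init__(self, layer: nn.Module, skip_rate: float = 0.0):
        super().__init__()
        self.layer = layer
        self.skip_rate = skip_rate

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if self.training and torch.rand(()).item() < self.skip_rate:
            return x  # skip the whole layer for this batch
        return self.layer(x)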
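
The last hunk also lowers the `lr_scale` exponent from -0.2 to -0.25, so more-downsampled stacks receive a proportionally smaller learning rate. That attribute is consumed by `get_named_parameter_groups_with_lrs()`; the sketch below shows, in simplified hypothetical form, how a per-module `lr_scale` can be turned into optimizer parameter groups.

import torch

def groups_with_lr_scale(modules, base_lr):
    # Simplified, hypothetical version of what get_named_parameter_groups_with_lrs()
    # does with the `lr_scale` attribute: one parameter group per module, with the
    # learning rate multiplied by that module's lr_scale (default 1.0).
    groups = []
    for m in modules:
        scale = getattr(m, "lr_scale", 1.0)
        params = [p for p in m.parameters() if p.requires_grad]
        if params:
            groups.append({"params": params, "lr": base_lr * scale})
    return groups

# usage sketch (base_lr is an arbitrary placeholder value):
# optimizer = torch.optim.Adam(groups_with_lr_scale(model.encoders, base_lr=0.05))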