Implement layer dropout (in a relatively efficient way)

Daniel Povey 2022-10-08 16:07:20 +08:00
parent af545e061b
commit b1fa3d50fb


@@ -46,7 +46,6 @@ class Conformer(EncoderInterface):
dim_feedforward (int): feedforward dimension
num_encoder_layers (int): number of encoder layers
dropout (float): dropout rate
cnn_module_kernel (int): Kernel size of convolution module
vgg_frontend (bool): whether to use vgg frontend.
warmup_batches (float): number of batches to warm up over
@@ -403,9 +402,13 @@ class ConformerEncoder(nn.Module):
super().__init__()
# keep track of how many times forward() has been called, for purposes of
# warmup. do this with a floating-point count because integer counts can
# fail to survive model averaging.
self.register_buffer('warmup_count', torch.tensor(0.0))
# if this assert fails, increase the numbers in get_warmup_count().
assert warmup_end <= 1000000.0
self.encoder_pos = RelPositionalEncoding(encoder_layer.d_model,
dropout)
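Why the count must be floating point: model averaging typically rescales and sums each buffer in place, and PyTorch in-place ops cannot promote an integer tensor to a float result. A minimal sketch of that failure mode (the two-checkpoint average below is illustrative, not the actual averaging code):

import torch

float_count = torch.tensor(10.0)
int_count = torch.tensor(10)

# averaging two checkpoints' buffers in place works for floats...
float_count.mul_(0.5).add_(torch.tensor(20.0), alpha=0.5)   # -> 15.0

# ...but fails for integers, since in-place ops cannot change dtype
try:
    int_count.mul_(0.5)
except RuntimeError as err:
    print("integer buffer cannot be averaged:", err)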
@@ -432,10 +435,38 @@ class ConformerEncoder(nn.Module):
"""
ans = self.warmup_count.item()
if self.training:
if ans > 1000000.0:
# this ensures that as the number of batches gets large, the warmup count cycles rather
# than getting stuck at the smallest floating point value x such that x + 1 == x.
# this is necessary because get_layers_to_drop() relies on the warmup count changing
# on every batch.
next_count = 500000.0
else:
next_count = ans + 1.0
self.warmup_count.fill_(next_count)
return ans
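The cycling threshold exists because float32 stops being able to count by ones at 2**24 = 16777216; that is the "smallest floating point value x such that x + 1 == x" the comment refers to. A quick demonstration:

import torch

x = torch.tensor(2.0 ** 24)   # default dtype is float32
print(x + 1.0 == x)           # tensor(True): an ever-growing count would stall here

# cycling back to 500000.0 keeps warmup_count far below this limit, so
# get_layers_to_drop() sees a fresh seed on every training batch.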
def get_layers_to_drop(self, warmup_count: float):
ans = set()
if not self.training:
return ans
# We use a random number generator seeded from warmup_count because
# if there are multiple training processes we want them to all drop the
# same number of layers (not necessarily the same layers though). This
# will tend to minimize training time.
rng = random.Random(int(warmup_count))
num_layers = len(self.layers)
# x is the expected number of layers to drop
x = 0.075 * num_layers
# integerize x in a way that preserves the expectation.
num_layers_to_drop = int(x) + int(rng.random() < (x - int(x)))
while len(ans) < num_layers_to_drop:
# use the global random module, not rng, here because we don't want every
# process to drop the same specific layers.
ans.add(random.randrange(0, num_layers))
return ans
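To make the determinism argument concrete, here is a standalone sketch of the same arithmetic (the free function layers_to_drop and the default of 12 layers are made up for the demo); two calls with the same warmup count stand in for two training processes handling the same batch:

import random

def layers_to_drop(warmup_count: float, num_layers: int = 12) -> set:
    rng = random.Random(int(warmup_count))         # seeded identically on every process
    x = 0.075 * num_layers                         # expected number of layers to drop
    n = int(x) + int(rng.random() < (x - int(x)))  # integerize, preserving the expectation
    ans = set()
    while len(ans) < n:
        ans.add(random.randrange(0, num_layers))   # global RNG: choices differ per process
    return ans

a = layers_to_drop(42.0)
b = layers_to_drop(42.0)
assert len(a) == len(b)   # every process drops the same number of layers
print(a, b)               # but not necessarily the same layers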
def forward(
self,
src: Tensor,
@@ -468,9 +499,13 @@ class ConformerEncoder(nn.Module):
outputs = []
attn_scores = None
layers_to_drop = self.get_layers_to_drop(warmup_count)
output = output * feature_mask
for i, mod in enumerate(self.layers):
if i in layers_to_drop:
continue
next_output, attn_scores = mod(
output,
pos_emb,
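The "relatively efficient" part of the commit title is this skip: a dropped layer is never executed at all, rather than being executed and having its contribution zeroed. A toy contrast (hypothetical layers of nn.Linear; in the real encoder each Conformer layer carries its own residual connection, so skipping one acts as an identity):

import torch
import torch.nn as nn

layers = nn.ModuleList(nn.Linear(4, 4) for _ in range(3))
dropped = {1}
x = torch.randn(2, 4)

# masked variant: every layer still runs, so layer 1's work is wasted
y = x
for i, mod in enumerate(layers):
    out = mod(y)                    # computed even when dropped
    y = y if i in dropped else out

# skip variant, as in this commit: a dropped layer costs nothing
z = x
for i, mod in enumerate(layers):
    if i in dropped:
        continue
    z = mod(z)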