From 006fcc18cd3291867cb49f8b17dc1b82579b2954 Mon Sep 17 00:00:00 2001
From: Daniel Povey
Date: Tue, 4 Oct 2022 12:06:35 +0800
Subject: [PATCH] Introduce offset in layerdrop_scales

---
 .../pruned_transducer_stateless7/conformer.py | 51 +++++++++----------
 1 file changed, 25 insertions(+), 26 deletions(-)

diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7/conformer.py b/egs/librispeech/ASR/pruned_transducer_stateless7/conformer.py
index a0bba1649..94bb0aa7b 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless7/conformer.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7/conformer.py
@@ -343,7 +343,7 @@ class ConformerEncoder(nn.Module):
         self.num_layers = num_layers
         self.layerdrop_scale_mat = nn.Parameter(0.01 * torch.randn(num_layers, num_layers))
-
+        self.layerdrop_scale_offset = nn.Parameter(torch.ones(num_layers))
 
         assert num_layers - 1 not in aux_layers
         self.aux_layers = set(aux_layers + [num_layers - 1])
@@ -384,33 +384,32 @@ class ConformerEncoder(nn.Module):
 
         layerdrop_mask = torch.ones(num_layers, 2, device='cpu')
 
-        if not self.training or batch_size == 1:
-            return layerdrop_mask, None
-
-        halves_to_drop = int(2 * num_layers * self.layer_dropout)
-        for _ in range(halves_to_drop):
-            while True:
-                r = random.randrange(0, 2 * num_layers)
-                i = r // 2
-                j = r % 2
-                if layerdrop_mask[i, j - 1] == 0.0:
-                    # This position cannot be set to 0.0 because the other
-                    # half of the batch is already 0.0 (not computed). This would lead to
-                    # one layer not having a gradient.
-                    continue
-                if ((i > 0 and layerdrop_mask[i-1, j] == 0.0) or
-                    (i + 1 < num_layers and layerdrop_mask[i+1, j] == 0.0)):
-                    # This position cannot be set to False because the preceding
-                    # or following position for this same half of the batch is
-                    # already set to False
-                    continue
-                layerdrop_mask[i, j] = 0.0
-                break
+        if self.training and batch_size != 1:
+            halves_to_drop = int(2 * num_layers * self.layer_dropout)
+            for _ in range(halves_to_drop):
+                while True:
+                    r = random.randrange(0, 2 * num_layers)
+                    i = r // 2
+                    j = r % 2
+                    if layerdrop_mask[i, j - 1] == 0.0:
+                        # This position cannot be set to 0.0 because the other
+                        # half of the batch is already 0.0 (not computed). This would lead to
+                        # one layer not having a gradient.
+                        continue
+                    if ((i > 0 and layerdrop_mask[i-1, j] == 0.0) or
+                        (i + 1 < num_layers and layerdrop_mask[i+1, j] == 0.0)):
+                        # This position cannot be set to False because the preceding
+                        # or following position for this same half of the batch is
+                        # already set to False
+                        continue
+                    layerdrop_mask[i, j] = 0.0
+                    break
 
         # layerdrop_scales: currently shape is (2, num_layers)
         device = self.layerdrop_scale_mat.device
-        layerdrop_scales_tmp = 1.0 + torch.matmul(self.layerdrop_scale_mat,
-                                                  1.0 - layerdrop_mask.to(device))
+        layerdrop_scales_tmp = (self.layerdrop_scale_offset.unsqueeze(1) +
+                                torch.matmul(self.layerdrop_scale_mat,
+                                             1.0 - layerdrop_mask.to(device)))
 
         layerdrop_scales = torch.empty(num_layers, batch_size, 1, device=device)
         mid = batch_size // 2
@@ -482,7 +481,7 @@ class ConformerEncoder(nn.Module):
                 src_key_padding_mask=src_key_padding_mask,
                 warmup=warmup,
                 layerdrop_mask=layerdrop_mask[i].tolist(), # [ 1.0, 1.0 ], [0.0, 1.0] or [1.0, 0.0]
-                layerdrop_scales=None if layerdrop_scales is None else layerdrop_scales[i], # tensor of scales of shape (batch_size, 1)
+                layerdrop_scales=layerdrop_scales[i], # tensor of scales of shape (batch_size, 1)
             )
             output = output * feature_mask
             if i in self.aux_layers:
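
For context, a minimal standalone sketch of the scale computation this patch touches, assuming only the names and shapes visible in the hunks above (num_layers, the (num_layers, num_layers) layerdrop_scale_mat, the (num_layers, 2) layerdrop_mask, and the new per-layer offset); it is not the ConformerEncoder code itself, and the example mask values are made up:

    import torch

    num_layers = 4

    # Learnable parameters, as in the patch: a mixing matrix and, newly, a
    # per-layer offset that replaces the previous hard-coded 1.0.
    layerdrop_scale_mat = torch.nn.Parameter(0.01 * torch.randn(num_layers, num_layers))
    layerdrop_scale_offset = torch.nn.Parameter(torch.ones(num_layers))

    # layerdrop_mask: (num_layers, 2); 0.0 means "this layer is skipped for this
    # half of the batch".  Here layer 1 is dropped for the first half and layer 2
    # for the second half, purely as an illustration.
    layerdrop_mask = torch.ones(num_layers, 2)
    layerdrop_mask[1, 0] = 0.0
    layerdrop_mask[2, 1] = 0.0

    # Old: 1.0 + mat @ (1 - mask).  New: per-layer offset + mat @ (1 - mask).
    layerdrop_scales_tmp = (layerdrop_scale_offset.unsqueeze(1) +
                            torch.matmul(layerdrop_scale_mat, 1.0 - layerdrop_mask))
    print(layerdrop_scales_tmp.shape)  # torch.Size([4, 2]): one scale per (layer, half)

Since the offset is initialized to all ones, a fully kept mask still gives scales of exactly 1.0, matching the previous "1.0 +" behaviour at initialization; training can then presumably learn a different baseline scale per layer rather than sharing the fixed constant.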