From 923b60a7c6c7924692834d2b80d03a6b2e67b9ec Mon Sep 17 00:00:00 2001 From: "LIyong.Guo" Date: Wed, 28 Sep 2022 21:20:33 +0800 Subject: [PATCH] padding zeros (#591) --- egs/aishell/ASR/conformer_ctc/conformer.py | 13 +++++++++++-- egs/aishell/ASR/conformer_mmi/conformer.py | 13 +++++++++++-- egs/aishell/ASR/transducer_stateless/conformer.py | 13 +++++++++++-- egs/gigaspeech/ASR/conformer_ctc/conformer.py | 13 +++++++++++-- egs/librispeech/ASR/conformer_ctc/conformer.py | 13 +++++++++++-- egs/librispeech/ASR/conformer_ctc2/conformer.py | 13 +++++++++++-- egs/librispeech/ASR/conformer_mmi/conformer.py | 13 +++++++++++-- .../ASR/pruned_transducer_stateless2/conformer.py | 8 +++++++- .../ASR/pruned_transducer_stateless5/conformer.py | 13 +++++++++++-- .../ASR/pruned_transducer_stateless6/conformer.py | 13 +++++++++++-- .../ASR/transducer_stateless/conformer.py | 13 ++++++++++++- .../ASR/pruned_transducer_stateless5/conformer.py | 8 +++++++- 12 files changed, 125 insertions(+), 21 deletions(-) diff --git a/egs/aishell/ASR/conformer_ctc/conformer.py b/egs/aishell/ASR/conformer_ctc/conformer.py index 1e3e7b492..cb7205e51 100644 --- a/egs/aishell/ASR/conformer_ctc/conformer.py +++ b/egs/aishell/ASR/conformer_ctc/conformer.py @@ -248,7 +248,9 @@ class ConformerEncoderLayer(nn.Module): residual = src if self.normalize_before: src = self.norm_conv(src) - src = residual + self.dropout(self.conv_module(src)) + src = residual + self.dropout( + self.conv_module(src, src_key_padding_mask=src_key_padding_mask) + ) if not self.normalize_before: src = self.norm_conv(src) @@ -879,11 +881,16 @@ class ConvolutionModule(nn.Module): ) self.activation = Swish() - def forward(self, x: Tensor) -> Tensor: + def forward( + self, + x: Tensor, + src_key_padding_mask: Optional[Tensor] = None, + ) -> Tensor: """Compute convolution module. Args: x: Input tensor (#time, batch, channels). + src_key_padding_mask: the mask for the src keys per batch (optional). 
Returns: Tensor: Output tensor (#time, batch, channels). @@ -897,6 +904,8 @@ class ConvolutionModule(nn.Module): x = nn.functional.glu(x, dim=1) # (batch, channels, time) # 1D Depthwise Conv + if src_key_padding_mask is not None: + x.masked_fill_(src_key_padding_mask.unsqueeze(1).expand_as(x), 0.0) x = self.depthwise_conv(x) x = self.activation(self.norm(x)) diff --git a/egs/aishell/ASR/conformer_mmi/conformer.py b/egs/aishell/ASR/conformer_mmi/conformer.py index 1e3e7b492..cb7205e51 100644 --- a/egs/aishell/ASR/conformer_mmi/conformer.py +++ b/egs/aishell/ASR/conformer_mmi/conformer.py @@ -248,7 +248,9 @@ class ConformerEncoderLayer(nn.Module): residual = src if self.normalize_before: src = self.norm_conv(src) - src = residual + self.dropout(self.conv_module(src)) + src = residual + self.dropout( + self.conv_module(src, src_key_padding_mask=src_key_padding_mask) + ) if not self.normalize_before: src = self.norm_conv(src) @@ -879,11 +881,16 @@ class ConvolutionModule(nn.Module): ) self.activation = Swish() - def forward(self, x: Tensor) -> Tensor: + def forward( + self, + x: Tensor, + src_key_padding_mask: Optional[Tensor] = None, + ) -> Tensor: """Compute convolution module. Args: x: Input tensor (#time, batch, channels). + src_key_padding_mask: the mask for the src keys per batch (optional). Returns: Tensor: Output tensor (#time, batch, channels). 
@@ -897,6 +904,8 @@ class ConvolutionModule(nn.Module): x = nn.functional.glu(x, dim=1) # (batch, channels, time) # 1D Depthwise Conv + if src_key_padding_mask is not None: + x.masked_fill_(src_key_padding_mask.unsqueeze(1).expand_as(x), 0.0) x = self.depthwise_conv(x) x = self.activation(self.norm(x)) diff --git a/egs/aishell/ASR/transducer_stateless/conformer.py b/egs/aishell/ASR/transducer_stateless/conformer.py index 66eb3eb63..64114253d 100644 --- a/egs/aishell/ASR/transducer_stateless/conformer.py +++ b/egs/aishell/ASR/transducer_stateless/conformer.py @@ -246,7 +246,9 @@ class ConformerEncoderLayer(nn.Module): residual = src if self.normalize_before: src = self.norm_conv(src) - src = residual + self.dropout(self.conv_module(src)) + src = residual + self.dropout( + self.conv_module(src, src_key_padding_mask=src_key_padding_mask) + ) if not self.normalize_before: src = self.norm_conv(src) @@ -877,11 +879,16 @@ class ConvolutionModule(nn.Module): ) self.activation = Swish() - def forward(self, x: Tensor) -> Tensor: + def forward( + self, + x: Tensor, + src_key_padding_mask: Optional[Tensor] = None, + ) -> Tensor: """Compute convolution module. Args: x: Input tensor (#time, batch, channels). + src_key_padding_mask: the mask for the src keys per batch (optional). Returns: Tensor: Output tensor (#time, batch, channels). 
@@ -895,6 +902,8 @@ class ConvolutionModule(nn.Module): x = nn.functional.glu(x, dim=1) # (batch, channels, time) # 1D Depthwise Conv + if src_key_padding_mask is not None: + x.masked_fill_(src_key_padding_mask.unsqueeze(1).expand_as(x), 0.0) x = self.depthwise_conv(x) # x is (batch, channels, time) x = x.permute(0, 2, 1) diff --git a/egs/gigaspeech/ASR/conformer_ctc/conformer.py b/egs/gigaspeech/ASR/conformer_ctc/conformer.py index 36e0c7aea..6fac07f93 100644 --- a/egs/gigaspeech/ASR/conformer_ctc/conformer.py +++ b/egs/gigaspeech/ASR/conformer_ctc/conformer.py @@ -253,7 +253,9 @@ class ConformerEncoderLayer(nn.Module): residual = src if self.normalize_before: src = self.norm_conv(src) - src = residual + self.dropout(self.conv_module(src)) + src = residual + self.dropout( + self.conv_module(src, src_key_padding_mask=src_key_padding_mask) + ) if not self.normalize_before: src = self.norm_conv(src) @@ -890,11 +892,16 @@ class ConvolutionModule(nn.Module): ) self.activation = Swish() - def forward(self, x: Tensor) -> Tensor: + def forward( + self, + x: Tensor, + src_key_padding_mask: Optional[Tensor] = None, + ) -> Tensor: """Compute convolution module. Args: x: Input tensor (#time, batch, channels). + src_key_padding_mask: the mask for the src keys per batch (optional). Returns: Tensor: Output tensor (#time, batch, channels). 
@@ -908,6 +915,8 @@ class ConvolutionModule(nn.Module): x = nn.functional.glu(x, dim=1) # (batch, channels, time) # 1D Depthwise Conv + if src_key_padding_mask is not None: + x.masked_fill_(src_key_padding_mask.unsqueeze(1).expand_as(x), 0.0) x = self.depthwise_conv(x) if self.use_batchnorm: x = self.norm(x) diff --git a/egs/librispeech/ASR/conformer_ctc/conformer.py b/egs/librispeech/ASR/conformer_ctc/conformer.py index 36e0c7aea..6fac07f93 100644 --- a/egs/librispeech/ASR/conformer_ctc/conformer.py +++ b/egs/librispeech/ASR/conformer_ctc/conformer.py @@ -253,7 +253,9 @@ class ConformerEncoderLayer(nn.Module): residual = src if self.normalize_before: src = self.norm_conv(src) - src = residual + self.dropout(self.conv_module(src)) + src = residual + self.dropout( + self.conv_module(src, src_key_padding_mask=src_key_padding_mask) + ) if not self.normalize_before: src = self.norm_conv(src) @@ -890,11 +892,16 @@ class ConvolutionModule(nn.Module): ) self.activation = Swish() - def forward(self, x: Tensor) -> Tensor: + def forward( + self, + x: Tensor, + src_key_padding_mask: Optional[Tensor] = None, + ) -> Tensor: """Compute convolution module. Args: x: Input tensor (#time, batch, channels). + src_key_padding_mask: the mask for the src keys per batch (optional). Returns: Tensor: Output tensor (#time, batch, channels). 
@@ -908,6 +915,8 @@ class ConvolutionModule(nn.Module): x = nn.functional.glu(x, dim=1) # (batch, channels, time) # 1D Depthwise Conv + if src_key_padding_mask is not None: + x.masked_fill_(src_key_padding_mask.unsqueeze(1).expand_as(x), 0.0) x = self.depthwise_conv(x) if self.use_batchnorm: x = self.norm(x) diff --git a/egs/librispeech/ASR/conformer_ctc2/conformer.py b/egs/librispeech/ASR/conformer_ctc2/conformer.py index fb11a5fc8..b906d2650 100644 --- a/egs/librispeech/ASR/conformer_ctc2/conformer.py +++ b/egs/librispeech/ASR/conformer_ctc2/conformer.py @@ -268,7 +268,9 @@ class ConformerEncoderLayer(nn.Module): src = src + self.dropout(src_att) # convolution module - src = src + self.dropout(self.conv_module(src)) + src = src + self.dropout( + self.conv_module(src, src_key_padding_mask=src_key_padding_mask) + ) # feed forward module src = src + self.dropout(self.feed_forward(src)) @@ -921,11 +923,16 @@ class ConvolutionModule(nn.Module): initial_scale=0.25, ) - def forward(self, x: Tensor) -> Tensor: + def forward( + self, + x: Tensor, + src_key_padding_mask: Optional[Tensor] = None, + ) -> Tensor: """Compute convolution module. Args: x: Input tensor (#time, batch, channels). + src_key_padding_mask: the mask for the src keys per batch (optional). Returns: Tensor: Output tensor (#time, batch, channels). 
@@ -941,6 +948,8 @@ class ConvolutionModule(nn.Module): x = nn.functional.glu(x, dim=1) # (batch, channels, time) # 1D Depthwise Conv + if src_key_padding_mask is not None: + x.masked_fill_(src_key_padding_mask.unsqueeze(1).expand_as(x), 0.0) x = self.depthwise_conv(x) x = self.deriv_balancer2(x) diff --git a/egs/librispeech/ASR/conformer_mmi/conformer.py b/egs/librispeech/ASR/conformer_mmi/conformer.py index b5f22825d..97c8d83a2 100644 --- a/egs/librispeech/ASR/conformer_mmi/conformer.py +++ b/egs/librispeech/ASR/conformer_mmi/conformer.py @@ -247,7 +247,9 @@ class ConformerEncoderLayer(nn.Module): residual = src if self.normalize_before: src = self.norm_conv(src) - src = residual + self.dropout(self.conv_module(src)) + src = residual + self.dropout( + self.conv_module(src, src_key_padding_mask=src_key_padding_mask) + ) if not self.normalize_before: src = self.norm_conv(src) @@ -878,11 +880,16 @@ class ConvolutionModule(nn.Module): ) self.activation = Swish() - def forward(self, x: Tensor) -> Tensor: + def forward( + self, + x: Tensor, + src_key_padding_mask: Optional[Tensor] = None, + ) -> Tensor: """Compute convolution module. Args: x: Input tensor (#time, batch, channels). + src_key_padding_mask: the mask for the src keys per batch (optional). Returns: Tensor: Output tensor (#time, batch, channels). 
@@ -896,6 +903,8 @@ class ConvolutionModule(nn.Module): x = nn.functional.glu(x, dim=1) # (batch, channels, time) # 1D Depthwise Conv + if src_key_padding_mask is not None: + x.masked_fill_(src_key_padding_mask.unsqueeze(1).expand_as(x), 0.0) x = self.depthwise_conv(x) x = self.activation(self.norm(x)) diff --git a/egs/librispeech/ASR/pruned_transducer_stateless2/conformer.py b/egs/librispeech/ASR/pruned_transducer_stateless2/conformer.py index 9a0405c57..c10678549 100644 --- a/egs/librispeech/ASR/pruned_transducer_stateless2/conformer.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless2/conformer.py @@ -527,7 +527,9 @@ class ConformerEncoderLayer(nn.Module): src = src + self.dropout(src_att) # convolution module - conv, _ = self.conv_module(src) + conv, _ = self.conv_module( + src, src_key_padding_mask=src_key_padding_mask + ) src = src + self.dropout(conv) # feed forward module @@ -1457,6 +1459,7 @@ class ConvolutionModule(nn.Module): x: Tensor, cache: Optional[Tensor] = None, right_context: int = 0, + src_key_padding_mask: Optional[Tensor] = None, ) -> Tuple[Tensor, Tensor]: """Compute convolution module. @@ -1467,6 +1470,7 @@ right_context: How many future frames the attention can see in current chunk. Note: It's not that each individual frame has `right_context` frames of right context, some have more. + src_key_padding_mask: the mask for the src keys per batch (optional). 
Returns: @@ -1486,6 +1490,8 @@ class ConvolutionModule(nn.Module): x = nn.functional.glu(x, dim=1) # (batch, channels, time) # 1D Depthwise Conv + if src_key_padding_mask is not None: + x.masked_fill_(src_key_padding_mask.unsqueeze(1).expand_as(x), 0.0) if self.causal and self.lorder > 0: if cache is None: # Make depthwise_conv causal by diff --git a/egs/librispeech/ASR/pruned_transducer_stateless5/conformer.py b/egs/librispeech/ASR/pruned_transducer_stateless5/conformer.py index 9d63cb123..427b06294 100644 --- a/egs/librispeech/ASR/pruned_transducer_stateless5/conformer.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless5/conformer.py @@ -527,7 +527,9 @@ class ConformerEncoderLayer(nn.Module): src = src + self.dropout(src_att) # convolution module - conv, _ = self.conv_module(src) + conv, _ = self.conv_module( + src, src_key_padding_mask=src_key_padding_mask + ) src = src + self.dropout(conv) # feed forward module @@ -1436,7 +1438,11 @@ class ConvolutionModule(nn.Module): ) def forward( - self, x: Tensor, cache: Optional[Tensor] = None, right_context: int = 0 + self, + x: Tensor, + cache: Optional[Tensor] = None, + right_context: int = 0, + src_key_padding_mask: Optional[Tensor] = None, ) -> Tuple[Tensor, Tensor]: """Compute convolution module. @@ -1448,6 +1454,7 @@ class ConvolutionModule(nn.Module): How many future frames the attention can see in current chunk. Note: It's not that each individual frame has `right_context` frames of right context, some have more. + src_key_padding_mask: the mask for the src keys per batch (optional). Returns: Tensor: Output tensor (#time, batch, channels). 
@@ -1466,6 +1473,8 @@ class ConvolutionModule(nn.Module): x = nn.functional.glu(x, dim=1) # (batch, channels, time) # 1D Depthwise Conv + if src_key_padding_mask is not None: + x.masked_fill_(src_key_padding_mask.unsqueeze(1).expand_as(x), 0.0) if self.causal and self.lorder > 0: if cache is None: # Make depthwise_conv causal by diff --git a/egs/librispeech/ASR/pruned_transducer_stateless6/conformer.py b/egs/librispeech/ASR/pruned_transducer_stateless6/conformer.py index 90f2c8b1d..53788b3f7 100644 --- a/egs/librispeech/ASR/pruned_transducer_stateless6/conformer.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless6/conformer.py @@ -264,7 +264,9 @@ class ConformerEncoderLayer(nn.Module): src = src + self.dropout(src_att) # convolution module - src = src + self.dropout(self.conv_module(src)) + src = src + self.dropout( + self.conv_module(src, src_key_padding_mask=src_key_padding_mask) + ) # feed forward module src = src + self.dropout(self.feed_forward(src)) @@ -927,11 +929,16 @@ class ConvolutionModule(nn.Module): initial_scale=0.25, ) - def forward(self, x: Tensor) -> Tensor: + def forward( + self, + x: Tensor, + src_key_padding_mask: Optional[Tensor] = None, + ) -> Tensor: """Compute convolution module. Args: x: Input tensor (#time, batch, channels). + src_key_padding_mask: the mask for the src keys per batch (optional). Returns: Tensor: Output tensor (#time, batch, channels). 
@@ -947,6 +954,8 @@ class ConvolutionModule(nn.Module): x = nn.functional.glu(x, dim=1) # (batch, channels, time) # 1D Depthwise Conv + if src_key_padding_mask is not None: + x.masked_fill_(src_key_padding_mask.unsqueeze(1).expand_as(x), 0.0) x = self.depthwise_conv(x) x = self.deriv_balancer2(x) diff --git a/egs/librispeech/ASR/transducer_stateless/conformer.py b/egs/librispeech/ASR/transducer_stateless/conformer.py index 2bf633201..cde52c9fc 100644 --- a/egs/librispeech/ASR/transducer_stateless/conformer.py +++ b/egs/librispeech/ASR/transducer_stateless/conformer.py @@ -514,7 +514,9 @@ class ConformerEncoderLayer(nn.Module): if self.normalize_before: src = self.norm_conv(src) - src, _ = self.conv_module(src) + src, _ = self.conv_module( + src, src_key_padding_mask=src_key_padding_mask + ) src = residual + self.dropout(src) if not self.normalize_before: @@ -1383,11 +1385,19 @@ class ConvolutionModule(nn.Module): x: Tensor, cache: Optional[Tensor] = None, right_context: int = 0, + src_key_padding_mask: Optional[Tensor] = None, ) -> Tuple[Tensor, Tensor]: """Compute convolution module. Args: x: Input tensor (#time, batch, channels). + cache: The cache of depthwise_conv, only used in real streaming + decoding. + right_context: + How many future frames the attention can see in current chunk. + Note: It's not that each individual frame has `right_context` frames + of right context, some have more. + src_key_padding_mask: the mask for the src keys per batch (optional). Returns: Tensor: Output tensor (#time, batch, channels). 
@@ -1401,6 +1410,8 @@ class ConvolutionModule(nn.Module): x = nn.functional.glu(x, dim=1) # (batch, channels, time) # 1D Depthwise Conv + if src_key_padding_mask is not None: + x.masked_fill_(src_key_padding_mask.unsqueeze(1).expand_as(x), 0.0) if self.causal and self.lorder > 0: if cache is None: # Make depthwise_conv causal by diff --git a/egs/wenetspeech/ASR/pruned_transducer_stateless5/conformer.py b/egs/wenetspeech/ASR/pruned_transducer_stateless5/conformer.py index 78baa2b78..dd27c17f0 100644 --- a/egs/wenetspeech/ASR/pruned_transducer_stateless5/conformer.py +++ b/egs/wenetspeech/ASR/pruned_transducer_stateless5/conformer.py @@ -520,7 +520,9 @@ class ConformerEncoderLayer(nn.Module): src = src + self.dropout(src_att) # convolution module - conv, _ = self.conv_module(src) + conv, _ = self.conv_module( + src, src_key_padding_mask=src_key_padding_mask + ) src = src + self.dropout(conv) # feed forward module @@ -1392,6 +1394,7 @@ class ConvolutionModule(nn.Module): x: Tensor, cache: Optional[Tensor] = None, right_context: int = 0, + src_key_padding_mask: Optional[Tensor] = None, ) -> Tuple[Tensor, Tensor]: """Compute convolution module. Args: @@ -1402,6 +1405,7 @@ class ConvolutionModule(nn.Module): How many future frames the attention can see in current chunk. Note: It's not that each individual frame has `right_context` frames of right context, some have more. + src_key_padding_mask: the mask for the src keys per batch (optional). Returns: If cache is None return the output tensor (#time, batch, channels). If cache is not None, return a tuple of Tensor, the first one is @@ -1418,6 +1422,8 @@ class ConvolutionModule(nn.Module): x = nn.functional.glu(x, dim=1) # (batch, channels, time) # 1D Depthwise Conv + if src_key_padding_mask is not None: + x.masked_fill_(src_key_padding_mask.unsqueeze(1).expand_as(x), 0.0) if self.causal and self.lorder > 0: if cache is None: # Make depthwise_conv causal by