From 923b60a7c6c7924692834d2b80d03a6b2e67b9ec Mon Sep 17 00:00:00 2001 From: "LIyong.Guo" Date: Wed, 28 Sep 2022 21:20:33 +0800 Subject: [PATCH] padding zeros (#591) --- egs/aishell/ASR/conformer_ctc/conformer.py | 13 +++++++++++-- egs/aishell/ASR/conformer_mmi/conformer.py | 13 +++++++++++-- egs/aishell/ASR/transducer_stateless/conformer.py | 13 +++++++++++-- egs/gigaspeech/ASR/conformer_ctc/conformer.py | 13 +++++++++++-- egs/librispeech/ASR/conformer_ctc/conformer.py | 13 +++++++++++-- egs/librispeech/ASR/conformer_ctc2/conformer.py | 13 +++++++++++-- egs/librispeech/ASR/conformer_mmi/conformer.py | 13 +++++++++++-- .../ASR/pruned_transducer_stateless2/conformer.py | 8 +++++++- .../ASR/pruned_transducer_stateless5/conformer.py | 13 +++++++++++-- .../ASR/pruned_transducer_stateless6/conformer.py | 13 +++++++++++-- .../ASR/transducer_stateless/conformer.py | 13 ++++++++++++- .../ASR/pruned_transducer_stateless5/conformer.py | 8 +++++++- 12 files changed, 125 insertions(+), 21 deletions(-) diff --git a/egs/aishell/ASR/conformer_ctc/conformer.py b/egs/aishell/ASR/conformer_ctc/conformer.py index 1e3e7b492..cb7205e51 100644 --- a/egs/aishell/ASR/conformer_ctc/conformer.py +++ b/egs/aishell/ASR/conformer_ctc/conformer.py @@ -248,7 +248,9 @@ class ConformerEncoderLayer(nn.Module): residual = src if self.normalize_before: src = self.norm_conv(src) - src = residual + self.dropout(self.conv_module(src)) + src = residual + self.dropout( + self.conv_module(src, src_key_padding_mask=src_key_padding_mask) + ) if not self.normalize_before: src = self.norm_conv(src) @@ -879,11 +881,16 @@ class ConvolutionModule(nn.Module): ) self.activation = Swish() - def forward(self, x: Tensor) -> Tensor: + def forward( + self, + x: Tensor, + src_key_padding_mask: Optional[Tensor] = None, + ) -> Tensor: """Compute convolution module. Args: x: Input tensor (#time, batch, channels). + src_key_padding_mask: the mask for the src keys per batch (optional). 
Returns: Tensor: Output tensor (#time, batch, channels). @@ -897,6 +904,8 @@ class ConvolutionModule(nn.Module): x = nn.functional.glu(x, dim=1) # (batch, channels, time) # 1D Depthwise Conv + if src_key_padding_mask is not None: + x.masked_fill_(src_key_padding_mask.unsqueeze(1).expand_as(x), 0.0) x = self.depthwise_conv(x) x = self.activation(self.norm(x)) diff --git a/egs/aishell/ASR/conformer_mmi/conformer.py b/egs/aishell/ASR/conformer_mmi/conformer.py index 1e3e7b492..cb7205e51 100644 --- a/egs/aishell/ASR/conformer_mmi/conformer.py +++ b/egs/aishell/ASR/conformer_mmi/conformer.py @@ -248,7 +248,9 @@ class ConformerEncoderLayer(nn.Module): residual = src if self.normalize_before: src = self.norm_conv(src) - src = residual + self.dropout(self.conv_module(src)) + src = residual + self.dropout( + self.conv_module(src, src_key_padding_mask=src_key_padding_mask) + ) if not self.normalize_before: src = self.norm_conv(src) @@ -879,11 +881,16 @@ class ConvolutionModule(nn.Module): ) self.activation = Swish() - def forward(self, x: Tensor) -> Tensor: + def forward( + self, + x: Tensor, + src_key_padding_mask: Optional[Tensor] = None, + ) -> Tensor: """Compute convolution module. Args: x: Input tensor (#time, batch, channels). + src_key_padding_mask: the mask for the src keys per batch (optional). Returns: Tensor: Output tensor (#time, batch, channels). 
@@ -897,6 +904,8 @@ class ConvolutionModule(nn.Module): x = nn.functional.glu(x, dim=1) # (batch, channels, time) # 1D Depthwise Conv + if src_key_padding_mask is not None: + x.masked_fill_(src_key_padding_mask.unsqueeze(1).expand_as(x), 0.0) x = self.depthwise_conv(x) x = self.activation(self.norm(x)) diff --git a/egs/aishell/ASR/transducer_stateless/conformer.py b/egs/aishell/ASR/transducer_stateless/conformer.py index 66eb3eb63..64114253d 100644 --- a/egs/aishell/ASR/transducer_stateless/conformer.py +++ b/egs/aishell/ASR/transducer_stateless/conformer.py @@ -246,7 +246,9 @@ class ConformerEncoderLayer(nn.Module): residual = src if self.normalize_before: src = self.norm_conv(src) - src = residual + self.dropout(self.conv_module(src)) + src = residual + self.dropout( + self.conv_module(src, src_key_padding_mask=src_key_padding_mask) + ) if not self.normalize_before: src = self.norm_conv(src) @@ -877,11 +879,16 @@ class ConvolutionModule(nn.Module): ) self.activation = Swish() - def forward(self, x: Tensor) -> Tensor: + def forward( + self, + x: Tensor, + src_key_padding_mask: Optional[Tensor] = None, + ) -> Tensor: """Compute convolution module. Args: x: Input tensor (#time, batch, channels). + src_key_padding_mask: the mask for the src keys per batch (optional). Returns: Tensor: Output tensor (#time, batch, channels). 
@@ -895,6 +902,8 @@ class ConvolutionModule(nn.Module): x = nn.functional.glu(x, dim=1) # (batch, channels, time) # 1D Depthwise Conv + if src_key_padding_mask is not None: + x.masked_fill_(src_key_padding_mask.unsqueeze(1).expand_as(x), 0.0) x = self.depthwise_conv(x) # x is (batch, channels, time) x = x.permute(0, 2, 1) diff --git a/egs/gigaspeech/ASR/conformer_ctc/conformer.py b/egs/gigaspeech/ASR/conformer_ctc/conformer.py index 36e0c7aea..6fac07f93 100644 --- a/egs/gigaspeech/ASR/conformer_ctc/conformer.py +++ b/egs/gigaspeech/ASR/conformer_ctc/conformer.py @@ -253,7 +253,9 @@ class ConformerEncoderLayer(nn.Module): residual = src if self.normalize_before: src = self.norm_conv(src) - src = residual + self.dropout(self.conv_module(src)) + src = residual + self.dropout( + self.conv_module(src, src_key_padding_mask=src_key_padding_mask) + ) if not self.normalize_before: src = self.norm_conv(src) @@ -890,11 +892,16 @@ class ConvolutionModule(nn.Module): ) self.activation = Swish() - def forward(self, x: Tensor) -> Tensor: + def forward( + self, + x: Tensor, + src_key_padding_mask: Optional[Tensor] = None, + ) -> Tensor: """Compute convolution module. Args: x: Input tensor (#time, batch, channels). + src_key_padding_mask: the mask for the src keys per batch (optional). Returns: Tensor: Output tensor (#time, batch, channels). 
@@ -908,6 +915,8 @@ class ConvolutionModule(nn.Module): x = nn.functional.glu(x, dim=1) # (batch, channels, time) # 1D Depthwise Conv + if src_key_padding_mask is not None: + x.masked_fill_(src_key_padding_mask.unsqueeze(1).expand_as(x), 0.0) x = self.depthwise_conv(x) if self.use_batchnorm: x = self.norm(x) diff --git a/egs/librispeech/ASR/conformer_ctc/conformer.py b/egs/librispeech/ASR/conformer_ctc/conformer.py index 36e0c7aea..6fac07f93 100644 --- a/egs/librispeech/ASR/conformer_ctc/conformer.py +++ b/egs/librispeech/ASR/conformer_ctc/conformer.py @@ -253,7 +253,9 @@ class ConformerEncoderLayer(nn.Module): residual = src if self.normalize_before: src = self.norm_conv(src) - src = residual + self.dropout(self.conv_module(src)) + src = residual + self.dropout( + self.conv_module(src, src_key_padding_mask=src_key_padding_mask) + ) if not self.normalize_before: src = self.norm_conv(src) @@ -890,11 +892,16 @@ class ConvolutionModule(nn.Module): ) self.activation = Swish() - def forward(self, x: Tensor) -> Tensor: + def forward( + self, + x: Tensor, + src_key_padding_mask: Optional[Tensor] = None, + ) -> Tensor: """Compute convolution module. Args: x: Input tensor (#time, batch, channels). + src_key_padding_mask: the mask for the src keys per batch (optional). Returns: Tensor: Output tensor (#time, batch, channels). 
@@ -908,6 +915,8 @@ class ConvolutionModule(nn.Module): x = nn.functional.glu(x, dim=1) # (batch, channels, time) # 1D Depthwise Conv + if src_key_padding_mask is not None: + x.masked_fill_(src_key_padding_mask.unsqueeze(1).expand_as(x), 0.0) x = self.depthwise_conv(x) if self.use_batchnorm: x = self.norm(x) diff --git a/egs/librispeech/ASR/conformer_ctc2/conformer.py b/egs/librispeech/ASR/conformer_ctc2/conformer.py index fb11a5fc8..b906d2650 100644 --- a/egs/librispeech/ASR/conformer_ctc2/conformer.py +++ b/egs/librispeech/ASR/conformer_ctc2/conformer.py @@ -268,7 +268,9 @@ class ConformerEncoderLayer(nn.Module): src = src + self.dropout(src_att) # convolution module - src = src + self.dropout(self.conv_module(src)) + src = src + self.dropout( + self.conv_module(src, src_key_padding_mask=src_key_padding_mask) + ) # feed forward module src = src + self.dropout(self.feed_forward(src)) @@ -921,11 +923,16 @@ class ConvolutionModule(nn.Module): initial_scale=0.25, ) - def forward(self, x: Tensor) -> Tensor: + def forward( + self, + x: Tensor, + src_key_padding_mask: Optional[Tensor] = None, + ) -> Tensor: """Compute convolution module. Args: x: Input tensor (#time, batch, channels). + src_key_padding_mask: the mask for the src keys per batch (optional). Returns: Tensor: Output tensor (#time, batch, channels). 
@@ -941,6 +948,8 @@ class ConvolutionModule(nn.Module): x = nn.functional.glu(x, dim=1) # (batch, channels, time) # 1D Depthwise Conv + if src_key_padding_mask is not None: + x.masked_fill_(src_key_padding_mask.unsqueeze(1).expand_as(x), 0.0) x = self.depthwise_conv(x) x = self.deriv_balancer2(x) diff --git a/egs/librispeech/ASR/conformer_mmi/conformer.py b/egs/librispeech/ASR/conformer_mmi/conformer.py index b5f22825d..97c8d83a2 100644 --- a/egs/librispeech/ASR/conformer_mmi/conformer.py +++ b/egs/librispeech/ASR/conformer_mmi/conformer.py @@ -247,7 +247,9 @@ class ConformerEncoderLayer(nn.Module): residual = src if self.normalize_before: src = self.norm_conv(src) - src = residual + self.dropout(self.conv_module(src)) + src = residual + self.dropout( + self.conv_module(src, src_key_padding_mask=src_key_padding_mask) + ) if not self.normalize_before: src = self.norm_conv(src) @@ -878,11 +880,16 @@ class ConvolutionModule(nn.Module): ) self.activation = Swish() - def forward(self, x: Tensor) -> Tensor: + def forward( + self, + x: Tensor, + src_key_padding_mask: Optional[Tensor] = None, + ) -> Tensor: """Compute convolution module. Args: x: Input tensor (#time, batch, channels). + src_key_padding_mask: the mask for the src keys per batch (optional). Returns: Tensor: Output tensor (#time, batch, channels). 
@@ -896,6 +903,8 @@ class ConvolutionModule(nn.Module): x = nn.functional.glu(x, dim=1) # (batch, channels, time) # 1D Depthwise Conv + if src_key_padding_mask is not None: + x.masked_fill_(src_key_padding_mask.unsqueeze(1).expand_as(x), 0.0) x = self.depthwise_conv(x) x = self.activation(self.norm(x)) diff --git a/egs/librispeech/ASR/pruned_transducer_stateless2/conformer.py b/egs/librispeech/ASR/pruned_transducer_stateless2/conformer.py index 9a0405c57..c10678549 100644 --- a/egs/librispeech/ASR/pruned_transducer_stateless2/conformer.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless2/conformer.py @@ -527,7 +527,9 @@ class ConformerEncoderLayer(nn.Module): src = src + self.dropout(src_att) # convolution module - conv, _ = self.conv_module(src) + conv, _ = self.conv_module( + src, src_key_padding_mask=src_key_padding_mask + ) src = src + self.dropout(conv) # feed forward module @@ -1457,6 +1459,7 @@ class ConvolutionModule(nn.Module): x: Tensor, cache: Optional[Tensor] = None, right_context: int = 0, + src_key_padding_mask: Optional[Tensor] = None, ) -> Tuple[Tensor, Tensor]: """Compute convolution module. @@ -1467,6 +1470,7 @@ right_context: How many future frames the attention can see in current chunk. Note: It's not that each individual frame has `right_context` frames of right context, some have more. + src_key_padding_mask: the mask for the src keys per batch (optional). 
Returns: @@ -1486,6 +1490,8 @@ class ConvolutionModule(nn.Module): x = nn.functional.glu(x, dim=1) # (batch, channels, time) # 1D Depthwise Conv + if src_key_padding_mask is not None: + x.masked_fill_(src_key_padding_mask.unsqueeze(1).expand_as(x), 0.0) if self.causal and self.lorder > 0: if cache is None: # Make depthwise_conv causal by diff --git a/egs/librispeech/ASR/pruned_transducer_stateless5/conformer.py b/egs/librispeech/ASR/pruned_transducer_stateless5/conformer.py index 9d63cb123..427b06294 100644 --- a/egs/librispeech/ASR/pruned_transducer_stateless5/conformer.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless5/conformer.py @@ -527,7 +527,9 @@ class ConformerEncoderLayer(nn.Module): src = src + self.dropout(src_att) # convolution module - conv, _ = self.conv_module(src) + conv, _ = self.conv_module( + src, src_key_padding_mask=src_key_padding_mask + ) src = src + self.dropout(conv) # feed forward module @@ -1436,7 +1438,11 @@ class ConvolutionModule(nn.Module): ) def forward( - self, x: Tensor, cache: Optional[Tensor] = None, right_context: int = 0 + self, + x: Tensor, + cache: Optional[Tensor] = None, + right_context: int = 0, + src_key_padding_mask: Optional[Tensor] = None, ) -> Tuple[Tensor, Tensor]: """Compute convolution module. @@ -1448,6 +1454,7 @@ class ConvolutionModule(nn.Module): How many future frames the attention can see in current chunk. Note: It's not that each individual frame has `right_context` frames of right context, some have more. + src_key_padding_mask: the mask for the src keys per batch (optional). Returns: Tensor: Output tensor (#time, batch, channels). 
@@ -1466,6 +1473,8 @@ class ConvolutionModule(nn.Module): x = nn.functional.glu(x, dim=1) # (batch, channels, time) # 1D Depthwise Conv + if src_key_padding_mask is not None: + x.masked_fill_(src_key_padding_mask.unsqueeze(1).expand_as(x), 0.0) if self.causal and self.lorder > 0: if cache is None: # Make depthwise_conv causal by diff --git a/egs/librispeech/ASR/pruned_transducer_stateless6/conformer.py b/egs/librispeech/ASR/pruned_transducer_stateless6/conformer.py index 90f2c8b1d..53788b3f7 100644 --- a/egs/librispeech/ASR/pruned_transducer_stateless6/conformer.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless6/conformer.py @@ -264,7 +264,9 @@ class ConformerEncoderLayer(nn.Module): src = src + self.dropout(src_att) # convolution module - src = src + self.dropout(self.conv_module(src)) + src = src + self.dropout( + self.conv_module(src, src_key_padding_mask=src_key_padding_mask) + ) # feed forward module src = src + self.dropout(self.feed_forward(src)) @@ -927,11 +929,16 @@ class ConvolutionModule(nn.Module): initial_scale=0.25, ) - def forward(self, x: Tensor) -> Tensor: + def forward( + self, + x: Tensor, + src_key_padding_mask: Optional[Tensor] = None, + ) -> Tensor: """Compute convolution module. Args: x: Input tensor (#time, batch, channels). + src_key_padding_mask: the mask for the src keys per batch (optional). Returns: Tensor: Output tensor (#time, batch, channels). 
@@ -947,6 +954,8 @@ class ConvolutionModule(nn.Module): x = nn.functional.glu(x, dim=1) # (batch, channels, time) # 1D Depthwise Conv + if src_key_padding_mask is not None: + x.masked_fill_(src_key_padding_mask.unsqueeze(1).expand_as(x), 0.0) x = self.depthwise_conv(x) x = self.deriv_balancer2(x) diff --git a/egs/librispeech/ASR/transducer_stateless/conformer.py b/egs/librispeech/ASR/transducer_stateless/conformer.py index 2bf633201..cde52c9fc 100644 --- a/egs/librispeech/ASR/transducer_stateless/conformer.py +++ b/egs/librispeech/ASR/transducer_stateless/conformer.py @@ -514,7 +514,9 @@ class ConformerEncoderLayer(nn.Module): if self.normalize_before: src = self.norm_conv(src) - src, _ = self.conv_module(src) + src, _ = self.conv_module( + src, src_key_padding_mask=src_key_padding_mask + ) src = residual + self.dropout(src) if not self.normalize_before: @@ -1383,11 +1385,19 @@ class ConvolutionModule(nn.Module): x: Tensor, cache: Optional[Tensor] = None, right_context: int = 0, + src_key_padding_mask: Optional[Tensor] = None, ) -> Tuple[Tensor, Tensor]: """Compute convolution module. Args: x: Input tensor (#time, batch, channels). + cache: The cache of depthwise_conv, only used in real streaming + decoding. + right_context: + How many future frames the attention can see in current chunk. + Note: It's not that each individual frame has `right_context` frames + of right context, some have more. + src_key_padding_mask: the mask for the src keys per batch (optional). Returns: Tensor: Output tensor (#time, batch, channels). 
@@ -1401,6 +1410,8 @@ class ConvolutionModule(nn.Module): x = nn.functional.glu(x, dim=1) # (batch, channels, time) # 1D Depthwise Conv + if src_key_padding_mask is not None: + x.masked_fill_(src_key_padding_mask.unsqueeze(1).expand_as(x), 0.0) if self.causal and self.lorder > 0: if cache is None: # Make depthwise_conv causal by diff --git a/egs/wenetspeech/ASR/pruned_transducer_stateless5/conformer.py b/egs/wenetspeech/ASR/pruned_transducer_stateless5/conformer.py index 78baa2b78..dd27c17f0 100644 --- a/egs/wenetspeech/ASR/pruned_transducer_stateless5/conformer.py +++ b/egs/wenetspeech/ASR/pruned_transducer_stateless5/conformer.py @@ -520,7 +520,9 @@ class ConformerEncoderLayer(nn.Module): src = src + self.dropout(src_att) # convolution module - conv, _ = self.conv_module(src) + conv, _ = self.conv_module( + src, src_key_padding_mask=src_key_padding_mask + ) src = src + self.dropout(conv) # feed forward module @@ -1392,6 +1394,7 @@ class ConvolutionModule(nn.Module): x: Tensor, cache: Optional[Tensor] = None, right_context: int = 0, + src_key_padding_mask: Optional[Tensor] = None, ) -> Tuple[Tensor, Tensor]: """Compute convolution module. Args: @@ -1402,6 +1405,7 @@ class ConvolutionModule(nn.Module): How many future frames the attention can see in current chunk. Note: It's not that each individual frame has `right_context` frames of right context, some have more. + src_key_padding_mask: the mask for the src keys per batch (optional). Returns: If cache is None return the output tensor (#time, batch, channels). If cache is not None, return a tuple of Tensor, the first one is @@ -1418,6 +1422,8 @@ class ConvolutionModule(nn.Module): x = nn.functional.glu(x, dim=1) # (batch, channels, time) # 1D Depthwise Conv + if src_key_padding_mask is not None: + x.masked_fill_(src_key_padding_mask.unsqueeze(1).expand_as(x), 0.0) if self.causal and self.lorder > 0: if cache is None: # Make depthwise_conv causal by