padding zeros (#591)

Author: LIyong.Guo, 2022-09-28 21:20:33 +08:00, committed by GitHub
parent 3b5846effa
commit 923b60a7c6
12 changed files with 125 additions and 21 deletions
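
The change is identical in shape across all twelve changed files (each a copy of the Conformer encoder): ConvolutionModule.forward gains an optional src_key_padding_mask argument, and padded frames are zeroed in place immediately before the 1D depthwise convolution, so padding can no longer bleed into valid frames through the convolution kernel. A minimal sketch of the core operation (shapes follow the diff; the example data is made up):

import torch

# Activations right after the GLU: (batch, channels, time).
x = torch.randn(2, 4, 6)
# Key padding mask: (batch, time), True at padded positions.
src_key_padding_mask = torch.tensor(
    [[False] * 6, [False] * 3 + [True] * 3]
)

# unsqueeze(1) lifts the mask to (batch, 1, time), expand_as
# broadcasts it over channels, and masked_fill_ zeroes the padded
# frames in place.
x.masked_fill_(src_key_padding_mask.unsqueeze(1).expand_as(x), 0.0)
assert (x[1, :, 3:] == 0).all()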

View File

@@ -248,7 +248,9 @@ class ConformerEncoderLayer(nn.Module):
         residual = src
         if self.normalize_before:
             src = self.norm_conv(src)
-        src = residual + self.dropout(self.conv_module(src))
+        src = residual + self.dropout(
+            self.conv_module(src, src_key_padding_mask=src_key_padding_mask)
+        )
         if not self.normalize_before:
             src = self.norm_conv(src)
@@ -879,11 +881,16 @@ class ConvolutionModule(nn.Module):
         )
         self.activation = Swish()

-    def forward(self, x: Tensor) -> Tensor:
+    def forward(
+        self,
+        x: Tensor,
+        src_key_padding_mask: Optional[Tensor] = None,
+    ) -> Tensor:
         """Compute convolution module.

         Args:
             x: Input tensor (#time, batch, channels).
+            src_key_padding_mask: the mask for the src keys per batch (optional).

         Returns:
             Tensor: Output tensor (#time, batch, channels).
@@ -897,6 +904,8 @@ class ConvolutionModule(nn.Module):
         x = nn.functional.glu(x, dim=1)  # (batch, channels, time)

         # 1D Depthwise Conv
+        if src_key_padding_mask is not None:
+            x.masked_fill_(src_key_padding_mask.unsqueeze(1).expand_as(x), 0.0)
         x = self.depthwise_conv(x)
         x = self.activation(self.norm(x))
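
For context at the call sites: src_key_padding_mask is the usual (batch, time) boolean mask with True at padded positions, built from utterance lengths (icefall keeps a make_pad_mask helper for this; the version below is a minimal illustrative stand-in, not the library's code):

import torch

def make_pad_mask(lengths: torch.Tensor, max_len: int) -> torch.Tensor:
    # True at padded positions: position index >= utterance length.
    steps = torch.arange(max_len, device=lengths.device)
    return steps.unsqueeze(0) >= lengths.unsqueeze(1)

lengths = torch.tensor([6, 3])
mask = make_pad_mask(lengths, max_len=6)  # shape (2, 6); mask[1, 3:] is True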

View File

@@ -248,7 +248,9 @@ class ConformerEncoderLayer(nn.Module):
         residual = src
         if self.normalize_before:
             src = self.norm_conv(src)
-        src = residual + self.dropout(self.conv_module(src))
+        src = residual + self.dropout(
+            self.conv_module(src, src_key_padding_mask=src_key_padding_mask)
+        )
         if not self.normalize_before:
             src = self.norm_conv(src)
@@ -879,11 +881,16 @@ class ConvolutionModule(nn.Module):
         )
         self.activation = Swish()

-    def forward(self, x: Tensor) -> Tensor:
+    def forward(
+        self,
+        x: Tensor,
+        src_key_padding_mask: Optional[Tensor] = None,
+    ) -> Tensor:
         """Compute convolution module.

         Args:
             x: Input tensor (#time, batch, channels).
+            src_key_padding_mask: the mask for the src keys per batch (optional).

         Returns:
             Tensor: Output tensor (#time, batch, channels).
@@ -897,6 +904,8 @@ class ConvolutionModule(nn.Module):
         x = nn.functional.glu(x, dim=1)  # (batch, channels, time)

         # 1D Depthwise Conv
+        if src_key_padding_mask is not None:
+            x.masked_fill_(src_key_padding_mask.unsqueeze(1).expand_as(x), 0.0)
         x = self.depthwise_conv(x)
         x = self.activation(self.norm(x))

View File

@@ -246,7 +246,9 @@ class ConformerEncoderLayer(nn.Module):
         residual = src
         if self.normalize_before:
             src = self.norm_conv(src)
-        src = residual + self.dropout(self.conv_module(src))
+        src = residual + self.dropout(
+            self.conv_module(src, src_key_padding_mask=src_key_padding_mask)
+        )
         if not self.normalize_before:
             src = self.norm_conv(src)
@@ -877,11 +879,16 @@ class ConvolutionModule(nn.Module):
         )
         self.activation = Swish()

-    def forward(self, x: Tensor) -> Tensor:
+    def forward(
+        self,
+        x: Tensor,
+        src_key_padding_mask: Optional[Tensor] = None,
+    ) -> Tensor:
         """Compute convolution module.

         Args:
             x: Input tensor (#time, batch, channels).
+            src_key_padding_mask: the mask for the src keys per batch (optional).

         Returns:
             Tensor: Output tensor (#time, batch, channels).
@@ -895,6 +902,8 @@ class ConvolutionModule(nn.Module):
         x = nn.functional.glu(x, dim=1)  # (batch, channels, time)

         # 1D Depthwise Conv
+        if src_key_padding_mask is not None:
+            x.masked_fill_(src_key_padding_mask.unsqueeze(1).expand_as(x), 0.0)
         x = self.depthwise_conv(x)
         # x is (batch, channels, time)
         x = x.permute(0, 2, 1)

View File

@@ -253,7 +253,9 @@ class ConformerEncoderLayer(nn.Module):
         residual = src
         if self.normalize_before:
             src = self.norm_conv(src)
-        src = residual + self.dropout(self.conv_module(src))
+        src = residual + self.dropout(
+            self.conv_module(src, src_key_padding_mask=src_key_padding_mask)
+        )
         if not self.normalize_before:
             src = self.norm_conv(src)
@@ -890,11 +892,16 @@ class ConvolutionModule(nn.Module):
         )
         self.activation = Swish()

-    def forward(self, x: Tensor) -> Tensor:
+    def forward(
+        self,
+        x: Tensor,
+        src_key_padding_mask: Optional[Tensor] = None,
+    ) -> Tensor:
         """Compute convolution module.

         Args:
             x: Input tensor (#time, batch, channels).
+            src_key_padding_mask: the mask for the src keys per batch (optional).

         Returns:
             Tensor: Output tensor (#time, batch, channels).
@@ -908,6 +915,8 @@ class ConvolutionModule(nn.Module):
         x = nn.functional.glu(x, dim=1)  # (batch, channels, time)

         # 1D Depthwise Conv
+        if src_key_padding_mask is not None:
+            x.masked_fill_(src_key_padding_mask.unsqueeze(1).expand_as(x), 0.0)
         x = self.depthwise_conv(x)
         if self.use_batchnorm:
             x = self.norm(x)

View File

@@ -253,7 +253,9 @@ class ConformerEncoderLayer(nn.Module):
         residual = src
         if self.normalize_before:
             src = self.norm_conv(src)
-        src = residual + self.dropout(self.conv_module(src))
+        src = residual + self.dropout(
+            self.conv_module(src, src_key_padding_mask=src_key_padding_mask)
+        )
         if not self.normalize_before:
             src = self.norm_conv(src)
@@ -890,11 +892,16 @@ class ConvolutionModule(nn.Module):
         )
         self.activation = Swish()

-    def forward(self, x: Tensor) -> Tensor:
+    def forward(
+        self,
+        x: Tensor,
+        src_key_padding_mask: Optional[Tensor] = None,
+    ) -> Tensor:
         """Compute convolution module.

         Args:
             x: Input tensor (#time, batch, channels).
+            src_key_padding_mask: the mask for the src keys per batch (optional).

         Returns:
             Tensor: Output tensor (#time, batch, channels).
@@ -908,6 +915,8 @@ class ConvolutionModule(nn.Module):
         x = nn.functional.glu(x, dim=1)  # (batch, channels, time)

         # 1D Depthwise Conv
+        if src_key_padding_mask is not None:
+            x.masked_fill_(src_key_padding_mask.unsqueeze(1).expand_as(x), 0.0)
         x = self.depthwise_conv(x)
         if self.use_batchnorm:
             x = self.norm(x)

View File

@@ -268,7 +268,9 @@ class ConformerEncoderLayer(nn.Module):
         src = src + self.dropout(src_att)

         # convolution module
-        src = src + self.dropout(self.conv_module(src))
+        src = src + self.dropout(
+            self.conv_module(src, src_key_padding_mask=src_key_padding_mask)
+        )

         # feed forward module
         src = src + self.dropout(self.feed_forward(src))
@@ -921,11 +923,16 @@ class ConvolutionModule(nn.Module):
             initial_scale=0.25,
         )

-    def forward(self, x: Tensor) -> Tensor:
+    def forward(
+        self,
+        x: Tensor,
+        src_key_padding_mask: Optional[Tensor] = None,
+    ) -> Tensor:
         """Compute convolution module.

         Args:
             x: Input tensor (#time, batch, channels).
+            src_key_padding_mask: the mask for the src keys per batch (optional).

         Returns:
             Tensor: Output tensor (#time, batch, channels).
@@ -941,6 +948,8 @@ class ConvolutionModule(nn.Module):
         x = nn.functional.glu(x, dim=1)  # (batch, channels, time)

         # 1D Depthwise Conv
+        if src_key_padding_mask is not None:
+            x.masked_fill_(src_key_padding_mask.unsqueeze(1).expand_as(x), 0.0)
         x = self.depthwise_conv(x)

         x = self.deriv_balancer2(x)

View File

@@ -247,7 +247,9 @@ class ConformerEncoderLayer(nn.Module):
         residual = src
         if self.normalize_before:
             src = self.norm_conv(src)
-        src = residual + self.dropout(self.conv_module(src))
+        src = residual + self.dropout(
+            self.conv_module(src, src_key_padding_mask=src_key_padding_mask)
+        )
         if not self.normalize_before:
             src = self.norm_conv(src)
@@ -878,11 +880,16 @@ class ConvolutionModule(nn.Module):
         )
         self.activation = Swish()

-    def forward(self, x: Tensor) -> Tensor:
+    def forward(
+        self,
+        x: Tensor,
+        src_key_padding_mask: Optional[Tensor] = None,
+    ) -> Tensor:
         """Compute convolution module.

         Args:
             x: Input tensor (#time, batch, channels).
+            src_key_padding_mask: the mask for the src keys per batch (optional).

         Returns:
             Tensor: Output tensor (#time, batch, channels).
@@ -896,6 +903,8 @@ class ConvolutionModule(nn.Module):
         x = nn.functional.glu(x, dim=1)  # (batch, channels, time)

         # 1D Depthwise Conv
+        if src_key_padding_mask is not None:
+            x.masked_fill_(src_key_padding_mask.unsqueeze(1).expand_as(x), 0.0)
         x = self.depthwise_conv(x)
         x = self.activation(self.norm(x))

View File

@@ -527,7 +527,9 @@ class ConformerEncoderLayer(nn.Module):
         src = src + self.dropout(src_att)

         # convolution module
-        conv, _ = self.conv_module(src)
+        conv, _ = self.conv_module(
+            src, src_key_padding_mask=src_key_padding_mask
+        )
         src = src + self.dropout(conv)

         # feed forward module
@@ -1457,6 +1459,7 @@ class ConvolutionModule(nn.Module):
         x: Tensor,
         cache: Optional[Tensor] = None,
         right_context: int = 0,
+        src_key_padding_mask: Optional[Tensor] = None,
     ) -> Tuple[Tensor, Tensor]:
         """Compute convolution module.
@@ -1467,6 +1470,7 @@ class ConvolutionModule(nn.Module):
             right_context:
                 How many future frames the attention can see in current chunk.
                 Note: It's not that each individual frame has `right_context` frames
                 of right context, some have more.
+            src_key_padding_mask: the mask for the src keys per batch (optional).

         Returns:
@@ -1486,6 +1490,8 @@ class ConvolutionModule(nn.Module):
         x = nn.functional.glu(x, dim=1)  # (batch, channels, time)

         # 1D Depthwise Conv
+        if src_key_padding_mask is not None:
+            x.masked_fill_(src_key_padding_mask.unsqueeze(1).expand_as(x), 0.0)
         if self.causal and self.lorder > 0:
             if cache is None:
                 # Make depthwise_conv causal by
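
In the streaming variants here and below, forward also carries cache and right_context, and the masking is applied before the causal left-padding/cache handling, so the conv input is already clean when the left context is attached. A rough sketch of that ordering under simplified assumptions (no cache, lorder frames of causal left padding; hypothetical sizes, not the recipe's exact code):

import torch
import torch.nn as nn

channels, lorder = 4, 2
depthwise_conv = nn.Conv1d(
    channels, channels, kernel_size=lorder + 1, groups=channels
)

x = torch.randn(1, channels, 8)  # (batch, channels, time)
src_key_padding_mask = torch.zeros(1, 8, dtype=torch.bool)
src_key_padding_mask[0, 6:] = True  # last two frames are padding

# 1. Zero the padded frames first, as in the diff.
x.masked_fill_(src_key_padding_mask.unsqueeze(1).expand_as(x), 0.0)

# 2. Then make the conv causal by left-padding (the cache-is-None case).
x = nn.functional.pad(x, (lorder, 0), value=0.0)
y = depthwise_conv(x)  # (1, channels, 8)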

View File

@@ -527,7 +527,9 @@ class ConformerEncoderLayer(nn.Module):
         src = src + self.dropout(src_att)

         # convolution module
-        conv, _ = self.conv_module(src)
+        conv, _ = self.conv_module(
+            src, src_key_padding_mask=src_key_padding_mask
+        )
         src = src + self.dropout(conv)

         # feed forward module
@@ -1436,7 +1438,11 @@ class ConvolutionModule(nn.Module):
         )

     def forward(
-        self, x: Tensor, cache: Optional[Tensor] = None, right_context: int = 0
+        self,
+        x: Tensor,
+        cache: Optional[Tensor] = None,
+        right_context: int = 0,
+        src_key_padding_mask: Optional[Tensor] = None,
     ) -> Tuple[Tensor, Tensor]:
         """Compute convolution module.
@@ -1448,6 +1454,7 @@ class ConvolutionModule(nn.Module):
                 How many future frames the attention can see in current chunk.
                 Note: It's not that each individual frame has `right_context` frames
                 of right context, some have more.
+            src_key_padding_mask: the mask for the src keys per batch (optional).

         Returns:
             Tensor: Output tensor (#time, batch, channels).
@@ -1466,6 +1473,8 @@ class ConvolutionModule(nn.Module):
         x = nn.functional.glu(x, dim=1)  # (batch, channels, time)

         # 1D Depthwise Conv
+        if src_key_padding_mask is not None:
+            x.masked_fill_(src_key_padding_mask.unsqueeze(1).expand_as(x), 0.0)
         if self.causal and self.lorder > 0:
             if cache is None:
                 # Make depthwise_conv causal by

View File

@@ -264,7 +264,9 @@ class ConformerEncoderLayer(nn.Module):
         src = src + self.dropout(src_att)

         # convolution module
-        src = src + self.dropout(self.conv_module(src))
+        src = src + self.dropout(
+            self.conv_module(src, src_key_padding_mask=src_key_padding_mask)
+        )

         # feed forward module
         src = src + self.dropout(self.feed_forward(src))
@@ -927,11 +929,16 @@ class ConvolutionModule(nn.Module):
             initial_scale=0.25,
         )

-    def forward(self, x: Tensor) -> Tensor:
+    def forward(
+        self,
+        x: Tensor,
+        src_key_padding_mask: Optional[Tensor] = None,
+    ) -> Tensor:
         """Compute convolution module.

         Args:
             x: Input tensor (#time, batch, channels).
+            src_key_padding_mask: the mask for the src keys per batch (optional).

         Returns:
             Tensor: Output tensor (#time, batch, channels).
@@ -947,6 +954,8 @@ class ConvolutionModule(nn.Module):
         x = nn.functional.glu(x, dim=1)  # (batch, channels, time)

         # 1D Depthwise Conv
+        if src_key_padding_mask is not None:
+            x.masked_fill_(src_key_padding_mask.unsqueeze(1).expand_as(x), 0.0)
         x = self.depthwise_conv(x)

         x = self.deriv_balancer2(x)

View File

@@ -514,7 +514,9 @@ class ConformerEncoderLayer(nn.Module):
         if self.normalize_before:
             src = self.norm_conv(src)

-        src, _ = self.conv_module(src)
+        src, _ = self.conv_module(
+            src, src_key_padding_mask=src_key_padding_mask
+        )
         src = residual + self.dropout(src)

         if not self.normalize_before:
@@ -1383,11 +1385,18 @@ class ConvolutionModule(nn.Module):
         x: Tensor,
         cache: Optional[Tensor] = None,
         right_context: int = 0,
+        src_key_padding_mask: Optional[Tensor] = None,
     ) -> Tuple[Tensor, Tensor]:
         """Compute convolution module.

         Args:
             x: Input tensor (#time, batch, channels).
+            cache: The cache of depthwise_conv, only used in real streaming
+                decoding.
+            right_context:
+                How many future frames the attention can see in current chunk.
+                Note: It's not that each individual frame has `right_context` frames
+                of right context, some have more.
+            src_key_padding_mask: the mask for the src keys per batch (optional).

         Returns:
             Tensor: Output tensor (#time, batch, channels).
@@ -1401,6 +1410,8 @@ class ConvolutionModule(nn.Module):
         x = nn.functional.glu(x, dim=1)  # (batch, channels, time)

         # 1D Depthwise Conv
+        if src_key_padding_mask is not None:
+            x.masked_fill_(src_key_padding_mask.unsqueeze(1).expand_as(x), 0.0)
         if self.causal and self.lorder > 0:
             if cache is None:
                 # Make depthwise_conv causal by

View File

@@ -520,7 +520,9 @@ class ConformerEncoderLayer(nn.Module):
         src = src + self.dropout(src_att)

         # convolution module
-        conv, _ = self.conv_module(src)
+        conv, _ = self.conv_module(
+            src, src_key_padding_mask=src_key_padding_mask
+        )
         src = src + self.dropout(conv)

         # feed forward module
@@ -1392,6 +1394,7 @@ class ConvolutionModule(nn.Module):
         x: Tensor,
         cache: Optional[Tensor] = None,
         right_context: int = 0,
+        src_key_padding_mask: Optional[Tensor] = None,
     ) -> Tuple[Tensor, Tensor]:
         """Compute convolution module.

         Args:
@@ -1402,6 +1405,7 @@ class ConvolutionModule(nn.Module):
                 How many future frames the attention can see in current chunk.
                 Note: It's not that each individual frame has `right_context` frames
                 of right context, some have more.
+            src_key_padding_mask: the mask for the src keys per batch (optional).

         Returns:
             If cache is None return the output tensor (#time, batch, channels).
             If cache is not None, return a tuple of Tensor, the first one is
@@ -1418,6 +1422,8 @@ class ConvolutionModule(nn.Module):
         x = nn.functional.glu(x, dim=1)  # (batch, channels, time)

         # 1D Depthwise Conv
+        if src_key_padding_mask is not None:
+            x.masked_fill_(src_key_padding_mask.unsqueeze(1).expand_as(x), 0.0)
         if self.causal and self.lorder > 0:
             if cache is None:
                 # Make depthwise_conv causal by
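
Why the mask is needed at this one spot only: the pointwise 1x1 convolutions and the GLU act frame by frame, so junk in padded frames stays in padded frames, but the depthwise convolution has a temporal kernel and would smear padded values into the last valid frames. A toy demonstration of the leakage (illustrative numbers only):

import torch
import torch.nn as nn

conv = nn.Conv1d(1, 1, kernel_size=3, padding=1, bias=False)
nn.init.constant_(conv.weight, 1.0)

x = torch.zeros(1, 1, 6)
x[0, 0, 4:] = 100.0  # junk in the two padded frames

with torch.no_grad():
    print(conv(x)[0, 0, :4])  # frame 3 already contains the junk

x[0, 0, 4:] = 0.0  # masking the padding keeps valid frames clean
with torch.no_grad():
    print(conv(x)[0, 0, :4])  # all zeros again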