edit comments (#852)

Teo Wen Shen 2023-01-28 14:47:21 +09:00 committed by GitHub
parent 6b1ab71dc9
commit 1ce2bc1ee0
2 changed files with 18 additions and 18 deletions

File 1 of 2

@@ -197,13 +197,13 @@ class Zipformer(EncoderInterface):
"""
In eval mode, returns [1.0] * num_encoders; in training mode, returns a number of
randomized feature masks, one per encoder.
- On e.g. 15% of frames, these masks will zero out all enocder dims larger than
+ On e.g. 15% of frames, these masks will zero out all encoder dims larger than
some supplied number, e.g. >256, so in effect on those frames we are using
- a smaller encoer dim.
+ a smaller encoder dim.
We generate the random masks at this level because we want the 2 masks to 'agree'
all the way up the encoder stack. This will mean that the 1st mask will have
- mask values repeated self.zipformer_subsampling_factor times.
+ mask values repeated self.zipformer_downsampling_factors times.
Args:
x: the embeddings (needed for the shape and dtype and device), of shape
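The docstring above describes Zipformer's training-time feature masks: on roughly 15% of frames, every encoder dimension above some threshold is zeroed out, so those frames effectively run with a smaller encoder dim. Below is a minimal sketch of that idea, not the icefall implementation; `keep_dim` and `frame_mask_prob` are illustrative names, and the per-encoder repetition by `zipformer_downsampling_factors` is omitted.

```python
import torch

def make_feature_mask(
    x: torch.Tensor, keep_dim: int = 256, frame_mask_prob: float = 0.15
) -> torch.Tensor:
    """x: embeddings of shape (num_frames, batch_size, encoder_dim)."""
    num_frames, batch_size, encoder_dim = x.shape
    # 1.0 where a frame keeps its full width, 0.0 on the ~15% of frames
    # whose upper dimensions get zeroed out.
    frame_mask = (
        torch.rand(num_frames, batch_size, 1, device=x.device) > frame_mask_prob
    ).to(x.dtype)
    feature_mask = torch.ones(
        num_frames, batch_size, encoder_dim, dtype=x.dtype, device=x.device
    )
    # Dims below keep_dim are always kept; dims >= keep_dim are zeroed on masked frames.
    feature_mask[:, :, keep_dim:] = frame_mask
    return feature_mask

# During training, x * make_feature_mask(x) would be fed onward; in eval mode
# the mask is simply 1.0 everywhere, as the docstring states.
```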
@@ -1009,10 +1009,10 @@ class RelPositionMultiheadAttention(nn.Module):
# the initial_scale is supposed to take over the "scaling" factor of
# head_dim ** -0.5, dividing it between the query and key.
in_proj_dim = (
- 2 * attention_dim
- + attention_dim // 2  # query, key
- + pos_dim * num_heads  # value
- )  # positional encoding query
+ 2 * attention_dim  # query, key
+ + attention_dim // 2  # value
+ + pos_dim * num_heads  # positional encoding query
+ )
self.in_proj = ScaledLinear(
embed_dim, in_proj_dim, bias=True, initial_scale=self.head_dim**-0.25
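The corrected comments spell out how `in_proj_dim` is apportioned: `2 * attention_dim` for query and key, `attention_dim // 2` for the value, and `pos_dim * num_heads` for the positional-encoding query. A small arithmetic sketch with illustrative numbers; the slicing order shown in the trailing comments is an assumption for illustration, not taken from the diff.

```python
attention_dim = 512
num_heads = 8
pos_dim = 4  # illustrative value

in_proj_dim = (
    2 * attention_dim        # query, key
    + attention_dim // 2     # value
    + pos_dim * num_heads    # positional encoding query
)  # = 1024 + 256 + 32 = 1312

# The single projection output would then be split along the last axis, e.g.:
# proj = self.in_proj(x)                     # (..., in_proj_dim)
# q = proj[..., :attention_dim]
# k = proj[..., attention_dim : 2 * attention_dim]
# v = proj[..., 2 * attention_dim : 2 * attention_dim + attention_dim // 2]
# p = proj[..., 2 * attention_dim + attention_dim // 2 :]
```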
@@ -1509,7 +1509,7 @@ class FeedforwardModule(nn.Module):
class ConvolutionModule(nn.Module):
"""ConvolutionModule in Zipformer model.
- Modified from https://github.com/espnet/espnet/blob/master/espnet/nets/pytorch_backend/zipformer/convolution.py
+ Modified from https://github.com/espnet/espnet/blob/master/espnet/nets/pytorch_backend/conformer/convolution.py
Args:
channels (int): The number of channels of conv layers.

File 2 of 2

@@ -421,13 +421,13 @@ class Zipformer(EncoderInterface):
"""
In eval mode, returns [1.0] * num_encoders; in training mode, returns a number of
randomized feature masks, one per encoder.
- On e.g. 15% of frames, these masks will zero out all enocder dims larger than
+ On e.g. 15% of frames, these masks will zero out all encoder dims larger than
some supplied number, e.g. >256, so in effect on those frames we are using
- a smaller encoer dim.
+ a smaller encoder dim.
We generate the random masks at this level because we want the 2 masks to 'agree'
all the way up the encoder stack. This will mean that the 1st mask will have
- mask values repeated self.zipformer_subsampling_factor times.
+ mask values repeated self.zipformer_downsampling_factors times.
Args:
x: the embeddings (needed for the shape and dtype and device), of shape
@@ -1687,8 +1687,8 @@ class RelPositionalEncoding(torch.nn.Module):
if self.pe.dtype != x.dtype or str(self.pe.device) != str(x.device):
self.pe = self.pe.to(dtype=x.dtype, device=x.device)
return
- # Suppose `i` means to the position of query vecotr and `j` means the
- # position of key vector. We use position relative positions when keys
+ # Suppose `i` means to the position of query vector and `j` means the
+ # position of key vector. We use positive relative positions when keys
# are to the left (i>j) and negative relative positions otherwise (i<j).
pe_positive = torch.zeros(x_size_left, self.d_model)
pe_negative = torch.zeros(x_size_left, self.d_model)
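The corrected comment fixes the sign convention: the relative position `i - j` is positive when the key is to the left of the query, negative otherwise. For context, a rough sketch of the standard ESPnet-style sinusoidal construction that `pe_positive` / `pe_negative` feed into, assuming an even `d_model` and with `length` standing in for `x_size_left`; this is not necessarily the exact icefall code.

```python
import math
import torch

def rel_sinusoidal_pe(length: int, d_model: int) -> torch.Tensor:
    position = torch.arange(0, length, dtype=torch.float32).unsqueeze(1)
    div_term = torch.exp(
        torch.arange(0, d_model, 2, dtype=torch.float32)
        * -(math.log(10000.0) / d_model)
    )
    pe_positive = torch.zeros(length, d_model)
    pe_negative = torch.zeros(length, d_model)
    # Positive relative positions (key to the left of the query, i > j).
    pe_positive[:, 0::2] = torch.sin(position * div_term)
    pe_positive[:, 1::2] = torch.cos(position * div_term)
    # Negative relative positions (key to the right of the query, i < j).
    pe_negative[:, 0::2] = torch.sin(-1 * position * div_term)
    pe_negative[:, 1::2] = torch.cos(-1 * position * div_term)
    # Flip the positive part so relative positions run from (length-1) down to
    # -(length-1), then concatenate; index 0 holds the largest i - j.
    pe_positive = torch.flip(pe_positive, [0]).unsqueeze(0)
    pe_negative = pe_negative[1:].unsqueeze(0)
    return torch.cat([pe_positive, pe_negative], dim=1)  # (1, 2*length-1, d_model)
```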
@@ -1778,10 +1778,10 @@ class RelPositionMultiheadAttention(nn.Module):
# the initial_scale is supposed to take over the "scaling" factor of
# head_dim ** -0.5, dividing it between the query and key.
in_proj_dim = (
- 2 * attention_dim
- + attention_dim // 2  # query, key
- + pos_dim * num_heads  # value
- )  # positional encoding query
+ 2 * attention_dim  # query, key
+ + attention_dim // 2  # value
+ + pos_dim * num_heads  # positional encoding query
+ )
self.in_proj = ScaledLinear(
embed_dim, in_proj_dim, bias=True, initial_scale=self.head_dim**-0.25
@@ -2536,7 +2536,7 @@ class FeedforwardModule(nn.Module):
class ConvolutionModule(nn.Module):
"""ConvolutionModule in Zipformer model.
- Modified from https://github.com/espnet/espnet/blob/master/espnet/nets/pytorch_backend/zipformer/convolution.py
+ Modified from https://github.com/espnet/espnet/blob/master/espnet/nets/pytorch_backend/conformer/convolution.py
Args:
channels (int): The number of channels of conv layers.