Rename conformer.py to zipformer.py

This commit is contained in:
Daniel Povey 2022-10-27 22:41:48 +08:00
parent be5c687fbd
commit 3f05e47447
2 changed files with 27 additions and 25 deletions

View File

@ -59,7 +59,7 @@ import torch
import torch.multiprocessing as mp import torch.multiprocessing as mp
import torch.nn as nn import torch.nn as nn
from asr_datamodule import LibriSpeechAsrDataModule from asr_datamodule import LibriSpeechAsrDataModule
from conformer import Conformer from zipformer import Zipformer
from decoder import Decoder from decoder import Decoder
from joiner import Joiner from joiner import Joiner
from lhotse.cut import Cut from lhotse.cut import Cut

View File

@ -43,7 +43,7 @@ from torch import Tensor, nn
from icefall.utils import make_pad_mask from icefall.utils import make_pad_mask
class Conformer(EncoderInterface): class Zipformer(EncoderInterface):
""" """
Args: Args:
num_features (int): Number of input features num_features (int): Number of input features
@ -74,7 +74,7 @@ class Conformer(EncoderInterface):
cnn_module_kernel: Tuple[int] = (31, 31), cnn_module_kernel: Tuple[int] = (31, 31),
warmup_batches: float = 4000.0, warmup_batches: float = 4000.0,
) -> None: ) -> None:
super(Conformer, self).__init__() super(Zipformer, self).__init__()
self.num_features = num_features self.num_features = num_features
self.subsampling_factor = subsampling_factor self.subsampling_factor = subsampling_factor
@ -96,7 +96,7 @@ class Conformer(EncoderInterface):
self.encoder_embed = Conv2dSubsampling(num_features, d_model[0], self.encoder_embed = Conv2dSubsampling(num_features, d_model[0],
dropout=dropout) dropout=dropout)
encoder_layer1 = ConformerEncoderLayer( encoder_layer1 = ZipformerEncoderLayer(
d_model[0], d_model[0],
attention_dim[0], attention_dim[0],
nhead[0], nhead[0],
@ -108,14 +108,14 @@ class Conformer(EncoderInterface):
# for the first third of the warmup period, we let the Conv2dSubsampling # for the first third of the warmup period, we let the Conv2dSubsampling
# layer learn something. then start warmup up the first and then the second # layer learn something. then start warmup up the first and then the second
# encoder. # encoder.
self.encoder1 = ConformerEncoder( self.encoder1 = ZipformerEncoder(
encoder_layer1, encoder_layer1,
num_encoder_layers[0], num_encoder_layers[0],
dropout, dropout,
warmup_begin=warmup_batches / 3, warmup_begin=warmup_batches / 3,
warmup_end=warmup_batches * 2 / 3, warmup_end=warmup_batches * 2 / 3,
) )
encoder_layer2 = ConformerEncoderLayer( encoder_layer2 = ZipformerEncoderLayer(
d_model[1], d_model[1],
attention_dim[1], attention_dim[1],
nhead[1], nhead[1],
@ -124,8 +124,8 @@ class Conformer(EncoderInterface):
cnn_module_kernel[1], cnn_module_kernel[1],
) )
self.encoder2 = DownsampledConformerEncoder( self.encoder2 = DownsampledZipformerEncoder(
ConformerEncoder( ZipformerEncoder(
encoder_layer2, encoder_layer2,
num_encoder_layers[1], num_encoder_layers[1],
dropout, dropout,
@ -237,10 +237,10 @@ class Conformer(EncoderInterface):
return x, lengths return x, lengths
class ConformerEncoderLayer(nn.Module): class ZipformerEncoderLayer(nn.Module):
""" """
ConformerEncoderLayer is made up of self-attn, feedforward and convolution networks. ZipformerEncoderLayer is made up of self-attn, feedforward and convolution networks.
See: "Conformer: Convolution-augmented Transformer for Speech Recognition" See: "Conformer: Convolution-augmented Transformer for Speech Recognition"
Args: Args:
d_model: the number of expected features in the input (required). d_model: the number of expected features in the input (required).
@ -250,7 +250,7 @@ class ConformerEncoderLayer(nn.Module):
cnn_module_kernel (int): Kernel size of convolution module. cnn_module_kernel (int): Kernel size of convolution module.
Examples:: Examples::
>>> encoder_layer = ConformerEncoderLayer(d_model=512, nhead=8) >>> encoder_layer = ZipformerEncoderLayer(d_model=512, nhead=8)
>>> src = torch.rand(10, 32, 512) >>> src = torch.rand(10, 32, 512)
>>> pos_emb = torch.rand(32, 19, 512) >>> pos_emb = torch.rand(32, 19, 512)
>>> out = encoder_layer(src, pos_emb) >>> out = encoder_layer(src, pos_emb)
@ -264,7 +264,7 @@ class ConformerEncoderLayer(nn.Module):
dropout: float = 0.1, dropout: float = 0.1,
cnn_module_kernel: int = 31, cnn_module_kernel: int = 31,
) -> None: ) -> None:
super(ConformerEncoderLayer, self).__init__() super(ZipformerEncoderLayer, self).__init__()
self.d_model = d_model self.d_model = d_model
@ -371,16 +371,16 @@ class ConformerEncoderLayer(nn.Module):
return self.whiten(src) return self.whiten(src)
class ConformerEncoder(nn.Module): class ZipformerEncoder(nn.Module):
r"""ConformerEncoder is a stack of N encoder layers r"""ZipformerEncoder is a stack of N encoder layers
Args: Args:
encoder_layer: an instance of the ConformerEncoderLayer() class (required). encoder_layer: an instance of the ZipformerEncoderLayer() class (required).
num_layers: the number of sub-encoder-layers in the encoder (required). num_layers: the number of sub-encoder-layers in the encoder (required).
Examples:: Examples::
>>> encoder_layer = ConformerEncoderLayer(d_model=512, nhead=8) >>> encoder_layer = ZipformerEncoderLayer(d_model=512, nhead=8)
>>> conformer_encoder = ConformerEncoder(encoder_layer, num_layers=6) >>> zipformer_encoder = ZipformerEncoder(encoder_layer, num_layers=6)
>>> src = torch.rand(10, 32, 512) >>> src = torch.rand(10, 32, 512)
>>> out = conformer_encoder(src) >>> out = zipformer_encoder(src)
""" """
@ -553,9 +553,9 @@ class ConformerEncoder(nn.Module):
return output return output
class DownsampledConformerEncoder(nn.Module): class DownsampledZipformerEncoder(nn.Module):
r""" r"""
DownsampledConformerEncoder is a conformer encoder evaluated at a reduced frame rate, DownsampledZipformerEncoder is a zipformer encoder evaluated at a reduced frame rate,
after convolutional downsampling, and then upsampled again at the output after convolutional downsampling, and then upsampled again at the output
so that the output has the same shape as the input. so that the output has the same shape as the input.
""" """
@ -564,7 +564,7 @@ class DownsampledConformerEncoder(nn.Module):
input_dim: int, input_dim: int,
output_dim: int, output_dim: int,
downsample: int): downsample: int):
super(DownsampledConformerEncoder, self).__init__() super(DownsampledZipformerEncoder, self).__init__()
self.downsample_factor = downsample self.downsample_factor = downsample
self.downsample = AttentionDownsample(input_dim, output_dim, downsample) self.downsample = AttentionDownsample(input_dim, output_dim, downsample)
self.encoder = encoder self.encoder = encoder
@ -833,7 +833,9 @@ class RelPositionalEncoding(torch.nn.Module):
class RelPositionMultiheadAttention(nn.Module): class RelPositionMultiheadAttention(nn.Module):
r"""Multi-Head Attention layer with relative position encoding r"""Multi-Head Attention layer with relative position encoding
See reference: "Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context" This is quite heavily modified from: "Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context",
we have to write up the differences.
Args: Args:
embed_dim: total dimension of the model. embed_dim: total dimension of the model.
@ -1268,7 +1270,7 @@ class RelPositionMultiheadAttention(nn.Module):
class FeedforwardModule(nn.Module): class FeedforwardModule(nn.Module):
"""Feedforward module in Conformer model. """Feedforward module in Zipformer model.
""" """
def __init__(self, def __init__(self,
d_model: int, d_model: int,
@ -1295,7 +1297,7 @@ class FeedforwardModule(nn.Module):
class ConvolutionModule(nn.Module): class ConvolutionModule(nn.Module):
"""ConvolutionModule in Conformer model. """ConvolutionModule in Zipformer model.
Modified from https://github.com/espnet/espnet/blob/master/espnet/nets/pytorch_backend/conformer/convolution.py Modified from https://github.com/espnet/espnet/blob/master/espnet/nets/pytorch_backend/conformer/convolution.py
Args: Args:
@ -1639,7 +1641,7 @@ def _test_conformer_main():
feature_dim = 50 feature_dim = 50
# Just make sure the forward pass runs. # Just make sure the forward pass runs.
c = Conformer( c = Zipformer(
num_features=feature_dim, d_model=(64,96), encoder_unmasked_dim=64, nhead=(4,4) num_features=feature_dim, d_model=(64,96), encoder_unmasked_dim=64, nhead=(4,4)
) )
batch_size = 5 batch_size = 5