Rename conformer.py to zipformer.py
This commit is contained in:
parent
be5c687fbd
commit
3f05e47447
@ -59,7 +59,7 @@ import torch
|
|||||||
import torch.multiprocessing as mp
|
import torch.multiprocessing as mp
|
||||||
import torch.nn as nn
|
import torch.nn as nn
|
||||||
from asr_datamodule import LibriSpeechAsrDataModule
|
from asr_datamodule import LibriSpeechAsrDataModule
|
||||||
from conformer import Conformer
|
from zipformer import Zipformer
|
||||||
from decoder import Decoder
|
from decoder import Decoder
|
||||||
from joiner import Joiner
|
from joiner import Joiner
|
||||||
from lhotse.cut import Cut
|
from lhotse.cut import Cut
|
||||||
|
|||||||
@ -43,7 +43,7 @@ from torch import Tensor, nn
|
|||||||
from icefall.utils import make_pad_mask
|
from icefall.utils import make_pad_mask
|
||||||
|
|
||||||
|
|
||||||
class Conformer(EncoderInterface):
|
class Zipformer(EncoderInterface):
|
||||||
"""
|
"""
|
||||||
Args:
|
Args:
|
||||||
num_features (int): Number of input features
|
num_features (int): Number of input features
|
||||||
@ -74,7 +74,7 @@ class Conformer(EncoderInterface):
|
|||||||
cnn_module_kernel: Tuple[int] = (31, 31),
|
cnn_module_kernel: Tuple[int] = (31, 31),
|
||||||
warmup_batches: float = 4000.0,
|
warmup_batches: float = 4000.0,
|
||||||
) -> None:
|
) -> None:
|
||||||
super(Conformer, self).__init__()
|
super(Zipformer, self).__init__()
|
||||||
|
|
||||||
self.num_features = num_features
|
self.num_features = num_features
|
||||||
self.subsampling_factor = subsampling_factor
|
self.subsampling_factor = subsampling_factor
|
||||||
@ -96,7 +96,7 @@ class Conformer(EncoderInterface):
|
|||||||
self.encoder_embed = Conv2dSubsampling(num_features, d_model[0],
|
self.encoder_embed = Conv2dSubsampling(num_features, d_model[0],
|
||||||
dropout=dropout)
|
dropout=dropout)
|
||||||
|
|
||||||
encoder_layer1 = ConformerEncoderLayer(
|
encoder_layer1 = ZipformerEncoderLayer(
|
||||||
d_model[0],
|
d_model[0],
|
||||||
attention_dim[0],
|
attention_dim[0],
|
||||||
nhead[0],
|
nhead[0],
|
||||||
@ -108,14 +108,14 @@ class Conformer(EncoderInterface):
|
|||||||
# for the first third of the warmup period, we let the Conv2dSubsampling
|
# for the first third of the warmup period, we let the Conv2dSubsampling
|
||||||
# layer learn something. then start warming up the first and then the second
|
# layer learn something. then start warming up the first and then the second
|
||||||
# encoder.
|
# encoder.
|
||||||
self.encoder1 = ConformerEncoder(
|
self.encoder1 = ZipformerEncoder(
|
||||||
encoder_layer1,
|
encoder_layer1,
|
||||||
num_encoder_layers[0],
|
num_encoder_layers[0],
|
||||||
dropout,
|
dropout,
|
||||||
warmup_begin=warmup_batches / 3,
|
warmup_begin=warmup_batches / 3,
|
||||||
warmup_end=warmup_batches * 2 / 3,
|
warmup_end=warmup_batches * 2 / 3,
|
||||||
)
|
)
|
||||||
encoder_layer2 = ConformerEncoderLayer(
|
encoder_layer2 = ZipformerEncoderLayer(
|
||||||
d_model[1],
|
d_model[1],
|
||||||
attention_dim[1],
|
attention_dim[1],
|
||||||
nhead[1],
|
nhead[1],
|
||||||
@ -124,8 +124,8 @@ class Conformer(EncoderInterface):
|
|||||||
cnn_module_kernel[1],
|
cnn_module_kernel[1],
|
||||||
|
|
||||||
)
|
)
|
||||||
self.encoder2 = DownsampledConformerEncoder(
|
self.encoder2 = DownsampledZipformerEncoder(
|
||||||
ConformerEncoder(
|
ZipformerEncoder(
|
||||||
encoder_layer2,
|
encoder_layer2,
|
||||||
num_encoder_layers[1],
|
num_encoder_layers[1],
|
||||||
dropout,
|
dropout,
|
||||||
@ -237,10 +237,10 @@ class Conformer(EncoderInterface):
|
|||||||
return x, lengths
|
return x, lengths
|
||||||
|
|
||||||
|
|
||||||
class ConformerEncoderLayer(nn.Module):
|
class ZipformerEncoderLayer(nn.Module):
|
||||||
"""
|
"""
|
||||||
ConformerEncoderLayer is made up of self-attn, feedforward and convolution networks.
|
ZipformerEncoderLayer is made up of self-attn, feedforward and convolution networks.
|
||||||
See: "Conformer: Convolution-augmented Transformer for Speech Recognition"
|
See: "Conformer: Convolution-augmented Transformer for Speech Recognition"
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
d_model: the number of expected features in the input (required).
|
d_model: the number of expected features in the input (required).
|
||||||
@ -250,7 +250,7 @@ class ConformerEncoderLayer(nn.Module):
|
|||||||
cnn_module_kernel (int): Kernel size of convolution module.
|
cnn_module_kernel (int): Kernel size of convolution module.
|
||||||
|
|
||||||
Examples::
|
Examples::
|
||||||
>>> encoder_layer = ConformerEncoderLayer(d_model=512, nhead=8)
|
>>> encoder_layer = ZipformerEncoderLayer(d_model=512, nhead=8)
|
||||||
>>> src = torch.rand(10, 32, 512)
|
>>> src = torch.rand(10, 32, 512)
|
||||||
>>> pos_emb = torch.rand(32, 19, 512)
|
>>> pos_emb = torch.rand(32, 19, 512)
|
||||||
>>> out = encoder_layer(src, pos_emb)
|
>>> out = encoder_layer(src, pos_emb)
|
||||||
@ -264,7 +264,7 @@ class ConformerEncoderLayer(nn.Module):
|
|||||||
dropout: float = 0.1,
|
dropout: float = 0.1,
|
||||||
cnn_module_kernel: int = 31,
|
cnn_module_kernel: int = 31,
|
||||||
) -> None:
|
) -> None:
|
||||||
super(ConformerEncoderLayer, self).__init__()
|
super(ZipformerEncoderLayer, self).__init__()
|
||||||
|
|
||||||
self.d_model = d_model
|
self.d_model = d_model
|
||||||
|
|
||||||
@ -371,16 +371,16 @@ class ConformerEncoderLayer(nn.Module):
|
|||||||
return self.whiten(src)
|
return self.whiten(src)
|
||||||
|
|
||||||
|
|
||||||
class ConformerEncoder(nn.Module):
|
class ZipformerEncoder(nn.Module):
|
||||||
r"""ConformerEncoder is a stack of N encoder layers
|
r"""ZipformerEncoder is a stack of N encoder layers
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
encoder_layer: an instance of the ConformerEncoderLayer() class (required).
|
encoder_layer: an instance of the ZipformerEncoderLayer() class (required).
|
||||||
num_layers: the number of sub-encoder-layers in the encoder (required).
|
num_layers: the number of sub-encoder-layers in the encoder (required).
|
||||||
|
|
||||||
Examples::
|
Examples::
|
||||||
>>> encoder_layer = ConformerEncoderLayer(d_model=512, nhead=8)
|
>>> encoder_layer = ZipformerEncoderLayer(d_model=512, nhead=8)
|
||||||
>>> conformer_encoder = ConformerEncoder(encoder_layer, num_layers=6)
|
>>> conformer_encoder = ZipformerEncoder(encoder_layer, num_layers=6)
|
||||||
>>> src = torch.rand(10, 32, 512)
|
>>> src = torch.rand(10, 32, 512)
|
||||||
>>> out = conformer_encoder(src)
|
>>> out = conformer_encoder(src)
|
||||||
"""
|
"""
|
||||||
@ -553,9 +553,9 @@ class ConformerEncoder(nn.Module):
|
|||||||
return output
|
return output
|
||||||
|
|
||||||
|
|
||||||
class DownsampledConformerEncoder(nn.Module):
|
class DownsampledZipformerEncoder(nn.Module):
|
||||||
r"""
|
r"""
|
||||||
DownsampledConformerEncoder is a conformer encoder evaluated at a reduced frame rate,
|
DownsampledZipformerEncoder is a zipformer encoder evaluated at a reduced frame rate,
|
||||||
after convolutional downsampling, and then upsampled again at the output
|
after convolutional downsampling, and then upsampled again at the output
|
||||||
so that the output has the same shape as the input.
|
so that the output has the same shape as the input.
|
||||||
"""
|
"""
|
||||||
@ -564,7 +564,7 @@ class DownsampledConformerEncoder(nn.Module):
|
|||||||
input_dim: int,
|
input_dim: int,
|
||||||
output_dim: int,
|
output_dim: int,
|
||||||
downsample: int):
|
downsample: int):
|
||||||
super(DownsampledConformerEncoder, self).__init__()
|
super(DownsampledZipformerEncoder, self).__init__()
|
||||||
self.downsample_factor = downsample
|
self.downsample_factor = downsample
|
||||||
self.downsample = AttentionDownsample(input_dim, output_dim, downsample)
|
self.downsample = AttentionDownsample(input_dim, output_dim, downsample)
|
||||||
self.encoder = encoder
|
self.encoder = encoder
|
||||||
@ -833,7 +833,9 @@ class RelPositionalEncoding(torch.nn.Module):
|
|||||||
class RelPositionMultiheadAttention(nn.Module):
|
class RelPositionMultiheadAttention(nn.Module):
|
||||||
r"""Multi-Head Attention layer with relative position encoding
|
r"""Multi-Head Attention layer with relative position encoding
|
||||||
|
|
||||||
See reference: "Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context"
|
This is quite heavily modified from "Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context";
|
||||||
|
the differences have yet to be written up.
|
||||||
|
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
embed_dim: total dimension of the model.
|
embed_dim: total dimension of the model.
|
||||||
@ -1268,7 +1270,7 @@ class RelPositionMultiheadAttention(nn.Module):
|
|||||||
|
|
||||||
|
|
||||||
class FeedforwardModule(nn.Module):
|
class FeedforwardModule(nn.Module):
|
||||||
"""Feedforward module in Conformer model.
|
"""Feedforward module in Zipformer model.
|
||||||
"""
|
"""
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
d_model: int,
|
d_model: int,
|
||||||
@ -1295,7 +1297,7 @@ class FeedforwardModule(nn.Module):
|
|||||||
|
|
||||||
|
|
||||||
class ConvolutionModule(nn.Module):
|
class ConvolutionModule(nn.Module):
|
||||||
"""ConvolutionModule in Conformer model.
|
"""ConvolutionModule in Zipformer model.
|
||||||
Modified from https://github.com/espnet/espnet/blob/master/espnet/nets/pytorch_backend/conformer/convolution.py
|
Modified from https://github.com/espnet/espnet/blob/master/espnet/nets/pytorch_backend/conformer/convolution.py
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
@ -1639,7 +1641,7 @@ def _test_conformer_main():
|
|||||||
feature_dim = 50
|
feature_dim = 50
|
||||||
# Just make sure the forward pass runs.
|
# Just make sure the forward pass runs.
|
||||||
|
|
||||||
c = Conformer(
|
c = Zipformer(
|
||||||
num_features=feature_dim, d_model=(64,96), encoder_unmasked_dim=64, nhead=(4,4)
|
num_features=feature_dim, d_model=(64,96), encoder_unmasked_dim=64, nhead=(4,4)
|
||||||
)
|
)
|
||||||
batch_size = 5
|
batch_size = 5
|
||||||
Reference in New Issue
Block a user