From 3f05e474479cf445a569743f49df68b9f480531e Mon Sep 17 00:00:00 2001
From: Daniel Povey
Date: Thu, 27 Oct 2022 22:41:48 +0800
Subject: [PATCH] Rename conformer.py to zipformer.py

---
 .../ASR/pruned_transducer_stateless7/train.py |  2 +-
 .../{conformer.py => zipformer.py}            | 50 ++++++++++---------
 2 files changed, 27 insertions(+), 25 deletions(-)
 rename egs/librispeech/ASR/pruned_transducer_stateless7/{conformer.py => zipformer.py} (98%)

diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7/train.py b/egs/librispeech/ASR/pruned_transducer_stateless7/train.py
index 66c25831f..232d64e35 100755
--- a/egs/librispeech/ASR/pruned_transducer_stateless7/train.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7/train.py
@@ -59,7 +59,7 @@ import torch
 import torch.multiprocessing as mp
 import torch.nn as nn
 from asr_datamodule import LibriSpeechAsrDataModule
-from conformer import Conformer
+from zipformer import Zipformer
 from decoder import Decoder
 from joiner import Joiner
 from lhotse.cut import Cut
diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7/conformer.py b/egs/librispeech/ASR/pruned_transducer_stateless7/zipformer.py
similarity index 98%
rename from egs/librispeech/ASR/pruned_transducer_stateless7/conformer.py
rename to egs/librispeech/ASR/pruned_transducer_stateless7/zipformer.py
index e2cf0a051..f94501245 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless7/conformer.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7/zipformer.py
@@ -43,7 +43,7 @@ from torch import Tensor, nn
 from icefall.utils import make_pad_mask
 
 
-class Conformer(EncoderInterface):
+class Zipformer(EncoderInterface):
     """
     Args:
       num_features (int): Number of input features
@@ -74,7 +74,7 @@ class Conformer(EncoderInterface):
         cnn_module_kernel: Tuple[int] = (31, 31),
         warmup_batches: float = 4000.0,
     ) -> None:
-        super(Conformer, self).__init__()
+        super(Zipformer, self).__init__()
 
         self.num_features = num_features
         self.subsampling_factor = subsampling_factor
@@ -96,7 +96,7 @@ class Conformer(EncoderInterface):
 
         self.encoder_embed = Conv2dSubsampling(num_features, d_model[0], dropout=dropout)
 
-        encoder_layer1 = ConformerEncoderLayer(
+        encoder_layer1 = ZipformerEncoderLayer(
             d_model[0],
             attention_dim[0],
             nhead[0],
@@ -108,14 +108,14 @@ class Conformer(EncoderInterface):
         # for the first third of the warmup period, we let the Conv2dSubsampling
         # layer learn something.  Then we start warming up the first and then the
         # second encoder.
-        self.encoder1 = ConformerEncoder(
+        self.encoder1 = ZipformerEncoder(
             encoder_layer1,
             num_encoder_layers[0],
             dropout,
             warmup_begin=warmup_batches / 3,
             warmup_end=warmup_batches * 2 / 3,
         )
-        encoder_layer2 = ConformerEncoderLayer(
+        encoder_layer2 = ZipformerEncoderLayer(
             d_model[1],
             attention_dim[1],
             nhead[1],
@@ -124,8 +124,8 @@ class Conformer(EncoderInterface):
             cnn_module_kernel[1],
         )
 
-        self.encoder2 = DownsampledConformerEncoder(
-            ConformerEncoder(
+        self.encoder2 = DownsampledZipformerEncoder(
+            ZipformerEncoder(
                 encoder_layer2,
                 num_encoder_layers[1],
                 dropout,
@@ -237,10 +237,10 @@ class Conformer(EncoderInterface):
         return x, lengths
 
 
-class ConformerEncoderLayer(nn.Module):
+class ZipformerEncoderLayer(nn.Module):
     """
-    ConformerEncoderLayer is made up of self-attn, feedforward and convolution networks.
-    See: "Conformer: Convolution-augmented Transformer for Speech Recognition"
+    ZipformerEncoderLayer is made up of self-attn, feedforward and convolution networks.
+    See: "Conformer: Convolution-augmented Transformer for Speech Recognition", from which this is derived.
 
     Args:
       d_model: the number of expected features in the input (required).
@@ -250,7 +250,7 @@ class ConformerEncoderLayer(nn.Module):
       cnn_module_kernel (int): Kernel size of convolution module.
 
     Examples::
-        >>> encoder_layer = ConformerEncoderLayer(d_model=512, nhead=8)
+        >>> encoder_layer = ZipformerEncoderLayer(d_model=512, nhead=8)
         >>> src = torch.rand(10, 32, 512)
         >>> pos_emb = torch.rand(32, 19, 512)
        >>> out = encoder_layer(src, pos_emb)
@@ -264,7 +264,7 @@ class ConformerEncoderLayer(nn.Module):
         dropout: float = 0.1,
         cnn_module_kernel: int = 31,
     ) -> None:
-        super(ConformerEncoderLayer, self).__init__()
+        super(ZipformerEncoderLayer, self).__init__()
 
         self.d_model = d_model
 
@@ -371,16 +371,16 @@ class ConformerEncoderLayer(nn.Module):
         return self.whiten(src)
 
 
-class ConformerEncoder(nn.Module):
-    r"""ConformerEncoder is a stack of N encoder layers
+class ZipformerEncoder(nn.Module):
+    r"""ZipformerEncoder is a stack of N encoder layers
 
     Args:
-        encoder_layer: an instance of the ConformerEncoderLayer() class (required).
+        encoder_layer: an instance of the ZipformerEncoderLayer() class (required).
         num_layers: the number of sub-encoder-layers in the encoder (required).
 
     Examples::
-        >>> encoder_layer = ConformerEncoderLayer(d_model=512, nhead=8)
-        >>> conformer_encoder = ConformerEncoder(encoder_layer, num_layers=6)
+        >>> encoder_layer = ZipformerEncoderLayer(d_model=512, nhead=8)
+        >>> zipformer_encoder = ZipformerEncoder(encoder_layer, num_layers=6)
         >>> src = torch.rand(10, 32, 512)
-        >>> out = conformer_encoder(src)
+        >>> out = zipformer_encoder(src)
     """
@@ -553,9 +553,9 @@ class ConformerEncoder(nn.Module):
         return output
 
 
-class DownsampledConformerEncoder(nn.Module):
+class DownsampledZipformerEncoder(nn.Module):
     r"""
-    DownsampledConformerEncoder is a conformer encoder evaluated at a reduced frame rate,
+    DownsampledZipformerEncoder is a zipformer encoder evaluated at a reduced frame rate,
     after convolutional downsampling, and then upsampled again at the output so that
     the output has the same shape as the input.
     """
@@ -564,7 +564,7 @@ def __init__(self,
                  input_dim: int,
                  output_dim: int,
                  downsample: int):
-        super(DownsampledConformerEncoder, self).__init__()
+        super(DownsampledZipformerEncoder, self).__init__()
         self.downsample_factor = downsample
         self.downsample = AttentionDownsample(input_dim, output_dim, downsample)
         self.encoder = encoder
@@ -833,7 +833,9 @@ class RelPositionalEncoding(torch.nn.Module):
 
 
 class RelPositionMultiheadAttention(nn.Module):
     r"""Multi-Head Attention layer with relative position encoding
 
-    See reference: "Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context"
+    This is quite heavily modified from: "Transformer-XL: Attentive Language Models
+    Beyond a Fixed-Length Context"; we still need to write up the differences.
+
 
     Args:
         embed_dim: total dimension of the model.
@@ -1268,7 +1270,7 @@ class RelPositionMultiheadAttention(nn.Module):
 
 
 class FeedforwardModule(nn.Module):
-    """Feedforward module in Conformer model.
+    """Feedforward module in Zipformer model.
     """
     def __init__(self,
                  d_model: int,
@@ -1295,7 +1297,7 @@ class FeedforwardModule(nn.Module):
 
 
 class ConvolutionModule(nn.Module):
-    """ConvolutionModule in Conformer model.
+    """ConvolutionModule in Zipformer model.
Modified from https://github.com/espnet/espnet/blob/master/espnet/nets/pytorch_backend/conformer/convolution.py Args: @@ -1639,7 +1641,7 @@ def _test_conformer_main(): feature_dim = 50 # Just make sure the forward pass runs. - c = Conformer( + c = Zipformer( num_features=feature_dim, d_model=(64,96), encoder_unmasked_dim=64, nhead=(4,4) ) batch_size = 5
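
Usage note (outside the patch itself): after this change, callers import the encoder from zipformer.py, as the train.py hunk shows. The sketch below mirrors the _test_conformer_main() hunk above as a minimal smoke test. It assumes it is run from egs/librispeech/ASR/pruned_transducer_stateless7/ with icefall and its dependencies installed, and it assumes a forward signature of (x, x_lens) returning (output, output_lengths), as suggested by the "return x, lengths" context visible in the diff; the sequence length is illustrative.

    import torch
    from zipformer import Zipformer  # the renamed module

    feature_dim = 50
    model = Zipformer(
        num_features=feature_dim,
        d_model=(64, 96),            # per-encoder dims, as in the test hunk
        encoder_unmasked_dim=64,
        nhead=(4, 4),
    )
    batch_size, seq_len = 5, 100     # seq_len chosen to survive subsampling
    x = torch.randn(batch_size, seq_len, feature_dim)       # (N, T, C) features
    x_lens = torch.full((batch_size,), seq_len, dtype=torch.int64)
    y, y_lens = model(x, x_lens)     # time axis is subsampled at the output
    print(y.shape, y_lens)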
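A related note on the warmup comment in the @@ -108,14 hunk: the constructor default warmup_batches = 4000.0 is split into thirds, with the Conv2dSubsampling frontend training alone first and the encoder stacks warming up afterwards. A sketch of the resulting windows, using the values visible in the diff; encoder2's warmup arguments are not shown in this patch, so its window below is an assumption by symmetry:

    warmup_batches = 4000.0  # default from the constructor hunk

    # first third: only the Conv2dSubsampling layer learns
    conv_phase = (0.0, warmup_batches / 3)                      # (0, ~1333)

    # second third: encoder1 warms up (values shown in the diff)
    enc1_warmup = (warmup_batches / 3, warmup_batches * 2 / 3)  # (~1333, ~2667)

    # final third: encoder2 warms up (assumed; not shown in this diff)
    enc2_warmup = (warmup_batches * 2 / 3, warmup_batches)      # (~2667, 4000)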