From 61486a0f76d79e941257f87efc7b10188fb48b44 Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Wed, 6 Apr 2022 13:17:26 +0800
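Subject: [PATCH] Remove initial_speed

In pruned_transducer_stateless2, stop passing initial_speed (previously
hard-coded to 0.5) to the three ScaledConv2d layers in Conv2dSubsampling
and to the ScaledEmbedding in Decoder, and delete the local variable and
its explanatory comment in both places.  Also drop the comment in Joiner
that referred to the 'initial_speed' arg, since it no longer applies.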
---
 .../ASR/pruned_transducer_stateless2/conformer.py         | 8 --------
 .../ASR/pruned_transducer_stateless2/decoder.py           | 6 ------
 .../ASR/pruned_transducer_stateless2/joiner.py            | 3 ---
 3 files changed, 17 deletions(-)

diff --git a/egs/librispeech/ASR/pruned_transducer_stateless2/conformer.py b/egs/librispeech/ASR/pruned_transducer_stateless2/conformer.py
index 4797cce08..94c6aa90c 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless2/conformer.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless2/conformer.py
@@ -956,30 +956,22 @@ class Conv2dSubsampling(nn.Module):
         assert in_channels >= 7
         super().__init__()
 
-        # This initial_speed is to slightly slow down the relative speed of
-        # training during the warmup phase by increasing the magnitude of the
-        # initial parameter values.  The intention is to allow us to
-        # use a higher lr_factor.
-        initial_speed = 0.5
         self.conv = nn.Sequential(
             ScaledConv2d(
                 in_channels=1, out_channels=layer1_channels,
                 kernel_size=3, padding=1,
-                initial_speed=initial_speed,
             ),
             ActivationBalancer(channel_dim=1),
             DoubleSwish(),
             ScaledConv2d(
                 in_channels=layer1_channels, out_channels=layer2_channels,
                 kernel_size=3, stride=2,
-                initial_speed=initial_speed,
             ),
             ActivationBalancer(channel_dim=1),
             DoubleSwish(),
             ScaledConv2d(
                 in_channels=layer2_channels, out_channels=layer3_channels,
                 kernel_size=3, stride=2,
-                initial_speed=initial_speed,
             ),
             ActivationBalancer(channel_dim=1),
             DoubleSwish(),
diff --git a/egs/librispeech/ASR/pruned_transducer_stateless2/decoder.py b/egs/librispeech/ASR/pruned_transducer_stateless2/decoder.py
index 3291ad877..c23568ae9 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless2/decoder.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless2/decoder.py
@@ -56,16 +56,10 @@ class Decoder(nn.Module):
         """
         super().__init__()
 
-        # This initial_speed is to slightly slow down the relative speed of
-        # training during the warmup phase by increasing the magnitude of the
-        # initial parameter values.  The intention is to allow us to
-        # use a higher lr_factor.
-        initial_speed = 0.5
         self.embedding = ScaledEmbedding(
             num_embeddings=vocab_size,
             embedding_dim=decoder_dim,
             padding_idx=blank_id,
-            initial_speed=initial_speed
         )
         self.blank_id = blank_id
 
diff --git a/egs/librispeech/ASR/pruned_transducer_stateless2/joiner.py b/egs/librispeech/ASR/pruned_transducer_stateless2/joiner.py
index 752a5f774..2299a0a8c 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless2/joiner.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless2/joiner.py
@@ -27,9 +27,6 @@ class Joiner(nn.Module):
                  vocab_size: int):
         super().__init__()
 
-        # We don't bother giving the 'initial_speed' arg to the decoder
-        # submodules, because it does not affect the initial convergence of the
-        # system (only the simple joiner is involved in that).
         self.encoder_proj = ScaledLinear(encoder_dim, joiner_dim)
         self.decoder_proj = ScaledLinear(decoder_dim, joiner_dim)
         self.output_linear = ScaledLinear(joiner_dim, vocab_size)