From 61486a0f76d79e941257f87efc7b10188fb48b44 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Wed, 6 Apr 2022 13:17:26 +0800 Subject: [PATCH] Remove initial_speed --- .../ASR/pruned_transducer_stateless2/conformer.py | 8 -------- .../ASR/pruned_transducer_stateless2/decoder.py | 6 ------ .../ASR/pruned_transducer_stateless2/joiner.py | 3 --- 3 files changed, 17 deletions(-) diff --git a/egs/librispeech/ASR/pruned_transducer_stateless2/conformer.py b/egs/librispeech/ASR/pruned_transducer_stateless2/conformer.py index 4797cce08..94c6aa90c 100644 --- a/egs/librispeech/ASR/pruned_transducer_stateless2/conformer.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless2/conformer.py @@ -956,30 +956,22 @@ class Conv2dSubsampling(nn.Module): assert in_channels >= 7 super().__init__() - # This initial_speed is to slightly slow down the relative speed of - # training during the warmup phase by increasing the magnitude of the - # initial parameter values. The intention is to allow us to - # use a higher lr_factor. - initial_speed = 0.5 self.conv = nn.Sequential( ScaledConv2d( in_channels=1, out_channels=layer1_channels, kernel_size=3, padding=1, - initial_speed=initial_speed, ), ActivationBalancer(channel_dim=1), DoubleSwish(), ScaledConv2d( in_channels=layer1_channels, out_channels=layer2_channels, kernel_size=3, stride=2, - initial_speed=initial_speed, ), ActivationBalancer(channel_dim=1), DoubleSwish(), ScaledConv2d( in_channels=layer2_channels, out_channels=layer3_channels, kernel_size=3, stride=2, - initial_speed=initial_speed, ), ActivationBalancer(channel_dim=1), DoubleSwish(), diff --git a/egs/librispeech/ASR/pruned_transducer_stateless2/decoder.py b/egs/librispeech/ASR/pruned_transducer_stateless2/decoder.py index 3291ad877..c23568ae9 100644 --- a/egs/librispeech/ASR/pruned_transducer_stateless2/decoder.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless2/decoder.py @@ -56,16 +56,10 @@ class Decoder(nn.Module): """ super().__init__() - # This initial_speed is to slightly slow down the relative speed of - # training during the warmup phase by increasing the magnitude of the - # initial parameter values. The intention is to allow us to - # use a higher lr_factor. - initial_speed = 0.5 self.embedding = ScaledEmbedding( num_embeddings=vocab_size, embedding_dim=decoder_dim, padding_idx=blank_id, - initial_speed=initial_speed ) self.blank_id = blank_id diff --git a/egs/librispeech/ASR/pruned_transducer_stateless2/joiner.py b/egs/librispeech/ASR/pruned_transducer_stateless2/joiner.py index 752a5f774..2299a0a8c 100644 --- a/egs/librispeech/ASR/pruned_transducer_stateless2/joiner.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless2/joiner.py @@ -27,9 +27,6 @@ class Joiner(nn.Module): vocab_size: int): super().__init__() - # We don't bother giving the 'initial_speed' arg to the decoder - # submodules, because it does not affect the initial convergence of the - # system (only the simple joiner is involved in that). self.encoder_proj = ScaledLinear(encoder_dim, joiner_dim) self.decoder_proj = ScaledLinear(decoder_dim, joiner_dim) self.output_linear = ScaledLinear(joiner_dim, vocab_size)