From bac72718f0d980eae4661cab8c86c05170d040aa Mon Sep 17 00:00:00 2001
From: Daniel Povey
Date: Thu, 12 Jan 2023 22:11:42 +0800
Subject: [PATCH] Bug fixes, config changes

---
 .../ASR/pruned_transducer_stateless7/train.py     |  8 ++++----
 .../ASR/pruned_transducer_stateless7/zipformer.py | 13 +++++++++++--
 2 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7/train.py b/egs/librispeech/ASR/pruned_transducer_stateless7/train.py
index 0f5aad60b..6ca21789c 100755
--- a/egs/librispeech/ASR/pruned_transducer_stateless7/train.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7/train.py
@@ -123,7 +123,7 @@ def add_model_arguments(parser: argparse.ArgumentParser):
     parser.add_argument(
         "--num-encoder-layers",
         type=str,
-        default="2,2,4,6,4,2",
+        default="2,4,4,4,4,4",
         help="Number of zipformer encoder layers per stack, comma separated.",
     )
 
@@ -139,7 +139,7 @@ def add_model_arguments(parser: argparse.ArgumentParser):
     parser.add_argument(
         "--feedforward-dim",
         type=str,
-        default="384,768,1024,1536,1024,768",
+        default="384,512,1024,1536,1024,512",
         help="Feedforward dimension of the zipformer encoder layers, per stack, comma separated.",
     )
 
@@ -160,7 +160,7 @@ def add_model_arguments(parser: argparse.ArgumentParser):
     parser.add_argument(
         "--encoder-dim",
         type=str,
-        default="192,256,320,384,320,256",
+        default="192,192,256,320,256,192",
         help="Embedding dimension in encoder stacks: a single int or comma-separated list."
     )
 
@@ -195,7 +195,7 @@ def add_model_arguments(parser: argparse.ArgumentParser):
     parser.add_argument(
         "--encoder-unmasked-dim",
         type=str,
-        default="164,192,256,256,256,192",
+        default="192,192,224,224,224,192",
         help="Unmasked dimensions in the encoders, relates to augmentation during training. "
         "A single int or comma-separated list. Must be <= each corresponding encoder_dim."
     )
diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7/zipformer.py b/egs/librispeech/ASR/pruned_transducer_stateless7/zipformer.py
index c7382b424..6b61ed500 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless7/zipformer.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7/zipformer.py
@@ -215,6 +215,7 @@ class Zipformer(EncoderInterface):
             encoder.lr_scale = downsampling_factor[i] ** -0.33
             encoders.append(encoder)
+
        self.encoders = nn.ModuleList(encoders)
 
        # initializes self.skip_layers and self.skip_modules
@@ -327,8 +328,12 @@ class Zipformer(EncoderInterface):
           - lengths, a tensor of shape (batch_size,) containing the number of
             frames in `embeddings` before padding.
        """
+        logging.info(f"Memory allocated at entry: {torch.cuda.memory_allocated() // 1000000}M")
+
        x = self.encoder_embed(x)
 
+        logging.info(f"Memory allocated after encoder_embed: {torch.cuda.memory_allocated() // 1000000}M")
+
        x = x.permute(1, 0, 2)  # (N, T, C) -> (T, N, C)
 
        with warnings.catch_warnings():
@@ -358,6 +363,7 @@ class Zipformer(EncoderInterface):
                      feature_mask=feature_masks[i],
                      src_key_padding_mask=None if mask is None else mask[...,::ds])
            outputs.append(x)
+            logging.info(f"Memory allocated after stack {i}: {torch.cuda.memory_allocated() // 1000000}M")
 
        x = self.downsample_output(x)
        # class Downsample has this rounding behavior..
@@ -834,6 +840,7 @@ class SimpleDownsample(torch.nn.Module):
        else:
            self.extra_proj = None
        self.downsample = downsample
+        self.out_channels = out_channels
 
    def forward(self, src: Tensor) -> Tensor:
@@ -867,6 +874,8 @@ class SimpleDownsample(torch.nn.Module):
        if self.extra_proj is not None:
            ans2 = self.extra_proj(src)
            ans = torch.cat((ans, ans2), dim=2)
+
+        ans = ans[..., :self.out_channels]
        return ans
 
@@ -941,7 +950,7 @@ class SimpleCombiner(torch.nn.Module):
                                          dtype=src1.dtype)),
                             dim=-1)
        else:
-            src1 = src1[:src2_dim]
+            src1 = src1[...,:src2_dim]
 
        src1 = src1 * weight1
        src2 = src2 * (1.0 - weight1)
@@ -1917,7 +1926,7 @@ class Conv2dSubsampling(nn.Module):
        out_channels: int,
        layer1_channels: int = 8,
        layer2_channels: int = 32,
-        layer3_channels: int = 96,
+        layer3_channels: int = 64,
        dropout: FloatLike = 0.1,
    ) -> None:
        """
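
Note on the SimpleCombiner fix: `src1[:src2_dim]` indexes the *first* axis of
the tensor, while the combiner needs to truncate the *channel* (last) axis, the
same idiom the new `ans = ans[..., :self.out_channels]` line in SimpleDownsample
uses. Below is a minimal standalone sketch of the difference, assuming the
(seq_len, batch_size, channels) layout that forward() establishes with its
permute to (T, N, C); the shapes are illustrative, not taken from the patch.

    import torch

    # Illustrative shapes only: (seq_len, batch_size, channels), i.e. (T, N, C).
    src1 = torch.randn(300, 8, 384)
    src2_dim = 256  # channel count of the tensor src1 is combined with

    # Old code: slices the *time* axis; channels stay at 384, so src1
    # cannot line up with a 256-channel src2.
    wrong = src1[:src2_dim]
    print(wrong.shape)  # torch.Size([256, 8, 384]) -- time frames truncated

    # Fixed code: the ellipsis skips leading axes and slices the channel axis.
    right = src1[..., :src2_dim]
    print(right.shape)  # torch.Size([300, 8, 256])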