diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7/train.py b/egs/librispeech/ASR/pruned_transducer_stateless7/train.py index 27d21c299..0e13dd20f 100755 --- a/egs/librispeech/ASR/pruned_transducer_stateless7/train.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless7/train.py @@ -116,7 +116,7 @@ def add_model_arguments(parser: argparse.ArgumentParser): parser.add_argument( "--num-encoder-layers", type=str, - default="4,4,4,4,4,4", + default="2,4,4,4,4,4", help="Number of zipformer encoder layers per stack, comma separated.", ) diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7/zipformer.py b/egs/librispeech/ASR/pruned_transducer_stateless7/zipformer.py index b786f5068..42420d1dc 100644 --- a/egs/librispeech/ASR/pruned_transducer_stateless7/zipformer.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless7/zipformer.py @@ -1804,6 +1804,7 @@ class Conv2dSubsampling(nn.Module): ) self.convnext2 = nn.Sequential(ConvNeXt(layer3_channels), + ConvNeXt(layer3_channels), ConvNeXt(layer3_channels), BasicNorm(layer3_channels, channel_dim=1))