From 22617da7256b87dd78f00c12930f7bd76b2c03fe Mon Sep 17 00:00:00 2001
From: Daniel Povey
Date: Mon, 5 Dec 2022 23:39:24 +0800
Subject: [PATCH 1/2] Make dropout a schedule starting at 0.3.

---
 .../pruned_transducer_stateless7/scaling.py   | 10 ++++++++-
 .../pruned_transducer_stateless7/zipformer.py | 21 ++++++++++++-------
 2 files changed, 22 insertions(+), 9 deletions(-)

diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7/scaling.py b/egs/librispeech/ASR/pruned_transducer_stateless7/scaling.py
index 1fc46259b..c3a652e8a 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless7/scaling.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7/scaling.py
@@ -1212,7 +1212,15 @@ class TanSwish(torch.nn.Module):
         return TanSwishFunction.apply(x)
 
 
-
+# Dropout2 is just like normal dropout, except it supports schedules on the dropout rates.
+class Dropout2(nn.Module):
+    def __init__(self, p: FloatLike):
+        super().__init__()
+        self.p = p
+    def forward(self, x: Tensor) -> Tensor:
+        return torch.nn.functional.dropout(x,
+                                           p=float(self.p),
+                                           training=self.training)
 
 class SwooshLFunction(torch.autograd.Function):
     """
diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7/zipformer.py b/egs/librispeech/ASR/pruned_transducer_stateless7/zipformer.py
index 038da0136..f01b9e8fc 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless7/zipformer.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7/zipformer.py
@@ -27,6 +27,7 @@ from encoder_interface import EncoderInterface
 from scaling import (
     ActivationBalancer,
     BasicNorm,
+    Dropout2,
     MaxEig,
     DoubleSwish,
     SwooshL,
@@ -107,11 +108,15 @@ class Zipformer(EncoderInterface):
         feedforward_dim: Union[int, Tuple[int]] = 1536,
         cnn_module_kernel: Union[int, Tuple[int]] = 31,
         pos_dim: int = 192,
-        dropout: float = 0.1,
+        dropout: FloatLike = None,  # see code below for default
         warmup_batches: float = 4000.0,
     ) -> None:
         super(Zipformer, self).__init__()
 
+        if dropout is None:
+            dropout = ScheduledFloat((0.0, 0.3),
+                                     (20000.0, 0.1))
+
         # this is not the probability of skipping a layer. It is the probability of
         # dropping out the "skip module" which allows the model to skip groups of
         # encoder stacks; when it's dropped out like this, it means we are forced
@@ -383,7 +388,7 @@ class ZipformerEncoderLayer(nn.Module):
         pos_head_dim: int,
         value_head_dim: int,
         feedforward_dim: int,
-        dropout: float = 0.1,
+        dropout: FloatLike = 0.1,
         cnn_module_kernel: int = 31,
         # layer_skip_rate will be overwritten to change warmup begin and end times.
         # treating batch_index == 0.0 specially is just to get scan_pessimistic_batches_for_oom()
@@ -948,7 +953,7 @@ class CompactRelPositionalEncoding(torch.nn.Module):
     """
     def __init__(
         self, embed_dim: int,
-        dropout_rate: float,
+        dropout_rate: FloatLike,
         max_len: int = 1000,
         length_factor: float = 1.0,
     ) -> None:
@@ -956,7 +961,7 @@ class CompactRelPositionalEncoding(torch.nn.Module):
         super(CompactRelPositionalEncoding, self).__init__()
         self.embed_dim = embed_dim
         assert embed_dim % 2 == 0
-        self.dropout = torch.nn.Dropout(dropout_rate)
+        self.dropout = Dropout2(dropout_rate)
         self.pe = None
         assert length_factor >= 1.0
         self.length_factor = length_factor
@@ -1415,7 +1420,7 @@ class FeedforwardModule(nn.Module):
     def __init__(self,
                  embed_dim: int,
                  feedforward_dim: int,
-                 dropout: float):
+                 dropout: FloatLike):
         super(FeedforwardModule, self).__init__()
         self.in_proj = LinearWithAuxLoss(embed_dim, feedforward_dim,
                                          aux_grad_scale=_aux_grad_scale(), prob=_aux_grad_prob_in())
@@ -1428,7 +1433,7 @@ class FeedforwardModule(nn.Module):
                                            max_abs=5.0,
                                            min_prob=0.25)
         self.activation = SwooshL()
-        self.dropout = nn.Dropout(dropout)
+        self.dropout = Dropout2(dropout)
         self.out_proj = LinearWithAuxLoss(feedforward_dim, embed_dim,
                                           initial_scale=0.01,
                                           aux_grad_scale=_aux_grad_scale(), prob=_aux_grad_prob_out())
@@ -1684,7 +1689,7 @@ class Conv2dSubsampling(nn.Module):
         layer2_channels: int = 32,
         layer3_channels: int = 128,
         bottleneck_channels: int = 64,
-        dropout: float = 0.1,
+        dropout: FloatLike = 0.1,
     ) -> None:
         """
         Args:
@@ -1742,7 +1747,7 @@ class Conv2dSubsampling(nn.Module):
 
         self.out = LinearWithAuxLoss(out_height * layer3_channels, out_channels,
                                      aux_grad_scale=_aux_grad_scale(), prob=_aux_grad_prob_out())
-        self.dropout = nn.Dropout(dropout)
+        self.dropout = Dropout2(dropout)
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:

From 63e881f89b7874299cebb798060f385c77140c95 Mon Sep 17 00:00:00 2001
From: Daniel Povey
Date: Mon, 5 Dec 2022 23:49:16 +0800
Subject: [PATCH 2/2] Pass in dropout from train.py

---
 egs/librispeech/ASR/pruned_transducer_stateless7/train.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7/train.py b/egs/librispeech/ASR/pruned_transducer_stateless7/train.py
index b801beccf..3a75a3d5a 100755
--- a/egs/librispeech/ASR/pruned_transducer_stateless7/train.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7/train.py
@@ -60,6 +60,7 @@ import torch.multiprocessing as mp
 import torch.nn as nn
 from asr_datamodule import LibriSpeechAsrDataModule
 from zipformer import Zipformer
+from scaling import ScheduledFloat
 from decoder import Decoder
 from joiner import Joiner
 from lhotse.cut import Cut
@@ -498,7 +499,7 @@ def get_encoder_model(params: AttributeDict) -> nn.Module:
         attention_share_layers=to_int_tuple(params.attention_share_layers),
         feedforward_dim=to_int_tuple(params.feedforward_dim),
         cnn_module_kernel=to_int_tuple(params.cnn_module_kernel),
-        dropout=0.1,
+        dropout=ScheduledFloat((0.0, 0.3), (20000.0, 0.1)),
         warmup_batches=4000.0,
     )
     return encoder
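
Note on the intended behaviour (not part of the patches): ScheduledFloat itself lives in scaling.py and is not shown here, so the sketch below is a simplified, self-contained stand-in. It assumes a piecewise-linear schedule keyed on a batch counter that the training loop updates; the names PiecewiseLinearFloat, ScheduledDropout, and batch_count are illustrative only, not icefall's actual API. The point it demonstrates is the one the patch relies on: Dropout2 evaluates float(self.p) on every forward pass, so the dropout probability follows the schedule during training, starting at 0.3 and decaying to 0.1 by roughly batch 20000.

# Minimal sketch of a scheduled dropout rate (illustrative stand-in, not the
# icefall implementation).
import torch
import torch.nn as nn


class PiecewiseLinearFloat:
    """Float-like object whose value is linearly interpolated between
    (batch_count, value) breakpoints, e.g. (0.0, 0.3) -> (20000.0, 0.1)."""

    def __init__(self, *points):
        self.points = sorted(points)  # sorted by batch count
        self.batch_count = 0.0        # assumed to be updated by the training loop

    def __float__(self):
        x = self.batch_count
        (x0, y0), (x1, y1) = self.points[0], self.points[-1]
        if x <= x0:
            return y0
        if x >= x1:
            return y1
        # linear interpolation on the segment containing x
        for (a, ya), (b, yb) in zip(self.points, self.points[1:]):
            if a <= x <= b:
                return ya + (yb - ya) * (x - a) / (b - a)
        return y1


class ScheduledDropout(nn.Module):
    """Analogue of Dropout2: re-reads the schedule at call time."""

    def __init__(self, p):
        super().__init__()
        self.p = p

    def forward(self, x):
        return nn.functional.dropout(x, p=float(self.p), training=self.training)


# Usage: dropout starts at 0.3 and decays linearly to 0.1 by batch 20000.
schedule = PiecewiseLinearFloat((0.0, 0.3), (20000.0, 0.1))
dropout = ScheduledDropout(schedule)
for batch_idx in (0, 10000, 20000):
    schedule.batch_count = float(batch_idx)
    print(batch_idx, float(schedule))  # prints 0.3, 0.2, 0.1
    _ = dropout(torch.randn(4, 8))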