diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7/conformer.py b/egs/librispeech/ASR/pruned_transducer_stateless7/conformer.py
index ad0292ec0..1db816aa7 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless7/conformer.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7/conformer.py
@@ -382,10 +382,13 @@ class ConformerEncoder(nn.Module):
     ) -> None:
         super().__init__()
 
-        # keep track of how many times forward() has been called, for purposes of
-        # warmup. do this with a floating-point count because integer counts can
-        # fail to survive model averaging.
-        self.register_buffer('warmup_count', torch.tensor(0.0))
+        # keep track of how many times forward() has been called, for purposes
+        # of warmup. do this with a floating-point count because integer counts
+        # can fail to survive model averaging. initialize with a smallish
+        # random number so that different encoders use different random seeds for
+        # shared_rng in get_layers_to_drop() while using the same random seeds
+        # across jobs.
+        self.register_buffer('warmup_count', torch.tensor(float(10.0 * random.random())))
 
         self.warmup_begin = warmup_begin
         self.warmup_end = warmup_end
@@ -455,13 +458,31 @@ class ConformerEncoder(nn.Module):
         if not self.training:
             return ans
 
-        rng = random.Random(rnd_seed)
+        shared_rng = random.Random(int(warmup_count * 1000))
+        independent_rng = random.Random(rnd_seed)
 
-        for layer in range(num_layers):
-            if rng.random() < get_layerdrop_prob(layer):
+        layerdrop_probs = [ get_layerdrop_prob(i) for i in range(num_layers) ]
+        tot = sum(layerdrop_probs)
+        # Instead of drawing the samples independently, we first randomly decide
+        # how many layers to drop out, using the same random number generator between
+        # jobs so that all jobs drop out the same number (this is for speed).
+        # Then we use an approximate approach to drop out the individual layers
+        # with their specified probs while reaching this exact target.
+        num_to_drop = int(tot) + int(shared_rng.random() < (tot - int(tot)))
+
+
+        layers = list(range(num_layers))
+        independent_rng.shuffle(layers)
+        # go through the shuffled layers twice, in case the first time round
+        # we did not drop out the target number of layers.
+        layers = layers + layers
+        for layer in layers:
+            if independent_rng.random() < get_layerdrop_prob(layer):
                 ans.add(layer)
-        if random.random() < 0.005 or __name__ == "__main__":
-            logging.info(f"warmup_begin={warmup_begin}, warmup_end={warmup_end}, warmup_count={warmup_count}, layers_to_drop={ans}")
+                if len(ans) == num_to_drop:
+                    break
+        if shared_rng.random() < 0.005 or __name__ == "__main__":
+            logging.info(f"warmup_begin={warmup_begin}, warmup_end={warmup_end}, warmup_count={warmup_count:.1f}, num_to_drop={num_to_drop}, layers_to_drop={ans}")
 
         return ans
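
For reference, below is a minimal standalone sketch of the selection scheme this patch introduces: a shared RNG (seeded from warmup_count, so it agrees across parallel jobs) decides how many layers to drop, while a per-job independent RNG decides which layers those are. The function name layers_to_drop_sketch, its arguments, and the example probabilities are illustrative only and do not match the exact signatures in conformer.py.

import random
from typing import List, Set

def layers_to_drop_sketch(layerdrop_probs: List[float],
                          shared_seed: int,
                          independent_seed: int) -> Set[int]:
    """Illustrative sketch of the patch's two-RNG layer-dropout selection."""
    shared_rng = random.Random(shared_seed)            # same value on every job
    independent_rng = random.Random(independent_seed)  # differs per job

    # Decide how many layers to drop from the sum of the per-layer probs,
    # rounding up or down at random so the expected count is preserved.
    tot = sum(layerdrop_probs)
    num_to_drop = int(tot) + int(shared_rng.random() < (tot - int(tot)))

    # Visit the layers in a random order (twice, in case the first pass does
    # not reach the target), accepting each with its own probability until
    # enough layers have been chosen.
    layers = list(range(len(layerdrop_probs)))
    independent_rng.shuffle(layers)
    ans: Set[int] = set()
    for layer in layers + layers:
        if independent_rng.random() < layerdrop_probs[layer]:
            ans.add(layer)
            if len(ans) == num_to_drop:
                break
    return ans

if __name__ == "__main__":
    # e.g. 24 layers, each with layerdrop prob 0.075
    print(layers_to_drop_sketch([0.075] * 24, shared_seed=123, independent_seed=456))

As in the patch, a layer already chosen can be re-drawn on the second pass; since ans is a set this only wastes a draw and can never exceed the target count.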