diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7/conformer.py b/egs/librispeech/ASR/pruned_transducer_stateless7/conformer.py
index efed5eb9d..cab0289c0 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless7/conformer.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7/conformer.py
@@ -450,7 +450,13 @@ class ConformerEncoder(nn.Module):
         initial_layerdrop_prob = 0.5
         final_layerdrop_prob = 0.05
 
-        if warmup_count < layer_warmup_begin:
+        if warmup_count < 100.0:
+            # As a special case, if warmup_count < 100.0, return 0 (drop no
+            # layers).  This is rather ugly, I'm afraid; it is intended to
+            # enable our scan_pessimistic_batches_for_oom() code to work
+            # correctly, so that if we are going to get an OOM it will
+            # happen early.
+            return 0.0
+        elif warmup_count < layer_warmup_begin:
             return initial_layerdrop_prob
         elif warmup_count > layer_warmup_end:
             return final_layerdrop_prob
diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7/train.py b/egs/librispeech/ASR/pruned_transducer_stateless7/train.py
index c4b6677dc..b00074051 100755
--- a/egs/librispeech/ASR/pruned_transducer_stateless7/train.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7/train.py
@@ -1172,6 +1172,7 @@ def scan_pessimistic_batches_for_oom(
                 )
             display_and_save_batch(batch, params=params, sp=sp)
             raise
+        logging.info(f"Maximum memory allocated so far is {torch.cuda.max_memory_allocated()//1000000}MB")
 
 
 def main():
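
Note on the first hunk: with layerdrop active, scan_pessimistic_batches_for_oom() would measure memory on a randomly thinned network and could under-estimate the true peak, so forcing the drop probability to 0.0 at the very start of training makes the sanity check exercise the full-depth model. For context, the schedule as a whole can be sketched roughly as below; this is a minimal standalone sketch, not the actual ConformerEncoder method, and the linear interpolation between layer_warmup_begin and layer_warmup_end is an assumption (only the branches visible in the diff are confirmed):

    def get_layerdrop_prob(
        warmup_count: float,
        layer_warmup_begin: float,
        layer_warmup_end: float,
    ) -> float:
        initial_layerdrop_prob = 0.5
        final_layerdrop_prob = 0.05

        if warmup_count < 100.0:
            # Drop no layers at the very start, so the OOM sanity check
            # sees worst-case (full-depth) memory usage.
            return 0.0
        elif warmup_count < layer_warmup_begin:
            return initial_layerdrop_prob
        elif warmup_count > layer_warmup_end:
            return final_layerdrop_prob
        else:
            # Assumed: interpolate linearly over the warmup interval.
            t = (warmup_count - layer_warmup_begin) / (
                layer_warmup_end - layer_warmup_begin
            )
            return initial_layerdrop_prob + t * (
                final_layerdrop_prob - initial_layerdrop_prob
            )

On the second hunk: torch.cuda.max_memory_allocated() reports the peak bytes held by tensors since program start (or the last torch.cuda.reset_peak_memory_stats() call), so the //1000000 division logs decimal megabytes after each pessimistic batch survives the scan.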