diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7/conformer.py b/egs/librispeech/ASR/pruned_transducer_stateless7/conformer.py
index efed5eb9d..cab0289c0 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless7/conformer.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7/conformer.py
@@ -450,7 +450,13 @@ class ConformerEncoder(nn.Module):
         initial_layerdrop_prob = 0.5
         final_layerdrop_prob = 0.05
 
-        if warmup_count < layer_warmup_begin:
+        if warmup_count < 100.0:
+            # As a special case, if warmup_count < 100.0, return 0 (drop no
+            # layers).  This is rather ugly, I'm afraid; it is intended to
+            # enable our scan_pessimistic_batches_for_oom() code to work
+            # correctly, so that if we are going to get an OOM it will
+            # happen early.
+            return 0.0
+        elif warmup_count < layer_warmup_begin:
             return initial_layerdrop_prob
         elif warmup_count > layer_warmup_end:
             return final_layerdrop_prob
diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7/train.py b/egs/librispeech/ASR/pruned_transducer_stateless7/train.py
index c4b6677dc..b00074051 100755
--- a/egs/librispeech/ASR/pruned_transducer_stateless7/train.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7/train.py
@@ -1172,6 +1172,7 @@ def scan_pessimistic_batches_for_oom(
                 )
             display_and_save_batch(batch, params=params, sp=sp)
             raise
+        logging.info(f"Maximum memory allocated so far is {torch.cuda.max_memory_allocated()//1000000}MB")
 
 
 def main():
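
Note on the first hunk: with layerdrop active, scan_pessimistic_batches_for_oom() would measure memory on a randomly thinned network and could under-estimate the true peak, so forcing the drop probability to 0.0 at the very start of training makes the sanity check exercise the full-depth model. For context, the schedule as a whole can be sketched roughly as below; this is a minimal standalone sketch, not the actual ConformerEncoder method, and the linear interpolation between layer_warmup_begin and layer_warmup_end is an assumption (only the branches visible in the diff are confirmed):

    def get_layerdrop_prob(
        warmup_count: float,
        layer_warmup_begin: float,
        layer_warmup_end: float,
    ) -> float:
        initial_layerdrop_prob = 0.5
        final_layerdrop_prob = 0.05

        if warmup_count < 100.0:
            # Drop no layers at the very start, so the OOM sanity check
            # sees worst-case (full-depth) memory usage.
            return 0.0
        elif warmup_count < layer_warmup_begin:
            return initial_layerdrop_prob
        elif warmup_count > layer_warmup_end:
            return final_layerdrop_prob
        else:
            # Assumed: interpolate linearly over the warmup interval.
            t = (warmup_count - layer_warmup_begin) / (
                layer_warmup_end - layer_warmup_begin
            )
            return initial_layerdrop_prob + t * (
                final_layerdrop_prob - initial_layerdrop_prob
            )

On the second hunk: torch.cuda.max_memory_allocated() reports the peak bytes held by tensors since program start (or the last torch.cuda.reset_peak_memory_stats() call), so the //1000000 division logs decimal megabytes after each pessimistic batch survives the scan.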