diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7/conformer.py b/egs/librispeech/ASR/pruned_transducer_stateless7/conformer.py
index 66c03d17c..e3008ea69 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless7/conformer.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7/conformer.py
@@ -46,7 +46,6 @@ class Conformer(EncoderInterface):
         dim_feedforward (int): feedforward dimention
         num_encoder_layers (int): number of encoder layers
         dropout (float): dropout rate
-        layer_dropout (float): layer-dropout rate.
         cnn_module_kernel (int): Kernel size of convolution module
         vgg_frontend (bool): whether to use vgg frontend.
         warmup_batches (float): number of batches to warm up over
@@ -403,9 +402,13 @@ class ConformerEncoder(nn.Module):
         super().__init__()

         # keep track of how many times forward() has been called, for purposes of
-        # warmup
+        # warmup.  do this with a floating-point count because integer counts can
+        # fail to survive model averaging.
         self.register_buffer('warmup_count', torch.tensor(0.0))

+        # if this assert fails, increase the numbers in get_warmup_count().
+        assert warmup_end <= 1000000.0
+
         self.encoder_pos = RelPositionalEncoding(encoder_layer.d_model, dropout)
@@ -432,10 +435,38 @@ class ConformerEncoder(nn.Module):
         """
         ans = self.warmup_count.item()
         if self.training:
-            self.warmup_count += 1
+            if ans > 1000000.0:
+                # this ensures that as the number of batches gets large, the warmup count cycles
+                # rather than getting stuck at the smallest floating-point value x such that
+                # x + 1 == x.  this is necessary because get_layers_to_drop() relies on the
+                # warmup count changing on every batch.
+                next_count = 500000.0
+            else:
+                next_count = ans + 1.0
+            self.warmup_count.fill_(next_count)
         return ans

+    def get_layers_to_drop(self, warmup_count: float):
+        ans = set()
+        if not self.training:
+            return ans
+        # We use a random number generator seeded from warmup_count because,
+        # if there are multiple training processes, we want them all to drop the
+        # same number of layers (not necessarily the same layers, though).  This
+        # will tend to minimize training time.
+        rng = random.Random(int(warmup_count))
+        num_layers = len(self.layers)
+
+        # x is the expected number of layers to drop.
+        x = 0.075 * num_layers
+        # integerize x in a way that preserves expectations.
+        num_layers_to_drop = int(x) + int(rng.random() < (x - int(x)))
+        while len(ans) < num_layers_to_drop:
+            # use random, not rng, here because we don't want every process to
+            # drop the same specific layers.
+            ans.add(random.randrange(0, num_layers))
+        return ans
+
     def forward(
         self,
         src: Tensor,
@@ -468,9 +499,13 @@ class ConformerEncoder(nn.Module):
         outputs = []
         attn_scores = None

+        layers_to_drop = self.get_layers_to_drop(warmup_count)
+
         output = output * feature_mask

         for i, mod in enumerate(self.layers):
+            if i in layers_to_drop:
+                continue
             next_output, attn_scores = mod(
                 output,
                 pos_emb,
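
A minimal standalone sketch (not part of the patch, and using a hypothetical
helper name, stochastic_round) of the "integerize x in a way that preserves
expectations" trick used in get_layers_to_drop(): x is rounded down or up at
random so that the expected value of the result equals x.

    import random

    def stochastic_round(x: float, rng: random.Random) -> int:
        # Round x down or up at random so that E[result] == x.
        return int(x) + int(rng.random() < (x - int(x)))

    rng = random.Random(0)
    num_layers = 12
    x = 0.075 * num_layers  # expected number of layers to drop: 0.9
    samples = [stochastic_round(x, rng) for _ in range(100000)]
    print(sum(samples) / len(samples))  # roughly 0.9, i.e. close to x

With x = 0.9 the rounding drops one layer with probability 0.9 and zero layers
with probability 0.1, so over many batches about 7.5% of layers are skipped,
matching the 0.075 * num_layers target in the patch.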