diff --git a/egs/librispeech/ASR/conformer_ctc_bn_2d/train.py b/egs/librispeech/ASR/conformer_ctc_bn_2d/train.py
index 88fe7ab52..9aca757e7 100755
--- a/egs/librispeech/ASR/conformer_ctc_bn_2d/train.py
+++ b/egs/librispeech/ASR/conformer_ctc_bn_2d/train.py
@@ -173,11 +173,11 @@ def get_params() -> AttributeDict:
             "use_double_scores": True,
             "accum_grad": 1,
             "att_scale": 0.5,
-            "reverse_att_scale": 0.2,
+            "reverse_att_scale": 0.25,
             "ctc_scale": 0.3,
-            "delay_scale": 0.1,  # Scale on difference between current and
+            "delay_scale": 2.5,  # Scale on difference between current and
            # delayed version of positive_embed.
-            "delay_minibatches": 200,
+            "delay_minibatches": 300,
             "attention_dim": 512,
             "nhead": 8,
             "num_trunk_encoder_layers": 12,
@@ -460,7 +460,7 @@ def compute_loss(
         delayed_model = get_delayed_model(model, params)
         with torch.random.fork_rng(devices=[device], enabled=True):
             (old_memory, _, _) = delayed_model(feature, supervisions)
-            (_, _, old_positive_embed, _, _) = delayed_model.sample_forward(old_memory)
+            (_, old_softmax, _, _, _) = delayed_model.sample_forward(old_memory)

     with torch.set_grad_enabled(is_training):

@@ -472,7 +472,7 @@
         (memory, pos_emb, softmax, positive_embed_shifted,
          negative_embed_shifted) = mmodel.sample_forward(memory)
         if params.cur_epoch > 0 and params.delay_scale > 0.0:
-            delay_loss = compute_distance(old_positive_embed, positive_embed)
+            delay_loss = compute_distance(old_softmax, softmax)

         num_subsampled_frames = memory.shape[0] * memory.shape[1]