diff --git a/egs/librispeech/ASR/conformer_lm/conformer.py b/egs/librispeech/ASR/conformer_lm/conformer.py
index 1963056cc..fe0a5eec9 100644
--- a/egs/librispeech/ASR/conformer_lm/conformer.py
+++ b/egs/librispeech/ASR/conformer_lm/conformer.py
@@ -21,7 +21,7 @@ class MaskedLmConformer(nn.Module):
         d_model: int = 256,
         nhead: int = 4,
         dim_feedforward: int = 2048,
-        num_encoder_layers: int = 12,
+        num_encoder_layers: int = 6,
         num_decoder_layers: int = 6,
         dropout: float = 0.1,
         cnn_module_kernel: int = 31,
diff --git a/egs/librispeech/ASR/conformer_lm/train.py b/egs/librispeech/ASR/conformer_lm/train.py
index 66602ea1d..e8a5c8888 100755
--- a/egs/librispeech/ASR/conformer_lm/train.py
+++ b/egs/librispeech/ASR/conformer_lm/train.py
@@ -317,7 +317,7 @@ def compute_validation_loss(
             break
 
         batch = tuple(x.to(device) for x in batch)
-        # `batch` is actually a tuple.. we'll unpack it later.
+
         loss = compute_loss(model, batch, is_training=False)
 
         num_frames = batch[4].sum()
@@ -390,17 +390,23 @@ def train_one_epoch(
         params.batch_idx_train += 1
 
         batch = tuple(x.to(device) for x in batch)
-        loss = compute_loss(
-            model=model,
+        try:
+            loss = compute_loss(
+                model=model,
             batch=batch,
-            is_training=True,
-        )
+                is_training=True,
+            )
+
+            optimizer.zero_grad()
+            loss.backward()
+            # We are not normalizing by the num-frames, but Adam/Madam are insensitive to the total
+            # gradient scale so this should not matter.
+            # clip_grad_norm_(model.parameters(), 5.0, 2.0)
+            optimizer.step()
+        except RuntimeError as e:
+            print(f"Error on batch of shape (N,T) = {batch[0].shape}")
+            raise e
-        optimizer.zero_grad()
-        loss.backward()
         # We are not normalizing by the num-frames, but Adam/Madam are insensitive to the total
-        # gradient scale so this should not matter.
-        # clip_grad_norm_(model.parameters(), 5.0, 2.0)
-        optimizer.step()
 
         loss_cpu = loss.detach().cpu().item()
         num_frames_cpu = batch[4].sum().cpu().item()
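
The comment carried over in the patch asserts that Adam/Madam optimizers are insensitive to the total gradient scale, which is why the loss is deliberately not normalized by the number of frames. A minimal sketch of that claim (not part of the patch; torch.optim.Adam stands in for the Madam optimizer the comment references, and the toy objective is hypothetical):

import torch

# Take one Adam step on the same toy objective, with the loss scaled by 1x
# and by 1000x. Adam divides the gradient by its running RMS, so a constant
# scale cancels (up to the eps term) and both runs move the weights almost
# identically -- hence skipping per-frame normalization is harmless here.
for scale in (1.0, 1000.0):
    torch.manual_seed(0)
    w = torch.nn.Parameter(torch.randn(4))
    opt = torch.optim.Adam([w], lr=0.1)
    loss = scale * (w**2).sum()
    opt.zero_grad()
    loss.backward()
    opt.step()
    print(f"scale={scale}: w after one step = {w.detach()}")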