From d400bc5edf3a3510d29497b9a7b6b1d1d8eb730d Mon Sep 17 00:00:00 2001
From: Karel Vesely
Date: Mon, 12 Aug 2024 10:45:57 +0200
Subject: [PATCH] fix the CTC zipformer2 training

- too many supervision tokens
- change filtering rule to `if (T - 2) < len(tokens): return False`
- this prevents inf. from appearing in the CTC loss value

---
 egs/librispeech/ASR/zipformer/train.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/egs/librispeech/ASR/zipformer/train.py b/egs/librispeech/ASR/zipformer/train.py
index 9b6f4a93a..3dbcafb6c 100755
--- a/egs/librispeech/ASR/zipformer/train.py
+++ b/egs/librispeech/ASR/zipformer/train.py
@@ -1300,9 +1300,11 @@ def run(rank, world_size, args):
         T = ((c.num_frames - 7) // 2 + 1) // 2
         tokens = sp.encode(c.supervisions[0].text, out_type=str)
 
-        if T < len(tokens):
+        # For CTC, `(T - 2) < len(tokens)` is needed; otherwise `inf` appears in the loss.
+        # For Transducer, `T < len(tokens)` was okay.
+        if (T - 2) < len(tokens):
             logging.warning(
-                f"Exclude cut with ID {c.id} from training. "
+                f"Exclude cut with ID {c.id} from training (too many supervision tokens). "
                 f"Number of frames (before subsampling): {c.num_frames}. "
                 f"Number of frames (after subsampling): {T}. "
                 f"Text: {c.supervisions[0].text}. "