Fix the CTC zipformer2 training

- exclude cuts whose supervisions have too many tokens
- change the filtering rule to `if (T - 2) < len(tokens): return False`
- this prevents infinite (inf) values from appearing in the CTC loss
This commit is contained in:
Karel Vesely 2024-08-12 10:45:57 +02:00
parent 3b257dd5ae
commit d400bc5edf

View File

@ -1300,9 +1300,11 @@ def run(rank, world_size, args):
 T = ((c.num_frames - 7) // 2 + 1) // 2
 tokens = sp.encode(c.supervisions[0].text, out_type=str)
-if T < len(tokens):
+# For CTC `(T - 2) < len(tokens)` is needed, otherwise inf appears in the loss.
+# For Transducer, `T < len(tokens)` was okay.
+if (T - 2) < len(tokens):
     logging.warning(
-        f"Exclude cut with ID {c.id} from training. "
+        f"Exclude cut with ID {c.id} from training (too many supervision tokens). "
         f"Number of frames (before subsampling): {c.num_frames}. "
         f"Number of frames (after subsampling): {T}. "
         f"Text: {c.supervisions[0].text}. "