From d400bc5edf3a3510d29497b9a7b6b1d1d8eb730d Mon Sep 17 00:00:00 2001
From: Karel Vesely
Date: Mon, 12 Aug 2024 10:45:57 +0200
Subject: [PATCH] fix the CTC zipformer2 training

- too many supervision tokens
- change filtering rule to `if (T - 2) < len(tokens): return False`
- this prevents inf. from appearing in the CTC loss value

---
 egs/librispeech/ASR/zipformer/train.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/egs/librispeech/ASR/zipformer/train.py b/egs/librispeech/ASR/zipformer/train.py
index 9b6f4a93a..3dbcafb6c 100755
--- a/egs/librispeech/ASR/zipformer/train.py
+++ b/egs/librispeech/ASR/zipformer/train.py
@@ -1300,9 +1300,11 @@ def run(rank, world_size, args):
         T = ((c.num_frames - 7) // 2 + 1) // 2
         tokens = sp.encode(c.supervisions[0].text, out_type=str)
 
-        if T < len(tokens):
+        # For CTC, `(T - 2) < len(tokens)` is needed; otherwise `inf` appears in the loss.
+        # For Transducer, `T < len(tokens)` was okay.
+        if (T - 2) < len(tokens):
             logging.warning(
-                f"Exclude cut with ID {c.id} from training. "
+                f"Exclude cut with ID {c.id} from training (too many supervision tokens). "
                 f"Number of frames (before subsampling): {c.num_frames}. "
                 f"Number of frames (after subsampling): {T}. "
                 f"Text: {c.supervisions[0].text}. "