diff --git a/egs/librispeech/ASR/pruned_transducer_stateless/train.py b/egs/librispeech/ASR/pruned_transducer_stateless/train.py index f0ea12d62..c360d025a 100755 --- a/egs/librispeech/ASR/pruned_transducer_stateless/train.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless/train.py @@ -811,13 +811,23 @@ def run(rank, world_size, args): train_cuts = train_cuts.filter(remove_short_and_long_utt) - num_left = len(train_cuts) - num_removed = num_in_total - num_left - removed_percent = num_removed / num_in_total * 100 + try: + num_left = len(train_cuts) + num_removed = num_in_total - num_left + removed_percent = num_removed / num_in_total * 100 - logging.info(f"Before removing short and long utterances: {num_in_total}") - logging.info(f"After removing short and long utterances: {num_left}") - logging.info(f"Removed {num_removed} utterances ({removed_percent:.5f}%)") + logging.info( + f"Before removing short and long utterances: {num_in_total}" + ) + logging.info(f"After removing short and long utterances: {num_left}") + logging.info( + f"Removed {num_removed} utterances ({removed_percent:.5f}%)" + ) + except TypeError as e: + # You can ignore this error as previous versions of Lhotse work fine + # for the above code. In recent versions of Lhotse, it uses + # lazy filter, producing cutsets that don't have the __len__ method + logging.info(str(e)) if params.start_batch > 0 and checkpoints and "sampler" in checkpoints: # We only load the sampler's state dict when it loads a checkpoint diff --git a/egs/librispeech/ASR/transducer_stateless/train.py b/egs/librispeech/ASR/transducer_stateless/train.py index d6827c17c..89f754b20 100755 --- a/egs/librispeech/ASR/transducer_stateless/train.py +++ b/egs/librispeech/ASR/transducer_stateless/train.py @@ -653,13 +653,23 @@ def run(rank, world_size, args): train_cuts = train_cuts.filter(remove_short_and_long_utt) - num_left = len(train_cuts) - num_removed = num_in_total - num_left - removed_percent = num_removed / num_in_total * 100 + try: + num_left = len(train_cuts) + num_removed = num_in_total - num_left + removed_percent = num_removed / num_in_total * 100 - logging.info(f"Before removing short and long utterances: {num_in_total}") - logging.info(f"After removing short and long utterances: {num_left}") - logging.info(f"Removed {num_removed} utterances ({removed_percent:.5f}%)") + logging.info( + f"Before removing short and long utterances: {num_in_total}" + ) + logging.info(f"After removing short and long utterances: {num_left}") + logging.info( + f"Removed {num_removed} utterances ({removed_percent:.5f}%)" + ) + except TypeError as e: + # You can ignore this error as previous versions of Lhotse work fine + # for the above code. In recent versions of Lhotse, it uses + # lazy filter, producing cutsets that don't have the __len__ method + logging.info(str(e)) train_dl = librispeech.train_dataloaders(train_cuts) diff --git a/egs/librispeech/ASR/transducer_stateless2/train.py b/egs/librispeech/ASR/transducer_stateless2/train.py index 2111795ea..8ceffb489 100755 --- a/egs/librispeech/ASR/transducer_stateless2/train.py +++ b/egs/librispeech/ASR/transducer_stateless2/train.py @@ -641,13 +641,23 @@ def run(rank, world_size, args): train_cuts = train_cuts.filter(remove_short_and_long_utt) - num_left = len(train_cuts) - num_removed = num_in_total - num_left - removed_percent = num_removed / num_in_total * 100 + try: + num_left = len(train_cuts) + num_removed = num_in_total - num_left + removed_percent = num_removed / num_in_total * 100 - logging.info(f"Before removing short and long utterances: {num_in_total}") - logging.info(f"After removing short and long utterances: {num_left}") - logging.info(f"Removed {num_removed} utterances ({removed_percent:.5f}%)") + logging.info( + f"Before removing short and long utterances: {num_in_total}" + ) + logging.info(f"After removing short and long utterances: {num_left}") + logging.info( + f"Removed {num_removed} utterances ({removed_percent:.5f}%)" + ) + except TypeError as e: + # You can ignore this error as previous versions of Lhotse work fine + # for the above code. In recent versions of Lhotse, it uses + # lazy filter, producing cutsets that don't have the __len__ method + logging.info(str(e)) train_dl = librispeech.train_dataloaders(train_cuts)