Minor fixes.

2025-12-11 06:55:27 +00:00 · 2022-04-14 12:08:39 +08:00 · 2022-04-14 12:08:39 +08:00 · ec9bbf7352
commit ec9bbf7352
parent 04d4423615
3 changed files with 48 additions and 18 deletions
--- a/egs/librispeech/ASR/pruned_transducer_stateless/train.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless/train.py
@@ -811,13 +811,23 @@ def run(rank, world_size, args):
    train_cuts = train_cuts.filter(remove_short_and_long_utt)
-    num_left = len(train_cuts)
+    try:
-    num_removed = num_in_total - num_left
+        num_left = len(train_cuts)
-    removed_percent = num_removed / num_in_total * 100
+        num_removed = num_in_total - num_left
        removed_percent = num_removed / num_in_total * 100
-    logging.info(f"Before removing short and long utterances: {num_in_total}")
+        logging.info(
-    logging.info(f"After removing short and long utterances: {num_left}")
+            f"Before removing short and long utterances: {num_in_total}"
-    logging.info(f"Removed {num_removed} utterances ({removed_percent:.5f}%)")
+        )
        logging.info(f"After removing short and long utterances: {num_left}")
        logging.info(
            f"Removed {num_removed} utterances ({removed_percent:.5f}%)"
        )
    except TypeError as e:
        # This error can be safely ignored. Older versions of Lhotse support
        # len() on the filtered cut set, but recent versions apply the filter
        # lazily, producing cut sets that do not define the __len__ method.
        logging.info(str(e))
    if params.start_batch > 0 and checkpoints and "sampler" in checkpoints:
        # We only load the sampler's state dict when it loads a checkpoint
--- a/egs/librispeech/ASR/transducer_stateless/train.py
+++ b/egs/librispeech/ASR/transducer_stateless/train.py
@@ -653,13 +653,23 @@ def run(rank, world_size, args):
    train_cuts = train_cuts.filter(remove_short_and_long_utt)
-    num_left = len(train_cuts)
+    try:
-    num_removed = num_in_total - num_left
+        num_left = len(train_cuts)
-    removed_percent = num_removed / num_in_total * 100
+        num_removed = num_in_total - num_left
        removed_percent = num_removed / num_in_total * 100
-    logging.info(f"Before removing short and long utterances: {num_in_total}")
+        logging.info(
-    logging.info(f"After removing short and long utterances: {num_left}")
+            f"Before removing short and long utterances: {num_in_total}"
-    logging.info(f"Removed {num_removed} utterances ({removed_percent:.5f}%)")
+        )
        logging.info(f"After removing short and long utterances: {num_left}")
        logging.info(
            f"Removed {num_removed} utterances ({removed_percent:.5f}%)"
        )
    except TypeError as e:
        # This error can be safely ignored. Older versions of Lhotse support
        # len() on the filtered cut set, but recent versions apply the filter
        # lazily, producing cut sets that do not define the __len__ method.
        logging.info(str(e))
    train_dl = librispeech.train_dataloaders(train_cuts)
--- a/egs/librispeech/ASR/transducer_stateless2/train.py
+++ b/egs/librispeech/ASR/transducer_stateless2/train.py
@@ -641,13 +641,23 @@ def run(rank, world_size, args):
    train_cuts = train_cuts.filter(remove_short_and_long_utt)
-    num_left = len(train_cuts)
+    try:
-    num_removed = num_in_total - num_left
+        num_left = len(train_cuts)
-    removed_percent = num_removed / num_in_total * 100
+        num_removed = num_in_total - num_left
        removed_percent = num_removed / num_in_total * 100
-    logging.info(f"Before removing short and long utterances: {num_in_total}")
+        logging.info(
-    logging.info(f"After removing short and long utterances: {num_left}")
+            f"Before removing short and long utterances: {num_in_total}"
-    logging.info(f"Removed {num_removed} utterances ({removed_percent:.5f}%)")
+        )
        logging.info(f"After removing short and long utterances: {num_left}")
        logging.info(
            f"Removed {num_removed} utterances ({removed_percent:.5f}%)"
        )
    except TypeError as e:
        # This error can be safely ignored. Older versions of Lhotse support
        # len() on the filtered cut set, but recent versions apply the filter
        # lazily, producing cut sets that do not define the __len__ method.
        logging.info(str(e))
    train_dl = librispeech.train_dataloaders(train_cuts)