Add changes from master to decode.py, train.py

Daniel Povey 2022-03-24 13:10:54 +08:00
parent 5d9dae3064
commit aab72bc2a5
2 changed files with 36 additions and 10 deletions

decode.py

@@ -71,6 +71,7 @@ from beam_search import (
     beam_search,
     fast_beam_search,
     greedy_search,
+    greedy_search_batch,
     modified_beam_search,
 )
 from train import get_params, get_transducer_model
@@ -191,7 +192,7 @@ def get_parser():
     parser.add_argument(
         "--max-sym-per-frame",
         type=int,
-        default=3,
+        default=1,
         help="""Maximum number of symbols per frame.
         Used only when --decoding_method is greedy_search""",
     )
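Note on the new default: --max-sym-per-frame caps how many non-blank symbols greedy search may emit before it must advance to the next encoder frame. A self-contained toy sketch of that cap, with made-up per-frame token data (this is illustrative only, not icefall's implementation):

# Toy illustration of the --max-sym-per-frame cap; all data is made up.
blank_id = 0
frames = [[3, 2], [0], [1, 1, 1]]  # pretend best-token queue per frame
max_sym_per_frame = 1              # the new default

hyp = []
for queue in frames:
    emitted = 0
    for tok in queue:
        if tok == blank_id or emitted >= max_sym_per_frame:
            break  # blank or cap reached: move on to the next frame
        hyp.append(tok)
        emitted += 1
print(hyp)  # [3, 1]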
@@ -261,6 +262,24 @@ def decode_one_batch(
         )
         for hyp in sp.decode(hyp_tokens):
             hyps.append(hyp.split())
+    elif (
+        params.decoding_method == "greedy_search"
+        and params.max_sym_per_frame == 1
+    ):
+        hyp_tokens = greedy_search_batch(
+            model=model,
+            encoder_out=encoder_out,
+        )
+        for hyp in sp.decode(hyp_tokens):
+            hyps.append(hyp.split())
+    elif params.decoding_method == "modified_beam_search":
+        hyp_tokens = modified_beam_search(
+            model=model,
+            encoder_out=encoder_out,
+            beam=params.beam_size,
+        )
+        for hyp in sp.decode(hyp_tokens):
+            hyps.append(hyp.split())
     else:
         batch_size = encoder_out.size(0)
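Why the batched path is gated on max_sym_per_frame == 1: with at most one symbol per frame, every utterance consumes encoder frames in lockstep, so the whole batch can be decoded together instead of one utterance at a time. A hedged, self-contained toy of that lockstep idea (CTC-style, ignoring the transducer's label feedback; none of these names are icefall's API):

import torch

def toy_greedy_search_batch(logits: torch.Tensor, blank_id: int = 0):
    # logits: (N, T, V) scores for N utterances, T frames, V tokens.
    # One argmax per frame per stream: the max_sym_per_frame == 1 regime.
    best = logits.argmax(dim=-1)  # (N, T)
    return [
        [tok for tok in stream if tok != blank_id]
        for stream in best.tolist()
    ]

hyps = toy_greedy_search_batch(torch.randn(2, 5, 4))  # 2 utterances at once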
@@ -280,12 +299,6 @@ def decode_one_batch(
                     encoder_out=encoder_out_i,
                     beam=params.beam_size,
                 )
-            elif params.decoding_method == "modified_beam_search":
-                hyp = modified_beam_search(
-                    model=model,
-                    encoder_out=encoder_out_i,
-                    beam=params.beam_size,
-                )
             else:
                 raise ValueError(
                     f"Unsupported decoding method: {params.decoding_method}"

train.py

@@ -398,13 +398,17 @@ def load_checkpoint_if_available(
         "batch_idx_train",
         "best_train_loss",
         "best_valid_loss",
+        "cur_batch_idx",
     ]
     for k in keys:
         params[k] = saved_params[k]

+    if params.start_batch > 0:
+        if "cur_epoch" in saved_params:
+            params["start_epoch"] = saved_params["cur_epoch"]
+
+        if "cur_batch_idx" in saved_params:
+            params["cur_batch_idx"] = saved_params["cur_batch_idx"]
+
     return saved_params
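A small sketch of what the new block does when resuming from a checkpoint saved mid-epoch; the plain dicts stand in for the real params object and loaded checkpoint:

# Stand-ins for params and a checkpoint written partway through epoch 7.
saved_params = {"cur_epoch": 7, "cur_batch_idx": 1200}
params = {"start_epoch": 1, "start_batch": 1200, "cur_batch_idx": 0}

if params["start_batch"] > 0:
    if "cur_epoch" in saved_params:
        # Resume inside the same epoch the checkpoint was written in.
        params["start_epoch"] = saved_params["cur_epoch"]
    if "cur_batch_idx" in saved_params:
        # Lets the training loop skip batches already seen this epoch.
        params["cur_batch_idx"] = saved_params["cur_batch_idx"]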
@@ -762,11 +766,20 @@ def run(rank, world_size, args):
     def remove_short_and_long_utt(c: Cut):
         # Keep only utterances with duration between 1 second and 20 seconds
+        #
+        # Caution: There is a reason to select 20.0 here. Please see
+        # ../local/display_manifest_statistics.py
+        #
+        # You should use ../local/display_manifest_statistics.py to get
+        # an utterance duration distribution for your dataset to select
+        # the threshold
         return 1.0 <= c.duration <= 20.0

     train_cuts = train_cuts.filter(remove_short_and_long_utt)

-    if checkpoints and "sampler" in checkpoints:
+    if params.start_batch > 0 and checkpoints and "sampler" in checkpoints:
+        # We only load the sampler's state dict when it loads a checkpoint
+        # saved in the middle of an epoch
         sampler_state_dict = checkpoints["sampler"]
     else:
         sampler_state_dict = None
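The duration filter itself is plain lhotse; a minimal standalone sketch (the manifest path is hypothetical):

from lhotse import CutSet

def remove_short_and_long_utt(c) -> bool:
    # Same predicate as above: keep cuts between 1 s and 20 s.
    return 1.0 <= c.duration <= 20.0

cuts = CutSet.from_file("data/fbank/cuts_train.jsonl.gz")  # hypothetical path
cuts = cuts.filter(remove_short_and_long_utt)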