add ctc-greedy-search with timestamps (#905)

This commit is contained in:
Zengwei Yao 2023-02-13 19:45:09 +08:00 committed by GitHub
parent 6a8b649e56
commit 25ee50e27c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -92,7 +92,10 @@ from icefall.decode import (
from icefall.lexicon import Lexicon from icefall.lexicon import Lexicon
from icefall.utils import ( from icefall.utils import (
AttributeDict, AttributeDict,
convert_timestamp,
get_texts, get_texts,
make_pad_mask,
parse_bpe_start_end_pairs,
parse_fsa_timestamps_and_texts, parse_fsa_timestamps_and_texts,
setup_logger, setup_logger,
store_transcripts_and_timestamps, store_transcripts_and_timestamps,
@ -167,21 +170,24 @@ def get_parser():
default="ctc-decoding", default="ctc-decoding",
help="""Decoding method. help="""Decoding method.
Supported values are: Supported values are:
- (0) ctc-decoding. Use CTC decoding. It uses a sentence piece - (0) ctc-greedy-search. It uses a sentence piece model,
i.e., lang_dir/bpe.model, to convert word pieces to words.
It needs neither a lexicon nor an n-gram LM.
- (1) ctc-decoding. Use CTC decoding. It uses a sentence piece
model, i.e., lang_dir/bpe.model, to convert word pieces to words. model, i.e., lang_dir/bpe.model, to convert word pieces to words.
It needs neither a lexicon nor an n-gram LM. It needs neither a lexicon nor an n-gram LM.
- (1) 1best. Extract the best path from the decoding lattice as the - (2) 1best. Extract the best path from the decoding lattice as the
decoding result. decoding result.
- (2) nbest. Extract n paths from the decoding lattice; the path - (3) nbest. Extract n paths from the decoding lattice; the path
with the highest score is the decoding result. with the highest score is the decoding result.
- (3) nbest-rescoring. Extract n paths from the decoding lattice, - (4) nbest-rescoring. Extract n paths from the decoding lattice,
rescore them with an n-gram LM (e.g., a 4-gram LM), the path with rescore them with an n-gram LM (e.g., a 4-gram LM), the path with
the highest score is the decoding result. the highest score is the decoding result.
- (4) whole-lattice-rescoring. Rescore the decoding lattice with an - (5) whole-lattice-rescoring. Rescore the decoding lattice with an
n-gram LM (e.g., a 4-gram LM), the best path of rescored lattice n-gram LM (e.g., a 4-gram LM), the best path of rescored lattice
is the decoding result. is the decoding result.
you have trained an RNN LM using ./rnn_lm/train.py you have trained an RNN LM using ./rnn_lm/train.py
- (5) nbest-oracle. Its WER is the lower bound of any n-best - (6) nbest-oracle. Its WER is the lower bound of any n-best
rescoring method can achieve. Useful for debugging n-best rescoring method can achieve. Useful for debugging n-best
rescoring method. rescoring method.
""", """,
@ -269,6 +275,101 @@ def get_decoding_params() -> AttributeDict:
return params return params
def ctc_greedy_search(
    ctc_probs: torch.Tensor,
    nnet_output_lens: torch.Tensor,
    sp: spm.SentencePieceProcessor,
    subsampling_factor: int = 4,
    frame_shift_ms: float = 10,
) -> Tuple[List[Tuple[float, float]], List[List[str]]]:
    """Apply CTC greedy search: take the best token per frame, collapse
    repeats/blanks, and recover word-level (start, end) timestamps.

    Args:
      ctc_probs (torch.Tensor):
        (batch, max_len, feat_dim)
      nnet_output_lens (torch.Tensor):
        (batch, )
      sp:
        The BPE model.
      subsampling_factor:
        The subsampling factor of the model.
      frame_shift_ms:
        Frame shift in milliseconds between two contiguous frames.
    Returns:
      utt_time_pairs:
        A list of pair list. utt_time_pairs[i] is a list of
        (start-time, end-time) pairs for each word in
        utterance-i.
      utt_words:
        A list of str list. utt_words[i] is a word list of utterance-i.
    """
    # Best token id per frame; token id 0 is treated as the CTC blank below.
    topk_prob, topk_index = ctc_probs.topk(1, dim=2)  # (B, maxlen, 1)
    topk_index = topk_index.squeeze(2)  # (B, maxlen)
    # Force padding frames (beyond each utterance's length) to blank.
    mask = make_pad_mask(nnet_output_lens)
    topk_index = topk_index.masked_fill_(mask, 0)  # (B, maxlen)
    hyps = [hyp.tolist() for hyp in topk_index]

    def get_first_tokens(tokens: List[int]) -> Tuple[List[int], List[bool]]:
        """Return (a) the collapsed non-blank token ids and (b) a per-frame
        flag telling whether that frame starts a new collapsed token."""
        is_first_token = []
        first_tokens = []
        for t, token in enumerate(tokens):
            # A frame starts a new token if it is non-blank and differs
            # from the previous frame's token.
            if token != 0 and (t == 0 or tokens[t - 1] != token):
                is_first_token.append(True)
                first_tokens.append(token)
            else:
                is_first_token.append(False)
        return first_tokens, is_first_token

    utt_time_pairs = []
    utt_words = []
    for hyp in hyps:
        first_tokens, is_first_token = get_first_tokens(hyp)
        all_tokens = sp.id_to_piece(hyp)
        # Frame-index (start, end) pairs, one per word, derived from the
        # BPE word-boundary markers.
        index_pairs = parse_bpe_start_end_pairs(all_tokens, is_first_token)
        words = sp.decode(first_tokens).split()
        assert len(index_pairs) == len(words), (
            len(index_pairs),
            len(words),
            all_tokens,
        )
        start = convert_timestamp(
            frames=[i[0] for i in index_pairs],
            subsampling_factor=subsampling_factor,
            frame_shift_ms=frame_shift_ms,
        )
        end = convert_timestamp(
            # The duration in frames is (end_frame_index - start_frame_index + 1)
            frames=[i[1] + 1 for i in index_pairs],
            subsampling_factor=subsampling_factor,
            frame_shift_ms=frame_shift_ms,
        )
        utt_time_pairs.append(list(zip(start, end)))
        utt_words.append(words)

    return utt_time_pairs, utt_words
def remove_duplicates_and_blank(
    hyp: List[int],
) -> Tuple[List[int], List[Tuple[int, int]]]:
    """Collapse consecutive repeated tokens and drop CTC blanks (id 0).

    Modified from
    https://github.com/wenet-e2e/wenet/blob/main/wenet/utils/common.py

    Args:
      hyp:
        Frame-level token ids; 0 is the blank token.
    Returns:
      A tuple (tokens, times):
        tokens: the de-duplicated non-blank token ids.
        times: for each kept token, the inclusive (start_frame, end_frame)
          index range of its run of repeated frames in ``hyp``.
    """
    tokens: List[int] = []
    times: List[Tuple[int, int]] = []
    num_frames = len(hyp)
    idx = 0
    while idx < num_frames:
        token = hyp[idx]
        run_start = idx
        # Advance past the whole run of identical tokens.
        while idx < num_frames and hyp[idx] == token:
            idx += 1
        # Keep only non-blank runs, remembering their frame span.
        if token != 0:
            tokens.append(token)
            times.append((run_start, idx - 1))
    return tokens, times
def decode_one_batch( def decode_one_batch(
params: AttributeDict, params: AttributeDict,
model: nn.Module, model: nn.Module,
@ -360,6 +461,17 @@ def decode_one_batch(
nnet_output = model.get_ctc_output(encoder_out) nnet_output = model.get_ctc_output(encoder_out)
# nnet_output is (N, T, C) # nnet_output is (N, T, C)
if params.decoding_method == "ctc-greedy-search":
timestamps, hyps = ctc_greedy_search(
ctc_probs=nnet_output,
nnet_output_lens=encoder_out_lens,
sp=bpe_model,
subsampling_factor=params.subsampling_factor,
frame_shift_ms=params.frame_shift_ms,
)
key = "ctc-greedy-search"
return {key: (hyps, timestamps)}
supervision_segments = torch.stack( supervision_segments = torch.stack(
( (
supervisions["sequence_idx"], supervisions["sequence_idx"],
@ -696,6 +808,7 @@ def main():
params.update(vars(args)) params.update(vars(args))
assert params.decoding_method in ( assert params.decoding_method in (
"ctc-greedy-search",
"ctc-decoding", "ctc-decoding",
"1best", "1best",
"nbest", "nbest",
@ -749,7 +862,7 @@ def main():
params.sos_id = sos_id params.sos_id = sos_id
params.eos_id = eos_id params.eos_id = eos_id
if params.decoding_method == "ctc-decoding": if params.decoding_method in ["ctc-decoding", "ctc-greedy-search"]:
HLG = None HLG = None
H = k2.ctc_topo( H = k2.ctc_topo(
max_token=max_token_id, max_token=max_token_id,