icefall/egs/librispeech/ASR/transducer/beam_search.py

# Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple

import torch
from model import Transducer


def greedy_search(model: Transducer, encoder_out: torch.Tensor) -> List[int]:
    """
    Args:
      model:
        An instance of `Transducer`.
      encoder_out:
        A tensor of shape (N, T, C) from the encoder. Support only N==1 for now.
    Returns:
      Return the decoded result.
    """
    assert encoder_out.ndim == 3

    # support only batch_size == 1 for now
    assert encoder_out.size(0) == 1, encoder_out.size(0)
    blank_id = model.decoder.blank_id
    device = model.device

    sos = torch.tensor([blank_id], device=device, dtype=torch.int64).reshape(1, 1)
    decoder_out, (h, c) = model.decoder(sos)
    T = encoder_out.size(1)
    t = 0
    hyp = []

    sym_per_frame = 0
    sym_per_utt = 0

    max_sym_per_utt = 1000
    max_sym_per_frame = 3

    while t < T and sym_per_utt < max_sym_per_utt:
        # fmt: off
        current_encoder_out = encoder_out[:, t:t+1, :]
        # fmt: on
        logits = model.joiner(current_encoder_out, decoder_out)
        # logits is (1, 1, 1, vocab_size)

        log_prob = logits.log_softmax(dim=-1)
        # log_prob is (1, 1, 1, vocab_size)
        # TODO: Use logits.argmax()
        y = log_prob.argmax()
        if y != blank_id:
            hyp.append(y.item())
            y = y.reshape(1, 1)
            decoder_out, (h, c) = model.decoder(y, (h, c))

            sym_per_utt += 1
            sym_per_frame += 1

        if y == blank_id or sym_per_frame > max_sym_per_frame:
            sym_per_frame = 0
            t += 1

    return hyp


@dataclass
class Hypothesis:
    ys: List[int]  # the predicted sequences so far
    log_prob: float  # The log prob of ys

    # Optional decoder state. We assume it is LSTM for now,
    # so the state is a tuple (h, c)
    decoder_state: Optional[Tuple[torch.Tensor, torch.Tensor]] = None


def beam_search(
    model: Transducer,
    encoder_out: torch.Tensor,
    beam: int = 5,
) -> List[int]:
    """
    It implements Algorithm 1 in https://arxiv.org/pdf/1211.3711.pdf

    espnet/nets/beam_search_transducer.py#L247 is used as a reference.

    Args:
      model:
        An instance of `Transducer`.
      encoder_out:
        A tensor of shape (N, T, C) from the encoder. Support only N==1 for now.
      beam:
        Beam size.
    Returns:
      Return the decoded result.
    """
    assert encoder_out.ndim == 3

    # support only batch_size == 1 for now
    assert encoder_out.size(0) == 1, encoder_out.size(0)
    blank_id = model.decoder.blank_id
    device = model.device

    sos = torch.tensor([blank_id], device=device).reshape(1, 1)
    decoder_out, (h, c) = model.decoder(sos)
    T = encoder_out.size(1)
    t = 0
    B = [Hypothesis(ys=[blank_id], log_prob=0.0, decoder_state=None)]
    max_u = 20000  # terminate after this number of steps
    u = 0

    cache: Dict[str, Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]] = {}

    while t < T and u < max_u:
        # fmt: off
        current_encoder_out = encoder_out[:, t:t+1, :]
        # fmt: on
        A = B
        B = []
        #  for hyp in A:
        #      for h in A:
        #          if h.ys == hyp.ys[:-1]:
        #              # update the score of hyp
        #              decoder_input = torch.tensor(
        #                  [h.ys[-1]], device=device
        #              ).reshape(1, 1)
        #              decoder_out, _ = model.decoder(
        #                  decoder_input, h.decoder_state
        #              )
        #              logits = model.joiner(current_encoder_out, decoder_out)
        #              log_prob = logits.log_softmax(dim=-1)
        #              log_prob = log_prob.squeeze()
        #              hyp.log_prob += h.log_prob + log_prob[hyp.ys[-1]].item()

        while u < max_u:
            y_star = max(A, key=lambda hyp: hyp.log_prob)
            A.remove(y_star)

            # Note: y_star.ys is unhashable, i.e., cannot be used
            # as a key into a dict
            cached_key = "_".join(map(str, y_star.ys))

            if cached_key not in cache:
                decoder_input = torch.tensor([y_star.ys[-1]], device=device).reshape(
                    1, 1
                )

                decoder_out, decoder_state = model.decoder(
                    decoder_input,
                    y_star.decoder_state,
                )
                cache[cached_key] = (decoder_out, decoder_state)
            else:
                decoder_out, decoder_state = cache[cached_key]

            logits = model.joiner(current_encoder_out, decoder_out)
            log_prob = logits.log_softmax(dim=-1)
            # log_prob is (1, 1, 1, vocab_size)
            log_prob = log_prob.squeeze()
            # Now log_prob is (vocab_size,)

            # If we choose blank here, add the new hypothesis to B.
            # Otherwise, add the new hypothesis to A

            # First, choose blank
            skip_log_prob = log_prob[blank_id]
            new_y_star_log_prob = y_star.log_prob + skip_log_prob.item()

            # ys[:] returns a copy of ys
            new_y_star = Hypothesis(
                ys=y_star.ys[:],
                log_prob=new_y_star_log_prob,
                # Caution: Use y_star.decoder_state here
                decoder_state=y_star.decoder_state,
            )
            B.append(new_y_star)

            # Second, choose other labels
            for i, v in enumerate(log_prob.tolist()):
                if i == blank_id:
                    continue
                new_ys = y_star.ys + [i]
                new_log_prob = y_star.log_prob + v
                new_hyp = Hypothesis(
                    ys=new_ys,
                    log_prob=new_log_prob,
                    decoder_state=decoder_state,
                )
                A.append(new_hyp)
            u += 1
            # check whether B contains more than "beam" elements more probable
            # than the most probable in A
            A_most_probable = max(A, key=lambda hyp: hyp.log_prob)
            B = sorted(
                [hyp for hyp in B if hyp.log_prob > A_most_probable.log_prob],
                key=lambda hyp: hyp.log_prob,
                reverse=True,
            )
            if len(B) >= beam:
                B = B[:beam]
                break
        t += 1
    best_hyp = max(B, key=lambda hyp: hyp.log_prob / len(hyp.ys[1:]))
    ys = best_hyp.ys[1:]  # [1:] to remove the blank
    return ys