# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple

import torch
from model import Transducer


def greedy_search(model: Transducer, encoder_out: torch.Tensor) -> List[int]:
    """
    Args:
      model:
        An instance of `Transducer`.
      encoder_out:
        A tensor of shape (N, T, C) from the encoder. Only N==1 is
        supported for now.
    Returns:
      Return the decoded result, a list of token IDs (without blanks).
    """
    assert encoder_out.ndim == 3

    # support only batch_size == 1 for now
    assert encoder_out.size(0) == 1, encoder_out.size(0)
    blank_id = model.decoder.blank_id
    device = model.device

    # The blank symbol doubles as sos and primes the decoder
    # (i.e., the prediction network).
    sos = torch.tensor([blank_id], device=device, dtype=torch.int64).reshape(1, 1)
    decoder_out, (h, c) = model.decoder(sos)
    T = encoder_out.size(1)
    t = 0
    hyp = []

    sym_per_frame = 0  # symbols emitted at the current frame
    sym_per_utt = 0  # symbols emitted for the whole utterance

    # Safety limits that guarantee termination.
    max_sym_per_utt = 1000
    max_sym_per_frame = 3

    while t < T and sym_per_utt < max_sym_per_utt:
        # fmt: off
        current_encoder_out = encoder_out[:, t:t+1, :]
        # fmt: on
        logits = model.joiner(current_encoder_out, decoder_out)
        # logits is (1, 1, 1, vocab_size)

        # Note: argmax over the raw logits equals argmax over their
        # log_softmax, so there is no need to normalize here.
        y = logits.argmax()
        if y != blank_id:
            hyp.append(y.item())
            y = y.reshape(1, 1)
            decoder_out, (h, c) = model.decoder(y, (h, c))

            sym_per_utt += 1
            sym_per_frame += 1

        if y == blank_id or sym_per_frame > max_sym_per_frame:
            sym_per_frame = 0
            t += 1

    return hyp
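

# Below is a minimal, hedged usage sketch for `greedy_search`. It assumes
# `model.encoder` follows the usual transducer interface, mapping padded
# features of shape (N, T_in, F) plus their lengths to encoder output of
# shape (N, T, C); the helper name and that interface are illustrative,
# not part of this file's API.
def _greedy_search_example(
    model: Transducer,
    features: torch.Tensor,
    feature_lens: torch.Tensor,
) -> List[int]:
    model.eval()
    with torch.no_grad():
        # Assumed encoder interface; adapt to the actual recipe.
        encoder_out, _ = model.encoder(features, feature_lens)
        return greedy_search(model=model, encoder_out=encoder_out)

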
@dataclass
class Hypothesis:
    ys: List[int]  # the predicted sequence so far
    log_prob: float  # the log prob of ys

    # Optional decoder state. We assume it is LSTM for now,
    # so the state is a tuple (h, c)
    decoder_state: Optional[Tuple[torch.Tensor, torch.Tensor]] = None
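

# For illustration: `beam_search` below seeds its beam with
# Hypothesis(ys=[blank_id], log_prob=0.0, decoder_state=None), i.e., the
# blank symbol doubles as sos; each emitted label is appended to `ys` and
# its log-probability is added to `log_prob`.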
def beam_search(
    model: Transducer,
    encoder_out: torch.Tensor,
    beam: int = 5,
) -> List[int]:
    """
    It implements Algorithm 1 in https://arxiv.org/pdf/1211.3711.pdf
    (Graves, 2012: Sequence Transduction with Recurrent Neural Networks).

    espnet/nets/beam_search_transducer.py#L247 is used as a reference.

    Args:
      model:
        An instance of `Transducer`.
      encoder_out:
        A tensor of shape (N, T, C) from the encoder. Only N==1 is
        supported for now.
      beam:
        Beam size.
    Returns:
      Return the decoded result, a list of token IDs (without blanks).
    """
    assert encoder_out.ndim == 3

    # support only batch_size == 1 for now
    assert encoder_out.size(0) == 1, encoder_out.size(0)
    blank_id = model.decoder.blank_id
    device = model.device

    T = encoder_out.size(1)
    t = 0
    # The blank symbol doubles as sos, so the initial hypothesis starts
    # with it; its decoder state is computed lazily inside the loop.
    B = [Hypothesis(ys=[blank_id], log_prob=0.0, decoder_state=None)]
    max_u = 20000  # terminate after this number of steps
    u = 0

    # Cache the decoder output and state for each distinct label prefix,
    # keyed by the prefix rendered as a string.
    cache: Dict[str, Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]] = {}

    while t < T and u < max_u:
        # fmt: off
        current_encoder_out = encoder_out[:, t:t+1, :]
        # fmt: on
        A = B
        B = []

        # TODO: The commented-out block below is the prefix-extension step
        # of Algorithm 1, which boosts the score of a hypothesis by the
        # probability of reaching it from a proper prefix in A; it is not
        # enabled yet.
        #  for hyp in A:
        #      for h in A:
        #          if h.ys == hyp.ys[:-1]:
        #              # update the score of hyp
        #              decoder_input = torch.tensor(
        #                  [h.ys[-1]], device=device
        #              ).reshape(1, 1)
        #              decoder_out, _ = model.decoder(
        #                  decoder_input, h.decoder_state
        #              )
        #              logits = model.joiner(current_encoder_out, decoder_out)
        #              log_prob = logits.log_softmax(dim=-1)
        #              log_prob = log_prob.squeeze()
        #              hyp.log_prob += h.log_prob + log_prob[hyp.ys[-1]].item()

        while u < max_u:
            y_star = max(A, key=lambda hyp: hyp.log_prob)
            A.remove(y_star)

            # Note: y_star.ys is a list, which is unhashable and so cannot
            # be used directly as a dict key.
            cached_key = "_".join(map(str, y_star.ys))

            if cached_key not in cache:
                decoder_input = torch.tensor([y_star.ys[-1]], device=device).reshape(
                    1, 1
                )

                decoder_out, decoder_state = model.decoder(
                    decoder_input,
                    y_star.decoder_state,
                )
                cache[cached_key] = (decoder_out, decoder_state)
            else:
                decoder_out, decoder_state = cache[cached_key]

            logits = model.joiner(current_encoder_out, decoder_out)
            log_prob = logits.log_softmax(dim=-1)
            # log_prob is (1, 1, 1, vocab_size)
            log_prob = log_prob.squeeze()
            # Now log_prob is (vocab_size,)

            # If we choose blank here, add the new hypothesis to B;
            # otherwise, add the new hypothesis to A.

            # First, choose blank
            skip_log_prob = log_prob[blank_id]
            new_y_star_log_prob = y_star.log_prob + skip_log_prob.item()

            # ys[:] returns a copy of ys
            new_y_star = Hypothesis(
                ys=y_star.ys[:],
                log_prob=new_y_star_log_prob,
                # Caution: Use y_star.decoder_state here, since emitting
                # blank does not advance the decoder.
                decoder_state=y_star.decoder_state,
            )
            B.append(new_y_star)

            # Second, choose other labels
            for i, v in enumerate(log_prob.tolist()):
                if i == blank_id:
                    continue
                new_ys = y_star.ys + [i]
                new_log_prob = y_star.log_prob + v
                new_hyp = Hypothesis(
                    ys=new_ys,
                    log_prob=new_log_prob,
                    decoder_state=decoder_state,
                )
                A.append(new_hyp)
            u += 1

            # Check whether B contains more than `beam` elements that are
            # more probable than the most probable hypothesis in A.
            A_most_probable = max(A, key=lambda hyp: hyp.log_prob)
            B = sorted(
                [hyp for hyp in B if hyp.log_prob > A_most_probable.log_prob],
                key=lambda hyp: hyp.log_prob,
                reverse=True,
            )
            if len(B) >= beam:
                B = B[:beam]
                break
        t += 1

    # Normalize by length (excluding the leading blank used as sos) so
    # that longer hypotheses are not unfairly penalized.
    best_hyp = max(B, key=lambda hyp: hyp.log_prob / len(hyp.ys[1:]))
    ys = best_hyp.ys[1:]  # [1:] to remove the blank
    return ys
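

# A minimal, hedged sketch of how one might compare the two decoders on a
# single utterance. As above, the encoder interface and the helper name
# are assumptions for illustration, not part of this file's API.
def _compare_decoders_example(
    model: Transducer,
    features: torch.Tensor,
    feature_lens: torch.Tensor,
    beam: int = 5,
) -> Tuple[List[int], List[int]]:
    model.eval()
    with torch.no_grad():
        # Assumed encoder interface; adapt to the actual recipe.
        encoder_out, _ = model.encoder(features, feature_lens)
        greedy_hyp = greedy_search(model=model, encoder_out=encoder_out)
        beam_hyp = beam_search(model=model, encoder_out=encoder_out, beam=beam)
    return greedy_hyp, beam_hyp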