https://github.com/k2-fsa/icefall.git
Fix typos.
commit fb63ed627d (parent 5df6040df0)
@@ -29,12 +29,12 @@ from model import Transducer
 # acoustic frame indexes) and the vertical axis is `u` (representing
 # BPE tokens of the transcript).
 #
-# Beam search is used to find the path that with the
+# Beam search is used to find the path with the
 # highest log probabilities.
 #
-# It assumes that the maximum number of symbols that can be
+# It assumes the maximum number of symbols that can be
 # emitted per frame is 1. You can use `--modified-transducer-prob`
-# from train.py to train a model that satisfy this assumption.
+# from `./train.py` to train a model that satisfies this assumption.
 
 
 # AlignItem is a node in the lattice, where its
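The comment block above states the aligner's assumption: with at most one non-blank symbol emitted per frame, every path through the (t, u) lattice is frame-synchronous, and beam search can pick the highest-scoring one. The following is only a rough sketch of that idea, not the icefall implementation; the `score` callback is a hypothetical stand-in for the joiner's log probabilities.

from typing import Callable, List, Tuple


def align(
    tokens: List[int],
    num_frames: int,
    score: Callable[[int, int, int], float],
    beam: int = 4,
) -> List[int]:
    # Each hypothesis is (log_prob, framewise alignment so far); 0 is blank.
    hyps: List[Tuple[float, List[int]]] = [(0.0, [])]
    for t in range(num_frames):
        new_hyps = []
        for log_prob, ys in hyps:
            u = sum(1 for y in ys if y != 0)  # tokens emitted so far
            # Either emit blank and stay at the same u ...
            new_hyps.append((log_prob + score(t, u, 0), ys + [0]))
            # ... or emit exactly one non-blank symbol in this frame.
            if u < len(tokens):
                new_hyps.append(
                    (log_prob + score(t, u, tokens[u]), ys + [tokens[u]])
                )
        hyps = sorted(new_hyps, key=lambda h: h[0], reverse=True)[:beam]
    # A forced alignment is only valid if every token has been emitted.
    finished = [h for h in hyps if sum(1 for y in h[1] if y != 0) == len(tokens)]
    return max(finished, key=lambda h: h[0])[1] if finished else []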
@@ -42,13 +42,13 @@ from model import Transducer
 # in the lattice.
 @dataclass
 class AlignItem:
-    # log prob of this
+    # log prob of this item originating from the start item
     log_prob: float
 
     # It contains framewise token alignment
     ys: List[int]
 
-    # It equals to number of non-zero entries in ys
+    # It equals to the number of non-zero entries in ys
     pos_u: int
 
 
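As the new comments say, `ys` holds the framewise token alignment (0 for blank) and `pos_u` equals the number of non-zero entries in `ys`. A toy instance, with made-up token IDs and log prob, illustrates that invariant:

# Illustration only; the values are made up, not taken from a real alignment.
item = AlignItem(log_prob=-3.2, ys=[0, 5, 0, 0, 9, 0], pos_u=2)
assert item.pos_u == sum(1 for y in item.ys if y != 0)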
@@ -232,13 +232,13 @@ def force_alignment(
     return ans
 
 
-def get_word_begin_frame(
+def get_word_starting_frame(
     ali: List[int], sp: spm.SentencePieceProcessor
 ) -> List[int]:
-    """Get the beginning of each word from the given alignments.
+    """Get the starting frame of each word from the given alignments.
 
     When a word is encoded into BPE tokens, the first token starts
-    with underscore "_", which can be used to identify the beginning
+    with underscore "_", which can be used to identify the starting frame
     of a word.
 
     Args:
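The renamed function relies on SentencePiece marking word-initial pieces with a leading underscore-like symbol ("▁"): a word starts at the first frame whose aligned token maps to such a piece. The sketch below shows that lookup; it is illustrative only and is not the actual body of get_word_starting_frame in alignment.py.

from typing import List

import sentencepiece as spm


def word_starting_frames(ali: List[int], sp: spm.SentencePieceProcessor) -> List[int]:
    """Sketch: frame indexes at which words start; `ali` is framewise, 0 = blank."""
    frames = []
    for frame, token in enumerate(ali):
        if token == 0:
            continue  # blank emits nothing
        # SentencePiece prefixes word-initial pieces with "▁"
        # (the "underscore" mentioned in the docstring above).
        if sp.id_to_piece(token).startswith("▁"):
            frames.append(frame)
    return frames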
@@ -43,7 +43,7 @@ from pathlib import Path
 
 import sentencepiece as spm
 import torch
-from alignment import get_word_begin_frame
+from alignment import get_word_starting_frame
 from lhotse import CutSet, load_manifest
 from lhotse.dataset import K2SpeechRecognitionDataset, SingleCutSampler
 from lhotse.dataset.collation import collate_custom_field
@@ -121,7 +121,7 @@ def main():
 
     # key: cut.id
     # value: a list of pairs (word, time_in_second)
-    word_begin_time_dict = {}
+    word_starting_time_dict = {}
    for batch in dl:
         supervisions = batch["supervisions"]
         cuts = supervisions["cut"]
@@ -135,23 +135,25 @@ def main():
                 (cuts[i].features.num_frames - 1) // 2 - 1
             ) // 2 == token_alignment_length[i]
 
-            word_begin_frame = get_word_begin_frame(
+            word_starting_frame = get_word_starting_frame(
                 token_alignment[i, : token_alignment_length[i]].tolist(), sp=sp
             )
-            word_begin_time = [
+            word_starting_time = [
                 "{:.2f}".format(i * frame_shift_in_second)
-                for i in word_begin_frame
+                for i in word_starting_frame
             ]
 
             words = supervisions["text"][i].split()
 
-            assert len(word_begin_frame) == len(words)
-            word_begin_time_dict[cuts[i].id] = list(zip(words, word_begin_time))
+            assert len(word_starting_frame) == len(words)
+            word_starting_time_dict[cuts[i].id] = list(
+                zip(words, word_starting_time)
+            )
 
         # This is a demo script and we exit here after processing
         # one batch.
-        # You can find word starting time in the dict "word_begin_time_dict"
-        for cut_id, word_time in word_begin_time_dict.items():
+        # You can find word starting time in the dict "word_starting_time_dict"
+        for cut_id, word_time in word_starting_time_dict.items():
             print(f"{cut_id}\n{word_time}\n")
         break
 
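The assertion near the top of this hunk, ((num_frames - 1) // 2 - 1) // 2, reflects the encoder's roughly 4x frame subsampling, so frame_shift_in_second here is the output frame shift, and multiplying a word's starting frame by it yields a timestamp in seconds. A small arithmetic sketch, assuming a 10 ms input frame shift (an assumption for illustration, not a value from this commit):

# Illustration only; the 10 ms input frame shift and the example frame index
# are assumed, not taken from this diff.
input_frame_shift = 0.01       # seconds per input feature frame (assumed)
subsampling_factor = 4         # implied by ((num_frames - 1) // 2 - 1) // 2
frame_shift_in_second = input_frame_shift * subsampling_factor

word_starting_frame = 37       # an example output-frame index
print("{:.2f}".format(word_starting_frame * frame_shift_in_second))  # prints 1.48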