From fb63ed627d185a8d936c45b5570140baee0d24b0 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Mon, 7 Mar 2022 16:19:06 +0800 Subject: [PATCH] Fix typos. --- .../ASR/transducer_stateless/alignment.py | 16 +++++++-------- .../transducer_stateless/test_compute_ali.py | 20 ++++++++++--------- 2 files changed, 19 insertions(+), 17 deletions(-) diff --git a/egs/librispeech/ASR/transducer_stateless/alignment.py b/egs/librispeech/ASR/transducer_stateless/alignment.py index 492a6fc51..a1101afe4 100644 --- a/egs/librispeech/ASR/transducer_stateless/alignment.py +++ b/egs/librispeech/ASR/transducer_stateless/alignment.py @@ -29,12 +29,12 @@ from model import Transducer # acoustic frame indexes) and the vertical axis is `u` (representing # BPE tokens of the transcript). # -# Beam search is used to find the path that with the +# Beam search is used to find the path with the # highest log probabilities. # -# It assumes that the maximum number of symbols that can be +# It assumes the maximum number of symbols that can be # emitted per frame is 1. You can use `--modified-transducer-prob` -# from train.py to train a model that satisfy this assumption. +# from `./train.py` to train a model that satisfies this assumption. # AlignItem is a node in the lattice, where its @@ -42,13 +42,13 @@ from model import Transducer # in the lattice. @dataclass class AlignItem: - # log prob of this + # log prob of this item originating from the start item log_prob: float # It contains framewise token alignment ys: List[int] - # It equals to number of non-zero entries in ys + # It equals the number of non-zero entries in ys pos_u: int @@ -232,13 +232,13 @@ def force_alignment( return ans -def get_word_begin_frame( +def get_word_starting_frame( ali: List[int], sp: spm.SentencePieceProcessor ) -> List[int]: - """Get the beginning of each word from the given alignments. + """Get the starting frame of each word from the given alignments. 
When a word is encoded into BPE tokens, the first token starts - with underscore "_", which can be used to identify the beginning + with underscore "_", which can be used to identify the starting frame of a word. Args: diff --git a/egs/librispeech/ASR/transducer_stateless/test_compute_ali.py b/egs/librispeech/ASR/transducer_stateless/test_compute_ali.py index ffb270ae7..fed3c121c 100755 --- a/egs/librispeech/ASR/transducer_stateless/test_compute_ali.py +++ b/egs/librispeech/ASR/transducer_stateless/test_compute_ali.py @@ -43,7 +43,7 @@ from pathlib import Path import sentencepiece as spm import torch -from alignment import get_word_begin_frame +from alignment import get_word_starting_frame from lhotse import CutSet, load_manifest from lhotse.dataset import K2SpeechRecognitionDataset, SingleCutSampler from lhotse.dataset.collation import collate_custom_field @@ -121,7 +121,7 @@ def main(): # key: cut.id # value: a list of pairs (word, time_in_second) - word_begin_time_dict = {} + word_starting_time_dict = {} for batch in dl: supervisions = batch["supervisions"] cuts = supervisions["cut"] @@ -135,23 +135,25 @@ def main(): (cuts[i].features.num_frames - 1) // 2 - 1 ) // 2 == token_alignment_length[i] - word_begin_frame = get_word_begin_frame( + word_starting_frame = get_word_starting_frame( token_alignment[i, : token_alignment_length[i]].tolist(), sp=sp ) - word_begin_time = [ + word_starting_time = [ "{:.2f}".format(i * frame_shift_in_second) - for i in word_begin_frame + for i in word_starting_frame ] words = supervisions["text"][i].split() - assert len(word_begin_frame) == len(words) - word_begin_time_dict[cuts[i].id] = list(zip(words, word_begin_time)) + assert len(word_starting_frame) == len(words) + word_starting_time_dict[cuts[i].id] = list( + zip(words, word_starting_time) + ) # This is a demo script and we exit here after processing # one batch. 
- # You can find word starting time in the dict "word_begin_time_dict" - for cut_id, word_time in word_begin_time_dict.items(): + # You can find word starting time in the dict "word_starting_time_dict" + for cut_id, word_time in word_starting_time_dict.items(): print(f"{cut_id}\n{word_time}\n") break