diff --git a/egs/librispeech/ASR/prepare.sh b/egs/librispeech/ASR/prepare.sh
index 3b2678ec4..1bbf7bbcf 100755
--- a/egs/librispeech/ASR/prepare.sh
+++ b/egs/librispeech/ASR/prepare.sh
@@ -60,8 +60,11 @@ log "dl_dir: $dl_dir"
 
 if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
   log "Stage -1: Download LM"
-  [ ! -e $dl_dir/lm ] && mkdir -p $dl_dir/lm
-  ./local/download_lm.py --out-dir=$dl_dir/lm
+  mkdir -p $dl_dir/lm
+  if [ ! -e $dl_dir/lm/.done ]; then
+    ./local/download_lm.py --out-dir=$dl_dir/lm
+    touch $dl_dir/lm/.done
+  fi
 fi
 
 if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
@@ -91,7 +94,10 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
   # We assume that you have downloaded the LibriSpeech corpus
   # to $dl_dir/LibriSpeech
   mkdir -p data/manifests
-  lhotse prepare librispeech -j $nj $dl_dir/LibriSpeech data/manifests
+  if [ ! -e data/manifests/.librispeech.done ]; then
+    lhotse prepare librispeech -j $nj $dl_dir/LibriSpeech data/manifests
+    touch data/manifests/.librispeech.done
+  fi
 fi
 
 if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
@@ -99,19 +105,28 @@ if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
   # We assume that you have downloaded the musan corpus
   # to data/musan
   mkdir -p data/manifests
-  lhotse prepare musan $dl_dir/musan data/manifests
+  if [ ! -e data/manifests/.musan.done ]; then
+    lhotse prepare musan $dl_dir/musan data/manifests
+    touch data/manifests/.musan.done
+  fi
 fi
 
 if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
   log "Stage 3: Compute fbank for librispeech"
   mkdir -p data/fbank
-  ./local/compute_fbank_librispeech.py
+  if [ ! -e data/fbank/.librispeech.done ]; then
+    ./local/compute_fbank_librispeech.py
+    touch data/fbank/.librispeech.done
+  fi
 fi
 
 if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
   log "Stage 4: Compute fbank for musan"
   mkdir -p data/fbank
-  ./local/compute_fbank_musan.py
+  if [ ! -e data/fbank/.musan.done ]; then
+    ./local/compute_fbank_musan.py
+    touch data/fbank/.musan.done
+  fi
 fi
 
 if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
diff --git a/egs/librispeech/ASR/transducer_stateless/alignment.py b/egs/librispeech/ASR/transducer_stateless/alignment.py
index c1cd6e3b1..f143611ea 100644
--- a/egs/librispeech/ASR/transducer_stateless/alignment.py
+++ b/egs/librispeech/ASR/transducer_stateless/alignment.py
@@ -29,6 +29,9 @@ from model import Transducer
 # acoustic frame indexes) and the vertical axis is `u` (representing
 # BPE tokens of the transcript).
 #
+# The notations `t` and `u` are from the paper
+# https://arxiv.org/pdf/1211.3711.pdf
+#
 # Beam search is used to find the path with the
 # highest log probabilities.
 #
@@ -37,12 +40,13 @@ from model import Transducer
 # from `./train.py` to train a model that satisfies this assumption.
 
 
-# AlignItem is a node in the lattice, where its
+# AlignItem is the ending node of a path originated from the starting node.
 # len(ys) equals to `t` and pos_u is the u coordinate
 # in the lattice.
 @dataclass
 class AlignItem:
-    # log prob of this item originating from the start item
+    # total log prob of the path that ends at this item.
+    # The path is originated from the starting node.
     log_prob: float
 
     # It contains framewise token alignment
@@ -234,7 +238,7 @@ def force_alignment(
     return ans
 
 
-def get_word_starting_frame(
+def get_word_starting_frames(
     ali: List[int], sp: spm.SentencePieceProcessor
 ) -> List[int]:
     """Get the starting frame of each word from the given token alignments.
diff --git a/egs/librispeech/ASR/transducer_stateless/test_compute_ali.py b/egs/librispeech/ASR/transducer_stateless/test_compute_ali.py
index fed3c121c..99d5b3788 100755
--- a/egs/librispeech/ASR/transducer_stateless/test_compute_ali.py
+++ b/egs/librispeech/ASR/transducer_stateless/test_compute_ali.py
@@ -43,7 +43,7 @@ from pathlib import Path
 
 import sentencepiece as spm
 import torch
-from alignment import get_word_starting_frame
+from alignment import get_word_starting_frames
 from lhotse import CutSet, load_manifest
 from lhotse.dataset import K2SpeechRecognitionDataset, SingleCutSampler
 from lhotse.dataset.collation import collate_custom_field
@@ -135,17 +135,17 @@ def main():
             (cuts[i].features.num_frames - 1) // 2 - 1
         ) // 2 == token_alignment_length[i]
 
-        word_starting_frame = get_word_starting_frame(
+        word_starting_frames = get_word_starting_frames(
             token_alignment[i, : token_alignment_length[i]].tolist(), sp=sp
         )
 
         word_starting_time = [
             "{:.2f}".format(i * frame_shift_in_second)
-            for i in word_starting_frame
+            for i in word_starting_frames
        ]
 
         words = supervisions["text"][i].split()
-        assert len(word_starting_frame) == len(words)
+        assert len(word_starting_frames) == len(words)
         word_starting_time_dict[cuts[i].id] = list(
             zip(words, word_starting_time)
         )