mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-09-07 08:04:18 +00:00
Fixes after review.
This commit is contained in:
parent
2aca0d536c
commit
a7ecf96e42
@ -60,8 +60,11 @@ log "dl_dir: $dl_dir"
|
|||||||
|
|
||||||
if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
|
if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
|
||||||
log "Stage -1: Download LM"
|
log "Stage -1: Download LM"
|
||||||
[ ! -e $dl_dir/lm ] && mkdir -p $dl_dir/lm
|
mkdir -p $dl_dir/lm
|
||||||
./local/download_lm.py --out-dir=$dl_dir/lm
|
if [ ! -e $dl_dir/lm/.done ]; then
|
||||||
|
./local/download_lm.py --out-dir=$dl_dir/lm
|
||||||
|
touch $dl_dir/lm/.done
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
|
if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
|
||||||
@ -91,7 +94,10 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
|
|||||||
# We assume that you have downloaded the LibriSpeech corpus
|
# We assume that you have downloaded the LibriSpeech corpus
|
||||||
# to $dl_dir/LibriSpeech
|
# to $dl_dir/LibriSpeech
|
||||||
mkdir -p data/manifests
|
mkdir -p data/manifests
|
||||||
lhotse prepare librispeech -j $nj $dl_dir/LibriSpeech data/manifests
|
if [ ! -e data/manifests/.librispeech.done ]; then
|
||||||
|
lhotse prepare librispeech -j $nj $dl_dir/LibriSpeech data/manifests
|
||||||
|
touch data/manifests/.librispeech.done
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
|
if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
|
||||||
@ -99,19 +105,28 @@ if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
|
|||||||
# We assume that you have downloaded the musan corpus
|
# We assume that you have downloaded the musan corpus
|
||||||
# to data/musan
|
# to data/musan
|
||||||
mkdir -p data/manifests
|
mkdir -p data/manifests
|
||||||
lhotse prepare musan $dl_dir/musan data/manifests
|
if [ ! -e data/manifests/.musan.done ]; then
|
||||||
|
lhotse prepare musan $dl_dir/musan data/manifests
|
||||||
|
touch data/manifests/.musan.done
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
|
if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
|
||||||
log "Stage 3: Compute fbank for librispeech"
|
log "Stage 3: Compute fbank for librispeech"
|
||||||
mkdir -p data/fbank
|
mkdir -p data/fbank
|
||||||
./local/compute_fbank_librispeech.py
|
if [ ! -e data/fbank/.librispeech.done ]; then
|
||||||
|
./local/compute_fbank_librispeech.py
|
||||||
|
touch data/fbank/.librispeech.done
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
|
if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
|
||||||
log "Stage 4: Compute fbank for musan"
|
log "Stage 4: Compute fbank for musan"
|
||||||
mkdir -p data/fbank
|
mkdir -p data/fbank
|
||||||
./local/compute_fbank_musan.py
|
if [ ! -e data/fbank/.musan.done ]; then
|
||||||
|
./local/compute_fbank_musan.py
|
||||||
|
touch data/fbank/.musan.done
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
|
if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
|
||||||
|
@ -29,6 +29,9 @@ from model import Transducer
|
|||||||
# acoustic frame indexes) and the vertical axis is `u` (representing
|
# acoustic frame indexes) and the vertical axis is `u` (representing
|
||||||
# BPE tokens of the transcript).
|
# BPE tokens of the transcript).
|
||||||
#
|
#
|
||||||
|
# The notations `t` and `u` are from the paper
|
||||||
|
# https://arxiv.org/pdf/1211.3711.pdf
|
||||||
|
#
|
||||||
# Beam search is used to find the path with the
|
# Beam search is used to find the path with the
|
||||||
# highest log probabilities.
|
# highest log probabilities.
|
||||||
#
|
#
|
||||||
@ -37,12 +40,13 @@ from model import Transducer
|
|||||||
# from `./train.py` to train a model that satisfies this assumption.
|
# from `./train.py` to train a model that satisfies this assumption.
|
||||||
|
|
||||||
|
|
||||||
# AlignItem is a node in the lattice, where its
|
# AlignItem is the ending node of a path originated from the starting node.
|
||||||
# len(ys) equals to `t` and pos_u is the u coordinate
|
# len(ys) equals to `t` and pos_u is the u coordinate
|
||||||
# in the lattice.
|
# in the lattice.
|
||||||
@dataclass
|
@dataclass
|
||||||
class AlignItem:
|
class AlignItem:
|
||||||
# log prob of this item originating from the start item
|
# total log prob of the path that ends at this item.
|
||||||
|
# The path is originated from the starting node.
|
||||||
log_prob: float
|
log_prob: float
|
||||||
|
|
||||||
# It contains framewise token alignment
|
# It contains framewise token alignment
|
||||||
@ -234,7 +238,7 @@ def force_alignment(
|
|||||||
return ans
|
return ans
|
||||||
|
|
||||||
|
|
||||||
def get_word_starting_frame(
|
def get_word_starting_frames(
|
||||||
ali: List[int], sp: spm.SentencePieceProcessor
|
ali: List[int], sp: spm.SentencePieceProcessor
|
||||||
) -> List[int]:
|
) -> List[int]:
|
||||||
"""Get the starting frame of each word from the given token alignments.
|
"""Get the starting frame of each word from the given token alignments.
|
||||||
|
@ -43,7 +43,7 @@ from pathlib import Path
|
|||||||
|
|
||||||
import sentencepiece as spm
|
import sentencepiece as spm
|
||||||
import torch
|
import torch
|
||||||
from alignment import get_word_starting_frame
|
from alignment import get_word_starting_frames
|
||||||
from lhotse import CutSet, load_manifest
|
from lhotse import CutSet, load_manifest
|
||||||
from lhotse.dataset import K2SpeechRecognitionDataset, SingleCutSampler
|
from lhotse.dataset import K2SpeechRecognitionDataset, SingleCutSampler
|
||||||
from lhotse.dataset.collation import collate_custom_field
|
from lhotse.dataset.collation import collate_custom_field
|
||||||
@ -135,17 +135,17 @@ def main():
|
|||||||
(cuts[i].features.num_frames - 1) // 2 - 1
|
(cuts[i].features.num_frames - 1) // 2 - 1
|
||||||
) // 2 == token_alignment_length[i]
|
) // 2 == token_alignment_length[i]
|
||||||
|
|
||||||
word_starting_frame = get_word_starting_frame(
|
word_starting_frames = get_word_starting_frames(
|
||||||
token_alignment[i, : token_alignment_length[i]].tolist(), sp=sp
|
token_alignment[i, : token_alignment_length[i]].tolist(), sp=sp
|
||||||
)
|
)
|
||||||
word_starting_time = [
|
word_starting_time = [
|
||||||
"{:.2f}".format(i * frame_shift_in_second)
|
"{:.2f}".format(i * frame_shift_in_second)
|
||||||
for i in word_starting_frame
|
for i in word_starting_frames
|
||||||
]
|
]
|
||||||
|
|
||||||
words = supervisions["text"][i].split()
|
words = supervisions["text"][i].split()
|
||||||
|
|
||||||
assert len(word_starting_frame) == len(words)
|
assert len(word_starting_frames) == len(words)
|
||||||
word_starting_time_dict[cuts[i].id] = list(
|
word_starting_time_dict[cuts[i].id] = list(
|
||||||
zip(words, word_starting_time)
|
zip(words, word_starting_time)
|
||||||
)
|
)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user