Add self-loops to propagate disambiguation symbols.

This commit is contained in:
Fangjun Kuang 2021-07-21 13:12:20 +08:00
parent 8a72901f3a
commit a01d08f73c
2 changed files with 66 additions and 9 deletions

View File

@ -18,15 +18,13 @@ consisting of words and phones and does the following:
lexicon = k2.Fsa.from_dict(d)
5. Generate L_disambig.pt, in k2 format.
6. Generate lexicon_disambig.txt
"""
import math
import re
import sys
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Tuple
from typing import Any, Dict, List, Tuple
import k2
import torch
@ -90,6 +88,10 @@ def write_lexicon(filename: str, lexicon: Lexicon) -> None:
def write_mapping(filename: str, sym2id: Dict[str, int]) -> None:
"""Write a symbol to ID mapping to a file.
Note:
No need to implement `read_mapping` as it can be done
through :func:`k2.SymbolTable.from_file`.
Args:
filename:
Filename to save the mapping.
@ -119,7 +121,7 @@ def get_phones(lexicon: Lexicon) -> List[str]:
return sorted_ans
def get_words(lexicon: List[Tuple[str, List[str]]]) -> List[str]:
def get_words(lexicon: Lexicon) -> List[str]:
"""Get words from a lexicon.
Args:
@ -213,12 +215,46 @@ def generate_id_map(symbols: List[str]) -> Dict[str, int]:
return {sym: i for i, sym in enumerate(symbols)}
def add_self_loops(
arcs: List[List[Any]], disambig_phone: int, disambig_word: int
) -> List[List[Any]]:
"""Adds self-loops to states of an FST to propagate disambiguation symbols
through it. They are added on each state with non-epsilon output symbols
on at least one arc out of the state.
See also fstaddselfloops.pl from Kaldi. One difference is that
Kaldi uses OpenFst style FSTs and it has multiple final states.
This function uses k2 style FSTs and it does not need to add self-loops
to the final state.
Args:
arcs:
A list-of-list. The sublist contains
`[src_state, dest_state, label, aux_label, score]`
Return:
Return new `arcs` that contain self-loops.
"""
states_needs_self_loops = set()
for arc in arcs:
src, dst, ilable, olable, score = arc
if olable != 0:
states_needs_self_loops.add(src)
ans = []
for s in states_needs_self_loops:
ans.append([s, s, disambig_phone, disambig_word, 0])
return arcs + ans
def lexicon_to_fst(
lexicon: Lexicon,
phone2id: Dict[str, int],
word2id: Dict[str, int],
sil_phone: str = "SIL",
sil_prob: float = 0.5,
need_self_loops: bool = False,
) -> k2.Fsa:
"""Convert a lexicon to an FST (in k2 format) with optional silence at
the beginning and end of the word.
@ -235,6 +271,9 @@ def lexicon_to_fst(
sil_prob:
The probability for adding a silence at the beginning and end
of the word.
need_self_loops:
If True, add self-loop to states with non-epsilon output symbols
on at least one arc out of the state.
Returns:
Return an instance of `k2.Fsa` representing the given lexicon.
"""
@ -285,6 +324,15 @@ def lexicon_to_fst(
arcs.append([cur_state, loop_state, prons[i], w, no_sil_score])
arcs.append([cur_state, sil_state, prons[i], w, sil_score])
if need_self_loops:
disambig_phone = phone2id["#0"]
disambig_word = word2id["#0"]
arcs = add_self_loops(
arcs,
disambig_phone=disambig_phone,
disambig_word=disambig_word,
)
final_state = next_state
arcs.append([loop_state, final_state, -1, -1, 0])
arcs.append([final_state])
@ -346,13 +394,10 @@ def main():
word2id=word2id,
sil_phone=sil_phone,
sil_prob=sil_prob,
need_self_loops=True,
)
# TODO(fangjun): add self-loops to L_disambig
# whose ilabel is phone2id['#0'] and olable is word2id['#0']
# Need to implement it in k2
if True:
if False:
# Just for debugging, will remove it
torch.save(L.as_dict(), out_dir / "L.pt")
torch.save(L_disambig.as_dict(), out_dir / "L_disambig.pt")

View File

@ -75,3 +75,15 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
mkdir -p data/fbank
./local/compute_fbank_musan.py
fi
if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
echo "Stage 5: Prepare phone based lang"
# TODO: add BPE based lang
mkdir -p data/lang
(echo '!SIL SIL'; echo '<SPOKEN_NOISE> SPN'; echo '<UNK> SPN'; ) |
cat - data/lm/librispeech-lexicon.txt |
sort | uniq > data/lang/lexicon.txt
./local/prepare_lang.py
fi