icefall/egs/librispeech/ASR/local/prepare_lang_fst.py

220 lines
5.7 KiB
Python
Executable File

#!/usr/bin/env python3
# Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang)
"""
This script takes as input lang_dir containing lexicon_disambig.txt,
tokens.txt, and words.txt and generates the following files:
- H.fst
- HL.fst
- HLG.fst
Note that saved files are in OpenFst binary format.
Usage:
./local/prepare_lang_fst.py \
--lang-dir ./data/lang_phone \
--has-silence 1
Or
./local/prepare_lang_fst.py \
--lang-dir ./data/lang_bpe_500
"""
import argparse
import logging
from pathlib import Path
import kaldifst
from icefall.ctc import (
Lexicon,
add_disambig_self_loops,
add_one,
build_standard_ctc_topo,
make_lexicon_fst_no_silence,
make_lexicon_fst_with_silence,
)
from icefall.utils import str2bool
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument(
"--lang-dir",
type=str,
help="""Input and output directory.
""",
)
parser.add_argument(
"--has-silence",
type=str2bool,
default=False,
help="True if the lexicon has silence.",
)
parser.add_argument(
"--ngram-G",
type=str,
help="""If not empty, it is the filename of G used to build HLG.
For instance, --ngram-G=./data/lm/G_3_fst.txt
""",
)
return parser.parse_args()
def build_HL(
H: kaldifst.StdVectorFst,
L: kaldifst.StdVectorFst,
has_silence: bool,
lexicon: Lexicon,
) -> kaldifst.StdVectorFst:
if has_silence:
# We also need to change the input labels of L
add_one(L, treat_ilabel_zero_specially=True, update_olabel=False)
else:
add_one(L, treat_ilabel_zero_specially=False, update_olabel=False)
# Invoke add_disambig_self_loops() so that it eats the disambig symbols
# from L after composition
add_disambig_self_loops(
H,
start=lexicon.token2id["#0"] + 1,
end=lexicon.max_disambig_id + 1,
)
kaldifst.arcsort(H, sort_type="olabel")
kaldifst.arcsort(L, sort_type="ilabel")
HL = kaldifst.compose(H, L)
kaldifst.determinize_star(HL)
disambig0 = lexicon.token2id["#0"] + 1
max_disambig = lexicon.max_disambig_id + 1
for state in kaldifst.StateIterator(HL):
for arc in kaldifst.ArcIterator(HL, state):
# If treat_ilabel_zero_specially is False, we always change it
# Otherwise, we only change non-zero input labels
if disambig0 <= arc.ilabel <= max_disambig:
arc.ilabel = 0
# Note: We are not composing L with G, so there is no need to add
# self-loops to L to handle #0
return HL
def build_HLG(
H: kaldifst.StdVectorFst,
L: kaldifst.StdVectorFst,
G: kaldifst.StdVectorFst,
has_silence: bool,
lexicon: Lexicon,
) -> kaldifst.StdVectorFst:
if has_silence:
# We also need to change the input labels of L
add_one(L, treat_ilabel_zero_specially=True, update_olabel=False)
else:
add_one(L, treat_ilabel_zero_specially=False, update_olabel=False)
# add-self-loops
token_disambig0 = lexicon.token2id["#0"] + 1
word_disambig0 = lexicon.word2id["#0"]
kaldifst.add_self_loops(L, isyms=[token_disambig0], osyms=[word_disambig0])
kaldifst.arcsort(L, sort_type="olabel")
kaldifst.arcsort(G, sort_type="ilabel")
LG = kaldifst.compose(L, G)
kaldifst.determinize_star(LG)
kaldifst.minimize_encoded(LG)
kaldifst.arcsort(LG, sort_type="ilabel")
# Invoke add_disambig_self_loops() so that it eats the disambig symbols
# from L after composition
add_disambig_self_loops(
H,
start=lexicon.token2id["#0"] + 1,
end=lexicon.max_disambig_id + 1,
)
kaldifst.arcsort(H, sort_type="olabel")
HLG = kaldifst.compose(H, LG)
kaldifst.determinize_star(HLG)
disambig0 = lexicon.token2id["#0"] + 1
max_disambig = lexicon.max_disambig_id + 1
for state in kaldifst.StateIterator(HLG):
for arc in kaldifst.ArcIterator(HLG, state):
# If treat_ilabel_zero_specially is False, we always change it
# Otherwise, we only change non-zero input labels
if disambig0 <= arc.ilabel <= max_disambig:
arc.ilabel = 0
return HLG
def copy_fst(fst):
# Please don't use fst.copy()
return kaldifst.StdVectorFst(fst)
def main():
args = get_args()
lang_dir = args.lang_dir
lexicon = Lexicon(lang_dir)
logging.info("Building standard CTC topology")
max_token_id = max(lexicon.tokens)
H = build_standard_ctc_topo(max_token_id=max_token_id)
# We need to add one to all tokens since we want to use ID 0
# for epsilon
add_one(H, treat_ilabel_zero_specially=False, update_olabel=True)
H.write(f"{lang_dir}/H.fst")
logging.info("Building L")
# Now for HL
if args.has_silence:
L = make_lexicon_fst_with_silence(lexicon, attach_symbol_table=False)
else:
L = make_lexicon_fst_no_silence(lexicon, attach_symbol_table=False)
logging.info("Building HL")
HL = build_HL(
H=copy_fst(H),
L=copy_fst(L),
has_silence=args.has_silence,
lexicon=lexicon,
)
HL.write(f"{lang_dir}/HL.fst")
if not args.ngram_G:
logging.info("Skip building HLG")
return
logging.info("Building HLG")
with open(args.ngram_G) as f:
G = kaldifst.compile(
s=f.read(),
acceptor=False,
)
HLG = build_HLG(H=H, L=L, G=G, has_silence=args.has_silence, lexicon=lexicon)
HLG.write(f"{lang_dir}/HLG.fst")
if __name__ == "__main__":
formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
logging.basicConfig(format=formatter, level=logging.INFO)
main()