#!/usr/bin/env python3
# Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang)
"""
|
|
This script takes as input lang_dir containing lexicon_disambig.txt,
|
|
tokens.txt, and words.txt and generates the following files:
|
|
|
|
- H.fst
|
|
- HL.fst
|
|
- HLG.fst
|
|
|
|
Note that saved files are in OpenFst binary format.
|
|
|
|
Usage:
|
|
|
|
./local/prepare_lang_fst.py \
|
|
--lang-dir ./data/lang_phone \
|
|
--has-silence 1
|
|
|
|
Or
|
|
|
|
./local/prepare_lang_fst.py \
|
|
--lang-dir ./data/lang_bpe_500
|
|
"""
|
|
|
|

import argparse
import logging
from pathlib import Path

import kaldifst

from icefall.ctc import (
    Lexicon,
    add_disambig_self_loops,
    add_one,
    build_standard_ctc_topo,
    make_lexicon_fst_no_silence,
    make_lexicon_fst_with_silence,
)
from icefall.utils import str2bool


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--lang-dir",
        type=str,
        help="""Input and output directory.
        """,
    )

    parser.add_argument(
        "--has-silence",
        type=str2bool,
        default=False,
        help="True if the lexicon has silence.",
    )

    parser.add_argument(
        "--ngram-G",
        type=str,
        help="""If not empty, it is the filename of G used to build HLG.
        For instance, --ngram-G=./data/lm/G_3_fst.txt
        """,
    )

    return parser.parse_args()


def build_HL(
    H: kaldifst.StdVectorFst,
    L: kaldifst.StdVectorFst,
    has_silence: bool,
    lexicon: Lexicon,
) -> kaldifst.StdVectorFst:
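    # H (already shifted by one in main() so that label 0 is free for epsilon)
    # has shifted token IDs on its output side, so shift the input (token)
    # labels of L by one as well; the word labels on L's output side are left
    # untouched (update_olabel=False).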
    if has_silence:
        # We also need to change the input labels of L
        add_one(L, treat_ilabel_zero_specially=True, update_olabel=False)
    else:
        add_one(L, treat_ilabel_zero_specially=False, update_olabel=False)

    # Invoke add_disambig_self_loops() so that it eats the disambig symbols
    # from L after composition
    add_disambig_self_loops(
        H,
        start=lexicon.token2id["#0"] + 1,
        end=lexicon.max_disambig_id + 1,
    )

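    # Arc-sort H and L as required by composition, then compose them into HL
    # and determinize the result.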
    kaldifst.arcsort(H, sort_type="olabel")
    kaldifst.arcsort(L, sort_type="ilabel")

    HL = kaldifst.compose(H, L)
    kaldifst.determinize_star(HL)

    disambig0 = lexicon.token2id["#0"] + 1
    max_disambig = lexicon.max_disambig_id + 1
    for state in kaldifst.StateIterator(HL):
        for arc in kaldifst.ArcIterator(HL, state):
            # Replace the (shifted) disambig symbols on the input side of HL
            # with epsilon; they are not real tokens.
            if disambig0 <= arc.ilabel <= max_disambig:
                arc.ilabel = 0

    # Note: We are not composing L with G, so there is no need to add
    # self-loops to L to handle #0

    return HL


def build_HLG(
    H: kaldifst.StdVectorFst,
    L: kaldifst.StdVectorFst,
    G: kaldifst.StdVectorFst,
    has_silence: bool,
    lexicon: Lexicon,
) -> kaldifst.StdVectorFst:
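    # As in build_HL(), shift the input (token) labels of L by one so that
    # they match the (already shifted) output labels of H.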
    if has_silence:
        # We also need to change the input labels of L
        add_one(L, treat_ilabel_zero_specially=True, update_olabel=False)
    else:
        add_one(L, treat_ilabel_zero_specially=False, update_olabel=False)

    # add-self-loops
    token_disambig0 = lexicon.token2id["#0"] + 1
    word_disambig0 = lexicon.word2id["#0"]

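    # Add #0:#0 self-loops to L so that the backoff/disambiguation symbol #0
    # on the input side of G can be consumed when composing L with G.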
    kaldifst.add_self_loops(L, isyms=[token_disambig0], osyms=[word_disambig0])

    kaldifst.arcsort(L, sort_type="olabel")
    kaldifst.arcsort(G, sort_type="ilabel")
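    # Compose L with G, then determinize and minimize; keeping LG small also
    # helps keep the final HLG small.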
    LG = kaldifst.compose(L, G)
    kaldifst.determinize_star(LG)
    kaldifst.minimize_encoded(LG)

    kaldifst.arcsort(LG, sort_type="ilabel")

    # Invoke add_disambig_self_loops() so that it eats the disambig symbols
    # from L after composition
    add_disambig_self_loops(
        H,
        start=lexicon.token2id["#0"] + 1,
        end=lexicon.max_disambig_id + 1,
    )

    kaldifst.arcsort(H, sort_type="olabel")

    HLG = kaldifst.compose(H, LG)
    kaldifst.determinize_star(HLG)

    disambig0 = lexicon.token2id["#0"] + 1
    max_disambig = lexicon.max_disambig_id + 1
    for state in kaldifst.StateIterator(HLG):
        for arc in kaldifst.ArcIterator(HLG, state):
            # Replace the (shifted) disambig symbols on the input side of HLG
            # with epsilon; they are not real tokens.
            if disambig0 <= arc.ilabel <= max_disambig:
                arc.ilabel = 0

    return HLG


def copy_fst(fst):
    # Please don't use fst.copy()
    return kaldifst.StdVectorFst(fst)


def main():
    args = get_args()
    lang_dir = args.lang_dir

    lexicon = Lexicon(lang_dir)

    logging.info("Building standard CTC topology")
    max_token_id = max(lexicon.tokens)
    H = build_standard_ctc_topo(max_token_id=max_token_id)

    # We need to add one to all tokens since we want to use ID 0
    # for epsilon
    add_one(H, treat_ilabel_zero_specially=False, update_olabel=True)
    H.write(f"{lang_dir}/H.fst")

    logging.info("Building L")
    # Now for HL

    if args.has_silence:
        L = make_lexicon_fst_with_silence(lexicon, attach_symbol_table=False)
    else:
        L = make_lexicon_fst_no_silence(lexicon, attach_symbol_table=False)

    logging.info("Building HL")
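    # build_HL() modifies H and L in place, so pass copies here and keep the
    # original H and L for build_HLG() below.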
    HL = build_HL(
        H=copy_fst(H),
        L=copy_fst(L),
        has_silence=args.has_silence,
        lexicon=lexicon,
    )
    HL.write(f"{lang_dir}/HL.fst")

    if not args.ngram_G:
        logging.info("Skip building HLG")
        return

    logging.info("Building HLG")
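    # args.ngram_G points to G in OpenFst text format, e.g.,
    # ./data/lm/G_3_fst.txt; compile it into a binary FST before composition.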
    with open(args.ngram_G) as f:
        G = kaldifst.compile(
            s=f.read(),
            acceptor=False,
        )

    HLG = build_HLG(H=H, L=L, G=G, has_silence=args.has_silence, lexicon=lexicon)
    HLG.write(f"{lang_dir}/HLG.fst")


if __name__ == "__main__":
    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"

    logging.basicConfig(format=formatter, level=logging.INFO)
    main()