icefall/egs/librispeech/ASR/local/prepare_lang_fst.py

128 lines
3.2 KiB
Python
Executable File

#!/usr/bin/env python3
# Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang)
"""
This script takes as input lang_dir containing lexicon_disambig.txt,
tokens.txt, and words.txt and generates the following files:
- H.fst
- HL.fst
Note that saved files are in OpenFst binary format.
Usage:
./local/prepare_lang_fst.py \
--lang-dir ./data/lang_phone \
--has-silence 1
Or
./local/prepare_lang_fst.py \
--lang-dir ./data/lang_bpe_500
"""
import argparse
import logging
from pathlib import Path
import kaldifst
from icefall.ctc import (
Lexicon,
add_disambig_self_loops,
add_one,
build_standard_ctc_topo,
make_lexicon_fst_no_silence,
make_lexicon_fst_with_silence,
)
from icefall.utils import str2bool
def get_args() -> argparse.Namespace:
    """Parse the command-line arguments of this script.

    Returns:
      The parsed namespace with two attributes:
        - ``lang_dir``: the value of ``--lang-dir`` (a str). The option is
          required, so a missing directory is reported by argparse itself
          instead of ``None`` leaking into the file paths built later.
        - ``has_silence``: the value of ``--has-silence`` converted by
          ``str2bool``; False when the option is not given.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--lang-dir",
        type=str,
        required=True,
        help="Input and output directory.\n",
    )

    parser.add_argument(
        "--has-silence",
        type=str2bool,
        default=False,
        help="True if the lexicon has silence.",
    )

    return parser.parse_args()
def main():
    """Build ``H.fst`` and ``HL.fst`` in the directory given by ``--lang-dir``.

    Steps, in order:
      1. Build the standard CTC topology H, shift its labels by one and
         write it to ``<lang_dir>/H.fst``.
      2. Build the lexicon FST L (with or without silence, depending on
         ``--has-silence``) and shift its input labels by one.
      3. Add disambiguation self-loops to H, compose H with L, run
         ``determinize_star`` on the result, set every disambiguation
         input label to 0 and write it to ``<lang_dir>/HL.fst``.

    A debugging dump of H (after the self-loops were added) is also written
    to ``<lang_dir>/H_1.fst.txt``.
    """
    args = get_args()
    lang_dir = args.lang_dir

    lexicon = Lexicon(lang_dir)

    logging.info("Building standard CTC topology")
    max_token_id = max(lexicon.tokens)
    H = build_standard_ctc_topo(max_token_id=max_token_id)

    # We need to add one to all tokens since we want to use ID 0
    # for epsilon
    add_one(H, treat_ilabel_zero_specially=False, update_olabel=True)
    # Note: H.fst is saved here, i.e. before the disambig self-loops
    # are added to H further below.
    H.write(f"{lang_dir}/H.fst")

    logging.info("Building L")
    # Now for HL
    if args.has_silence:
        L = make_lexicon_fst_with_silence(lexicon, attach_symbol_table=False)
        # We also need to change the input labels of L
        add_one(L, treat_ilabel_zero_specially=True, update_olabel=False)
    else:
        L = make_lexicon_fst_no_silence(lexicon, attach_symbol_table=False)
        add_one(L, treat_ilabel_zero_specially=False, update_olabel=False)

    # Range of disambiguation symbol IDs after the +1 shift applied above.
    # It is used both for the self-loops on H and for the clean-up of HL.
    disambig0 = lexicon.token2id["#0"] + 1
    max_disambig = lexicon.max_disambig_id + 1

    # Invoke add_disambig_self_loops() so that it eats the disambig symbols
    # from L after composition
    add_disambig_self_loops(H, start=disambig0, end=max_disambig)

    # Debugging aid: dump H with its self-loops. It is kept next to the other
    # outputs so it neither depends on nor pollutes the current working
    # directory.
    with open(f"{lang_dir}/H_1.fst.txt", "w") as f:
        print(H, file=f)

    # Sort H by output label and L by input label before composing them.
    kaldifst.arcsort(H, sort_type="olabel")
    kaldifst.arcsort(L, sort_type="ilabel")

    logging.info("Building HL")
    HL = kaldifst.compose(H, L)
    kaldifst.determinize_star(HL)

    for state in kaldifst.StateIterator(HL):
        for arc in kaldifst.ArcIterator(HL, state):
            # Input labels inside the disambig range are replaced by 0,
            # the ID reserved for epsilon.
            if disambig0 <= arc.ilabel <= max_disambig:
                arc.ilabel = 0

    # Note: We are not composing L with G, so there is no need to add
    # self-loops to L to handle #0
    HL.write(f"{lang_dir}/HL.fst")
if __name__ == "__main__":
    # Configure root logging at INFO level; each record carries a timestamp,
    # its level, and the file name and line number that emitted it.
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s",
    )

    main()