mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-08 09:32:20 +00:00
128 lines
3.2 KiB
Python
Executable File
128 lines
3.2 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
|
|
# Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang)
|
|
|
|
"""
|
|
This script takes as input lang_dir containing lexicon_disambig.txt,
tokens.txt, and words.txt and generates the following files:

    - H.fst
    - HL.fst

Note that the saved files are in OpenFst binary format.

Usage:

./local/prepare_lang_fst.py \
  --lang-dir ./data/lang_phone \
  --has-silence 1

Or

./local/prepare_lang_fst.py \
  --lang-dir ./data/lang_bpe_500
|
|
"""
|
|
|
|
import argparse
|
|
import logging
|
|
from pathlib import Path
|
|
|
|
import kaldifst
|
|
|
|
from icefall.ctc import (
|
|
Lexicon,
|
|
add_disambig_self_loops,
|
|
add_one,
|
|
build_standard_ctc_topo,
|
|
make_lexicon_fst_no_silence,
|
|
make_lexicon_fst_with_silence,
|
|
)
|
|
from icefall.utils import str2bool
|
|
|
|
|
|
def get_args():
    """Parse the command-line options of this script.

    Returns:
      The argparse.Namespace obtained from the process arguments. It has
      two attributes:

        - lang_dir: str, taken from the mandatory --lang-dir option.
        - has_silence: the --has-silence value converted by str2bool;
          False when the option is not given.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--lang-dir",
        type=str,
        # main() uses this value unconditionally to read the lexicon and to
        # build the output paths. Without required=True an omitted option
        # would be passed on as None and produce paths such as "None/H.fst".
        required=True,
        help="""Input and output directory.
        """,
    )

    parser.add_argument(
        "--has-silence",
        type=str2bool,
        default=False,
        help="True if the lexicon has silence.",
    )

    return parser.parse_args()
|
|
|
|
|
|
def main():
    """Build H.fst and HL.fst and save them in the --lang-dir directory.

    The steps are, in this order:

      1. Build the standard CTC topology H, shift its labels by one with
         add_one(), and write it to <lang_dir>/H.fst.
      2. Build the lexicon FST L (with or without silence, depending on
         --has-silence) and shift its input labels with add_one().
      3. Add disambig self-loops to H, arc-sort H and L, compose them into
         HL and run determinize_star on the result.
      4. Set the input labels of HL that fall in the disambig range to 0
         and write it to <lang_dir>/HL.fst.

    Note that H.fst is written in step 1, i.e., before the disambig
    self-loops are added to H.
    """
    args = get_args()
    lang_dir = args.lang_dir

    lexicon = Lexicon(lang_dir)

    logging.info("Building standard CTC topology")
    max_token_id = max(lexicon.tokens)
    H = build_standard_ctc_topo(max_token_id=max_token_id)

    # We need to add one to all tokens since we want to use ID 0
    # for epsilon
    add_one(H, treat_ilabel_zero_specially=False, update_olabel=True)
    H.write(f"{lang_dir}/H.fst")

    logging.info("Building L")
    # Now for HL

    if args.has_silence:
        L = make_lexicon_fst_with_silence(lexicon, attach_symbol_table=False)
        # We also need to change the input labels of L
        add_one(L, treat_ilabel_zero_specially=True, update_olabel=False)
    else:
        L = make_lexicon_fst_no_silence(lexicon, attach_symbol_table=False)
        add_one(L, treat_ilabel_zero_specially=False, update_olabel=False)

    # Range of the disambig IDs. The "+ 1" matches the shift applied to the
    # labels by add_one() above. The same range is used twice below.
    disambig0 = lexicon.token2id["#0"] + 1
    max_disambig = lexicon.max_disambig_id + 1

    # Invoke add_disambig_self_loops() so that it eats the disambig symbols
    # from L after composition
    add_disambig_self_loops(
        H,
        start=disambig0,
        end=max_disambig,
    )

    kaldifst.arcsort(H, sort_type="olabel")
    kaldifst.arcsort(L, sort_type="ilabel")

    logging.info("Building HL")
    HL = kaldifst.compose(H, L)
    kaldifst.determinize_star(HL)

    # The disambig symbols are no longer needed after determinization, so
    # replace every input label in the disambig range with 0 (epsilon).
    # NOTE(review): this assumes that assigning to an arc yielded by
    # kaldifst.ArcIterator updates HL in place -- confirm.
    for state in kaldifst.StateIterator(HL):
        for arc in kaldifst.ArcIterator(HL, state):
            if disambig0 <= arc.ilabel <= max_disambig:
                arc.ilabel = 0

    # Note: We are not composing L with G, so there is no need to add
    # self-loops to L to handle #0

    HL.write(f"{lang_dir}/HL.fst")
|
|
|
|
|
|
if __name__ == "__main__":
    # Set up logging first so that the progress messages emitted by main()
    # at INFO level are displayed.
    log_format = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    logging.basicConfig(level=logging.INFO, format=log_format)

    main()
|