mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-09-13 02:54:19 +00:00
68 lines
1.6 KiB
Python
Executable File
68 lines
1.6 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
|
|
# Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang)
|
|
|
|
"""
|
|
This script takes as input data/lang_phone containing lexicon_disambig.txt,
|
|
tokens.txt, and words.txt and generates the following files:
|
|
|
|
- H.fst
|
|
- HL.fst
|
|
|
|
TODO(fangjun): Generate HLG.fst
|
|
|
|
Note that saved files are in OpenFst binary format.
|
|
"""
|
|
|
|
from pathlib import Path
|
|
|
|
import kaldifst
|
|
|
|
from icefall.ctc import (
|
|
Lexicon,
|
|
add_disambig_self_loops,
|
|
add_one,
|
|
build_standard_ctc_topo,
|
|
make_lexicon_fst_with_silence,
|
|
)
|
|
|
|
|
|
def main():
|
|
lang_dir = Path("data/lang_phone")
|
|
lexicon = Lexicon(lang_dir)
|
|
|
|
max_token_id = max(lexicon.tokens)
|
|
H = build_standard_ctc_topo(max_token_id=max_token_id)
|
|
|
|
# We need to add one to all tokens since we want to use ID 0
|
|
# for epsilon
|
|
add_one(H, treat_ilabel_zero_specially=False, update_olabel=True)
|
|
H.write(f"{lang_dir}/H.fst")
|
|
|
|
# Now for HL
|
|
L = make_lexicon_fst_with_silence(lexicon, attach_symbol_table=False)
|
|
|
|
# We also need to change the input labels of L
|
|
add_one(L, treat_ilabel_zero_specially=True, update_olabel=False)
|
|
|
|
# Invoke add_disambig_self_loops() so that it eats the disambig symbols
|
|
# from L after composition
|
|
add_disambig_self_loops(
|
|
H,
|
|
start=lexicon.token2id["#0"] + 1,
|
|
end=lexicon.max_disambig_id,
|
|
)
|
|
|
|
kaldifst.arcsort(H, sort_type="olabel")
|
|
kaldifst.arcsort(L, sort_type="ilabel")
|
|
HL = kaldifst.compose(H, L)
|
|
|
|
# Note: We are not composing L with G, so there is no need to add
|
|
# self-loops to L to handle #0
|
|
|
|
HL.write(f"{lang_dir}/HL.fst")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|