icefall/egs/librispeech/ASR/local/prepare_lang_fst.py

#!/usr/bin/env python3

# Copyright (c)  2023  Xiaomi Corporation (authors: Fangjun Kuang)

"""
This script takes as input lang_dir containing lexicon_disambig.txt,
tokens.txt, and words.txt and generates the following files:

    - H.fst
    - HL.fst

Note that saved files are in OpenFst binary format.

Usage:

./local/prepare_lang_fst.py \
  --lang-dir ./data/lang_phone \
  --has-silence 1

Or

./local/prepare_lang_fst.py \
  --lang-dir ./data/lang_bpe_500
"""

import argparse
import logging
from pathlib import Path

import kaldifst

from icefall.ctc import (
    Lexicon,
    add_disambig_self_loops,
    add_one,
    build_standard_ctc_topo,
    make_lexicon_fst_no_silence,
    make_lexicon_fst_with_silence,
)
from icefall.utils import str2bool


def get_args():
    """Parse the command-line arguments of this script.

    Returns:
      An ``argparse.Namespace`` with two attributes:

        - ``lang_dir``: the directory that is used both for reading the
          input files and for writing the generated FSTs.
        - ``has_silence``: a bool converted by ``str2bool``; defaults to
          ``False``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--lang-dir",
        type=str,
        # There is no sensible default. Without this flag lang_dir would be
        # None and the script would fail later with a confusing error, so
        # let argparse report the missing option instead.
        required=True,
        help="""Input and output directory.
        """,
    )

    parser.add_argument(
        "--has-silence",
        type=str2bool,
        default=False,
        help="True if the lexicon has silence.",
    )

    return parser.parse_args()


def main():
    """Generate ``H.fst`` and ``HL.fst`` inside the given lang dir.

    The steps are:

      1. Load the lexicon from ``--lang-dir``.
      2. Build the standard CTC topology H, shift its labels by one so that
         ID 0 can be used for epsilon, and save it to ``H.fst``.
      3. Build the lexicon FST L (with or without silence, depending on
         ``--has-silence``) and shift its input labels by one, too.
      4. Add disambig self-loops to H, compose it with L, apply
         ``determinize_star`` to the result, replace the disambig input
         labels with 0, and save the result to ``HL.fst``.

    Both output files are written to ``--lang-dir``.
    """
    args = get_args()
    lang_dir = args.lang_dir

    lexicon = Lexicon(lang_dir)

    logging.info("Building standard CTC topology")
    max_token_id = max(lexicon.tokens)
    H = build_standard_ctc_topo(max_token_id=max_token_id)

    # We need to add one to all tokens since we want to use ID 0
    # for epsilon
    add_one(H, treat_ilabel_zero_specially=False, update_olabel=True)

    # Note: H.fst is saved before the disambig self-loops are added below.
    H.write(f"{lang_dir}/H.fst")

    logging.info("Building L")
    # Now for HL

    # The input labels of L have to be shifted in the same way as the labels
    # of H; its output labels are left untouched.
    if args.has_silence:
        L = make_lexicon_fst_with_silence(lexicon, attach_symbol_table=False)
        add_one(L, treat_ilabel_zero_specially=True, update_olabel=False)
    else:
        L = make_lexicon_fst_no_silence(lexicon, attach_symbol_table=False)
        add_one(L, treat_ilabel_zero_specially=False, update_olabel=False)

    # IDs of the first and of the last disambig symbol after the shift by one
    first_disambig = lexicon.token2id["#0"] + 1
    last_disambig = lexicon.max_disambig_id + 1

    # Invoke add_disambig_self_loops() so that it eats the disambig symbols
    # from L after composition
    add_disambig_self_loops(
        H,
        start=first_disambig,
        end=last_disambig,
    )

    kaldifst.arcsort(H, sort_type="olabel")
    kaldifst.arcsort(L, sort_type="ilabel")

    logging.info("Building HL")
    HL = kaldifst.compose(H, L)
    kaldifst.determinize_star(HL)

    # Replace every disambig symbol on the input side of HL with 0,
    # the ID that is reserved for epsilon.
    # NOTE(review): this assigns to the arcs yielded by ArcIterator and
    # assumes that the assignment modifies HL in place -- confirm against
    # the kaldifst version in use.
    for state in kaldifst.StateIterator(HL):
        for arc in kaldifst.ArcIterator(HL, state):
            if first_disambig <= arc.ilabel <= last_disambig:
                arc.ilabel = 0

    # Note: We are not composing L with G, so there is no need to add
    # self-loops to L to handle #0

    HL.write(f"{lang_dir}/HL.fst")


if __name__ == "__main__":
    # Configure the root logger to print INFO-level messages with a
    # timestamp and the source location before running the script.
    logging.basicConfig(
        format="%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s",
        level=logging.INFO,
    )
    main()