icefall/egs/librispeech/ASR/local/prepare_lang_fst.py

#!/usr/bin/env python3

# Copyright (c)  2023  Xiaomi Corporation (authors: Fangjun Kuang)

"""
This script takes as input lang_dir containing lexicon_disambig.txt,
tokens.txt, and words.txt and generates the following files:

    - H.fst
    - HL.fst
    - HLG.fst

Note that saved files are in OpenFst binary format.

Usage:

./local/prepare_lang_fst.py \
  --lang-dir ./data/lang_phone \
  --has-silence 1

Or

./local/prepare_lang_fst.py \
  --lang-dir ./data/lang_bpe_500
"""

import argparse
import logging
from pathlib import Path

import kaldifst

from icefall.ctc import (
    Lexicon,
    add_disambig_self_loops,
    add_one,
    build_standard_ctc_topo,
    make_lexicon_fst_no_silence,
    make_lexicon_fst_with_silence,
)
from icefall.utils import str2bool


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--lang-dir",
        type=str,
        help="""Input and output directory.
        """,
    )

    parser.add_argument(
        "--has-silence",
        type=str2bool,
        default=False,
        help="True if the lexicon has silence.",
    )

    parser.add_argument(
        "--ngram-G",
        type=str,
        help="""If not empty, it is the filename of G used to build HLG.
        For instance, --ngram-G=./data/lm/G_3_fst.txt
        """,
    )

    return parser.parse_args()


def build_HL(
    H: kaldifst.StdVectorFst,
    L: kaldifst.StdVectorFst,
    has_silence: bool,
    lexicon: Lexicon,
) -> kaldifst.StdVectorFst:
    if has_silence:
        # We also need to change the input labels of L
        add_one(L, treat_ilabel_zero_specially=True, update_olabel=False)
    else:
        add_one(L, treat_ilabel_zero_specially=False, update_olabel=False)

    # Invoke add_disambig_self_loops() so that it eats the disambig symbols
    # from L after composition
    add_disambig_self_loops(
        H,
        start=lexicon.token2id["#0"] + 1,
        end=lexicon.max_disambig_id + 1,
    )

    kaldifst.arcsort(H, sort_type="olabel")
    kaldifst.arcsort(L, sort_type="ilabel")

    HL = kaldifst.compose(H, L)
    kaldifst.determinize_star(HL)

    disambig0 = lexicon.token2id["#0"] + 1
    max_disambig = lexicon.max_disambig_id + 1
    for state in kaldifst.StateIterator(HL):
        for arc in kaldifst.ArcIterator(HL, state):
            # If treat_ilabel_zero_specially is False, we always change it
            # Otherwise, we only change non-zero input labels
            if disambig0 <= arc.ilabel <= max_disambig:
                arc.ilabel = 0

    # Note: We are not composing L with G, so there is no need to add
    # self-loops to L to handle #0

    return HL


def build_HLG(
    H: kaldifst.StdVectorFst,
    L: kaldifst.StdVectorFst,
    G: kaldifst.StdVectorFst,
    has_silence: bool,
    lexicon: Lexicon,
) -> kaldifst.StdVectorFst:
    if has_silence:
        # We also need to change the input labels of L
        add_one(L, treat_ilabel_zero_specially=True, update_olabel=False)
    else:
        add_one(L, treat_ilabel_zero_specially=False, update_olabel=False)

    # add-self-loops
    token_disambig0 = lexicon.token2id["#0"] + 1
    word_disambig0 = lexicon.word2id["#0"]

    kaldifst.add_self_loops(L, isyms=[token_disambig0], osyms=[word_disambig0])

    kaldifst.arcsort(L, sort_type="olabel")
    kaldifst.arcsort(G, sort_type="ilabel")
    LG = kaldifst.compose(L, G)
    kaldifst.determinize_star(LG)
    kaldifst.minimize_encoded(LG)

    kaldifst.arcsort(LG, sort_type="ilabel")

    # Invoke add_disambig_self_loops() so that it eats the disambig symbols
    # from L after composition
    add_disambig_self_loops(
        H,
        start=lexicon.token2id["#0"] + 1,
        end=lexicon.max_disambig_id + 1,
    )

    kaldifst.arcsort(H, sort_type="olabel")

    HLG = kaldifst.compose(H, LG)
    kaldifst.determinize_star(HLG)

    disambig0 = lexicon.token2id["#0"] + 1
    max_disambig = lexicon.max_disambig_id + 1
    for state in kaldifst.StateIterator(HLG):
        for arc in kaldifst.ArcIterator(HLG, state):
            # If treat_ilabel_zero_specially is False, we always change it
            # Otherwise, we only change non-zero input labels
            if disambig0 <= arc.ilabel <= max_disambig:
                arc.ilabel = 0
    return HLG


def copy_fst(fst):
    # Please don't use fst.copy()
    return kaldifst.StdVectorFst(fst)


def main():
    args = get_args()
    lang_dir = args.lang_dir

    lexicon = Lexicon(lang_dir)

    logging.info("Building standard CTC topology")
    max_token_id = max(lexicon.tokens)
    H = build_standard_ctc_topo(max_token_id=max_token_id)

    # We need to add one to all tokens since we want to use ID 0
    # for epsilon
    add_one(H, treat_ilabel_zero_specially=False, update_olabel=True)
    H.write(f"{lang_dir}/H.fst")

    logging.info("Building L")
    # Now for HL

    if args.has_silence:
        L = make_lexicon_fst_with_silence(lexicon, attach_symbol_table=False)
    else:
        L = make_lexicon_fst_no_silence(lexicon, attach_symbol_table=False)

    logging.info("Building HL")
    HL = build_HL(
        H=copy_fst(H),
        L=copy_fst(L),
        has_silence=args.has_silence,
        lexicon=lexicon,
    )
    HL.write(f"{lang_dir}/HL.fst")

    if not args.ngram_G:
        logging.info("Skip building HLG")
        return

    logging.info("Building HLG")
    with open(args.ngram_G) as f:
        G = kaldifst.compile(
            s=f.read(),
            acceptor=False,
        )

    HLG = build_HLG(H=H, L=L, G=G, has_silence=args.has_silence, lexicon=lexicon)
    HLG.write(f"{lang_dir}/HLG.fst")


if __name__ == "__main__":
    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"

    logging.basicConfig(format=formatter, level=logging.INFO)
    main()