Support CTC decoding on CPU using OpenFST and kaldi decoders.

2025-12-11 06:55:27 +00:00 · 2023-09-07 22:07:03 +08:00 · 2023-09-07 22:07:03 +08:00 · 2d7067753e
commit 2d7067753e
parent 49a4b67288
20 changed files with 1196 additions and 3 deletions
--- a/.flake8
+++ b/.flake8
@ -24,6 +24,7 @@ exclude =
  **/data/**,
  icefall/shared/make_kn_lm.py,
  icefall/__init__.py
  icefall/ctc/__init__.py
 ignore =
  # E203 white space before ":"
--- a/.github/workflows/run-yesno-recipe.yml
+++ b/.github/workflows/run-yesno-recipe.yml
@ -140,9 +140,46 @@ jobs:
            download/waves_yesno/0_0_0_1_0_0_0_1.wav \
            download/waves_yesno/0_0_1_0_0_0_1_0.wav
      - name: Test decoding with H
        shell: bash
        working-directory: ${{github.workspace}}
        run: |
          export PYTHONPATH=$PWD:$PYTHONPATH
          echo $PYTHONPATH
          cd egs/yesno/ASR
          python3 ./tdnn/export.py --epoch 14 --avg 2 --jit 1
          python3 ./tdnn/jit_pretrained_decode_with_H.py \
              --nn-model ./tdnn/exp/cpu_jit.pt \
              --H ./data/lang_phone/H.fst \
              --tokens ./data/lang_phone/tokens.txt \
              ./download/waves_yesno/0_0_0_1_0_0_0_1.wav \
              ./download/waves_yesno/0_0_1_0_0_0_1_0.wav \
              ./download/waves_yesno/0_0_1_0_0_1_1_1.wav
      - name: Test decoding with HL
        shell: bash
        working-directory: ${{github.workspace}}
        run: |
          export PYTHONPATH=$PWD:$PYTHONPATH
          echo $PYTHONPATH
          cd egs/yesno/ASR
          python3 ./tdnn/export.py --epoch 14 --avg 2 --jit 1
          python3 ./tdnn/jit_pretrained_decode_with_HL.py \
              --nn-model ./tdnn/exp/cpu_jit.pt \
              --HL ./data/lang_phone/HL.fst \
              --words ./data/lang_phone/words.txt \
              ./download/waves_yesno/0_0_0_1_0_0_0_1.wav \
              ./download/waves_yesno/0_0_1_0_0_0_1_0.wav \
              ./download/waves_yesno/0_0_1_0_0_1_1_1.wav
      - name: Show generated files
        shell: bash
        working-directory: ${{github.workspace}}
        run: |
          cd egs/yesno/ASR
          ls -lh tdnn/exp
          ls -lh data/lang_phone
--- a/.gitignore
+++ b/.gitignore
@ -34,3 +34,5 @@ node_modules
 *.param
 *.bin
 .DS_Store
 *.fst
 *.arpa
--- a/docs/source/model-export/export-ncnn.rst
+++ b/docs/source/model-export/export-ncnn.rst
@ -1,3 +1,5 @@
 .. _icefall_export_to_ncnn:
 Export to ncnn
 ==============
--- a/egs/librispeech/ASR/lstm_transducer_stateless/test_model.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/test_model.py
@ -57,8 +57,7 @@ def test_model():
    convert_scaled_to_non_scaled(model, inplace=True)
-    if not os.path.exists(params.exp_dir):
+    params.exp_dir.mkdir(exist_ok=True)
        os.path.mkdir(params.exp_dir)
    encoder_filename = params.exp_dir / "encoder_jit_trace.pt"
    export_encoder_model_jit_trace(model.encoder, encoder_filename)
--- a/egs/yesno/ASR/local/prepare_lang_fst.py
+++ b/egs/yesno/ASR/local/prepare_lang_fst.py
@ -0,0 +1,67 @@
 #!/usr/bin/env python3
 # Copyright (c)  2023  Xiaomi Corporation (authors: Fangjun Kuang)
 """
 This script takes as input data/lang_phone containing lexicon_disambig.txt,
 tokens.txt, and words.txt and generates the following files:
    - H.fst
    - HL.fst
 TODO(fangjun): Generate HLG.fst
 Note that saved files are in OpenFst binary format.
 """
 from pathlib import Path
 import kaldifst
 from icefall.ctc import (
    Lexicon,
    add_disambig_self_loops,
    add_one,
    build_standard_ctc_topo,
    make_lexicon_fst_with_silence,
 )
 def main():
    """Build ``H.fst`` and ``HL.fst`` for the yesno recipe.

    The lexicon is read from ``data/lang_phone`` (relative to the current
    working directory) and both graphs are written into that directory in
    OpenFst binary format.
    """
    lang_dir = Path("data/lang_phone")
    lexicon = Lexicon(lang_dir)
    max_token_id = max(lexicon.tokens)
    H = build_standard_ctc_topo(max_token_id=max_token_id)
    # We need to add one to all tokens since we want to use ID 0
    # for epsilon
    add_one(H, treat_ilabel_zero_specially=False, update_olabel=True)
    # H.fst is saved here, i.e. without the disambig self-loops added below.
    H.write(f"{lang_dir}/H.fst")
    # Now for HL
    L = make_lexicon_fst_with_silence(lexicon, attach_symbol_table=False)
    # We also need to change the input labels of L
    add_one(L, treat_ilabel_zero_specially=True, update_olabel=False)
    # Invoke add_disambig_self_loops() so that it eats the disambig symbols
    # from L after composition.
    #
    # All labels were shifted by one by add_one() above, so BOTH ends of the
    # disambig range must be shifted; otherwise the last disambig symbol
    # (shifted ID ``max_disambig_id + 1``) gets no self-loop in H.
    # NOTE(review): assumes ``end`` is the inclusive ID of the last disambig
    # symbol -- confirm against add_disambig_self_loops().
    add_disambig_self_loops(
        H,
        start=lexicon.token2id["#0"] + 1,
        end=lexicon.max_disambig_id + 1,
    )
    # compose() needs H sorted on output labels and L on input labels.
    kaldifst.arcsort(H, sort_type="olabel")
    kaldifst.arcsort(L, sort_type="ilabel")
    HL = kaldifst.compose(H, L)
    # Note: We are not composing L with G, so there is no need to add
    # self-loops to L to handle #0
    HL.write(f"{lang_dir}/HL.fst")
 # Build the graphs only when this file is executed as a script.
 if __name__ == "__main__":
    main()
--- a/egs/yesno/ASR/prepare.sh
+++ b/egs/yesno/ASR/prepare.sh
@ -60,6 +60,7 @@ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
  ) > $lang_dir/lexicon.txt
  ./local/prepare_lang.py
  ./local/prepare_lang_fst.py
 fi
 if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
--- a/egs/yesno/ASR/tdnn/jit_pretrained.py
+++ b/egs/yesno/ASR/tdnn/jit_pretrained.py
@ -156,7 +156,6 @@ def main():
    features = pad_sequence(features, batch_first=True, padding_value=math.log(1e-10))
    # Note: We don't use key padding mask for attention during decoding
    nnet_output = model(features)
    batch_size = nnet_output.shape[0]
--- a/egs/yesno/ASR/tdnn/jit_pretrained_decode_with_H.py
+++ b/egs/yesno/ASR/tdnn/jit_pretrained_decode_with_H.py
@ -0,0 +1,210 @@
 #!/usr/bin/env python3
 # Copyright      2023  Xiaomi Corp.        (authors: Fangjun Kuang)
 """
 This file shows how to use a torchscript model for decoding with H
 on CPU using OpenFST and decoders from kaldi.
 Usage:
  ./tdnn/jit_pretrained_decode_with_H.py \
    --nn-model ./tdnn/exp/cpu_jit.pt \
    --H ./data/lang_phone/H.fst \
    --tokens ./data/lang_phone/tokens.txt \
    ./download/waves_yesno/0_0_0_1_0_0_0_1.wav \
    ./download/waves_yesno/0_0_1_0_0_0_1_0.wav \
    ./download/waves_yesno/0_0_1_0_0_1_1_1.wav
 Note that to generate ./tdnn/exp/cpu_jit.pt,
 you can use ./export.py --jit 1
 """
 import argparse
 import logging
 import math
 from typing import Dict, List
 import kaldifeat
 import kaldifst
 import torch
 import torchaudio
 from kaldi_hmm_gmm import FasterDecoder, FasterDecoderOptions
 from torch.nn.utils.rnn import pad_sequence
 from icefall.ctc import CtcDecodable
 def get_parser():
    """Create the command-line parser of this script.

    ``--nn-model``, ``--tokens`` and ``--H`` are required options; at least
    one sound file must be given as a positional argument.
    """
    cli = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    add = cli.add_argument
    add(
        "--nn-model",
        type=str,
        required=True,
        help="""Path to the torchscript model.
        You can use ./tdnn/export.py --jit 1
        to obtain it
        """,
    )
    add("--tokens", type=str, required=True, help="Path to tokens.txt")
    add("--H", type=str, required=True, help="Path to H.fst")
    sound_files_help = (
        "The input sound file(s) to transcribe. "
        "Supported formats are those supported by torchaudio.load(). "
        "For example, wav and flac are supported. "
    )
    add("sound_files", type=str, nargs="+", help=sound_files_help)
    return cli
 def read_tokens(tokens_txt: str) -> Dict[int, str]:
    """Read ``tokens.txt`` and return a mapping from token ID to token.

    Each non-empty line must contain exactly two whitespace-separated
    fields: ``token ID``. Blank lines (for instance a trailing empty line)
    are ignored.

    Args:
      tokens_txt:
        Path to tokens.txt.
    Returns:
      Return a dict mapping integer IDs to token symbols.
    Raises:
      ValueError: If a non-empty line does not have exactly two fields or
        its second field is not an integer.
    """
    id2token = {}
    with open(tokens_txt, encoding="utf-8") as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Tolerate blank lines instead of failing on the unpack below
                continue
            token, idx = fields
            id2token[int(idx)] = token
    return id2token
 def read_sound_files(
    filenames: List[str], expected_sample_rate: float
 ) -> List[torch.Tensor]:
    """Load sound files as a list of 1-D tensors, one per file.

    Files whose sample rate differs from ``expected_sample_rate`` are
    resampled to it. Only the first channel of each file is kept.

    Args:
      filenames:
        A list of sound filenames.
      expected_sample_rate:
        The expected sample rate of the sound files.
    Returns:
      Return a list of 1-D torch tensors, in the same order as ``filenames``.
    """

    def _first_channel(path):
        samples, rate = torchaudio.load(path)
        if rate == expected_sample_rate:
            return samples[0].contiguous()
        samples = torchaudio.functional.resample(
            samples,
            orig_freq=rate,
            new_freq=expected_sample_rate,
        )
        return samples[0].contiguous()

    return [_first_channel(path) for path in filenames]
 def decode(
    filename: str,
    nnet_output: torch.Tensor,
    H: kaldifst.StdVectorFst,
    id2token: Dict[int, str],
 ) -> List[str]:
    """Decode one utterance with H and return its token sequence.

    Args:
      filename:
        Name of the sound file; used only in failure messages.
      nnet_output:
        The 2-D network output of this utterance. It is wrapped in a
        CtcDecodable.
      H:
        The decoding graph loaded from H.fst.
      id2token:
        Map from token ID (as in tokens.txt) to token symbol.
    Returns:
      Return the decoded tokens with "SIL" removed. An empty list is
      returned if decoding fails.
    """
    decodable = CtcDecodable(nnet_output)
    decoder = FasterDecoder(H, FasterDecoderOptions())
    decoder.decode(decodable)
    if not decoder.reached_final():
        logging.warning(f"failed to decode {filename}")
        return []
    ok, best_path = decoder.get_best_path()
    if not ok:
        logging.warning(f"failed to get best path for {filename}")
        return []
    (
        ok,
        _isymbols_out,
        osymbols_out,
        _total_weight,
    ) = kaldifst.get_linear_symbol_sequence(best_path)
    if not ok:
        logging.warning(f"failed to get linear symbol sequence for {filename}")
        return []
    # Token IDs are shifted by 1 during graph construction (ID 0 is reserved
    # for epsilon), so subtract 1 before looking them up in tokens.txt
    tokens = (id2token[i - 1] for i in osymbols_out)
    return [t for t in tokens if t != "SIL"]
@torch.no_grad()
def main():
    """Transcribe the sound files given on the command line using H.

    The torchscript model and H.fst are loaded on CPU, fbank features are
    computed for all files as one padded batch, and each row of the network
    output is decoded separately. The results are logged.
    """
    parser = get_parser()
    args = parser.parse_args()

    device = torch.device("cpu")
    logging.info(f"device: {device}")

    logging.info("Loading torchscript model")
    model = torch.jit.load(args.nn_model)
    model.eval()
    model.to(device)

    logging.info(f"Loading H from {args.H}")
    H = kaldifst.StdVectorFst.read(args.H)

    sample_rate = 8000

    logging.info("Constructing Fbank computer")
    opts = kaldifeat.FbankOptions()
    opts.device = device
    opts.frame_opts.dither = 0
    opts.frame_opts.snip_edges = False
    opts.frame_opts.samp_freq = sample_rate
    opts.mel_opts.num_bins = 23
    fbank = kaldifeat.Fbank(opts)

    logging.info(f"Reading sound files: {args.sound_files}")
    waves = read_sound_files(
        filenames=args.sound_files, expected_sample_rate=sample_rate
    )
    waves = [w.to(device) for w in waves]

    logging.info("Decoding started")
    features = fbank(waves)
    features = pad_sequence(features, batch_first=True, padding_value=math.log(1e-10))
    # NOTE(review): each row below still includes the frames added by
    # pad_sequence() -- presumably harmless for this recipe; confirm.
    nnet_output = model(features)

    id2token = read_tokens(args.tokens)

    hyps = []
    for i in range(nnet_output.shape[0]):
        hyp = decode(
            # Row i belongs to sound file i, so failure messages must name
            # that file and not always the first one
            filename=args.sound_files[i],
            nnet_output=nnet_output[i],
            H=H,
            id2token=id2token,
        )
        hyps.append(hyp)

    s = "\n"
    for filename, hyp in zip(args.sound_files, hyps):
        words = " ".join(hyp)
        s += f"{filename}:\n{words}\n\n"
    logging.info(s)

    logging.info("Decoding Done")
 if __name__ == "__main__":
    # Configure logging before calling main() so that its logging.info()
    # messages are emitted; each record shows time, level and source location.
    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    logging.basicConfig(format=formatter, level=logging.INFO)
    main()
--- a/egs/yesno/ASR/tdnn/jit_pretrained_decode_with_HL.py
+++ b/egs/yesno/ASR/tdnn/jit_pretrained_decode_with_HL.py
@ -0,0 +1,209 @@
 #!/usr/bin/env python3
 # Copyright      2023  Xiaomi Corp.        (authors: Fangjun Kuang)
 """
 This file shows how to use a torchscript model for decoding with HL
 on CPU using OpenFST and decoders from kaldi.
 Usage:
  ./tdnn/jit_pretrained_decode_with_HL.py \
    --nn-model ./tdnn/exp/cpu_jit.pt \
    --HL ./data/lang_phone/HL.fst \
    --words ./data/lang_phone/words.txt \
    ./download/waves_yesno/0_0_0_1_0_0_0_1.wav \
    ./download/waves_yesno/0_0_1_0_0_0_1_0.wav \
    ./download/waves_yesno/0_0_1_0_0_1_1_1.wav
 Note that to generate ./tdnn/exp/cpu_jit.pt,
 you can use ./export.py --jit 1
 """
 import argparse
 import logging
 import math
 from typing import Dict, List
 import kaldifeat
 import kaldifst
 import torch
 import torchaudio
 from kaldi_hmm_gmm import FasterDecoder, FasterDecoderOptions
 from torch.nn.utils.rnn import pad_sequence
 from icefall.ctc import CtcDecodable
 def get_parser():
    """Create the command-line parser of this script.

    ``--nn-model``, ``--words`` and ``--HL`` are required options; at least
    one sound file must be given as a positional argument.
    """
    cli = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    add = cli.add_argument
    add(
        "--nn-model",
        type=str,
        required=True,
        help="""Path to the torchscript model.
        You can use ./tdnn/export.py --jit 1
        to obtain it
        """,
    )
    add("--words", type=str, required=True, help="Path to words.txt")
    add("--HL", type=str, required=True, help="Path to HL.fst")
    sound_files_help = (
        "The input sound file(s) to transcribe. "
        "Supported formats are those supported by torchaudio.load(). "
        "For example, wav and flac are supported. "
    )
    add("sound_files", type=str, nargs="+", help=sound_files_help)
    return cli
 def read_words(words_txt: str) -> Dict[int, str]:
    """Read ``words.txt`` and return a mapping from word ID to word.

    Each non-empty line must contain exactly two whitespace-separated
    fields: ``word ID``. Blank lines (for instance a trailing empty line)
    are ignored.

    Args:
      words_txt:
        Path to words.txt.
    Returns:
      Return a dict mapping integer IDs to word symbols.
    Raises:
      ValueError: If a non-empty line does not have exactly two fields or
        its second field is not an integer.
    """
    id2word = {}
    with open(words_txt, encoding="utf-8") as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Tolerate blank lines instead of failing on the unpack below
                continue
            word, idx = fields
            id2word[int(idx)] = word
    return id2word
 def read_sound_files(
    filenames: List[str], expected_sample_rate: float
 ) -> List[torch.Tensor]:
    """Load sound files as a list of 1-D tensors, one per file.

    Files whose sample rate differs from ``expected_sample_rate`` are
    resampled to it. Only the first channel of each file is kept.

    Args:
      filenames:
        A list of sound filenames.
      expected_sample_rate:
        The expected sample rate of the sound files.
    Returns:
      Return a list of 1-D torch tensors, in the same order as ``filenames``.
    """

    def _first_channel(path):
        samples, rate = torchaudio.load(path)
        if rate == expected_sample_rate:
            return samples[0].contiguous()
        samples = torchaudio.functional.resample(
            samples,
            orig_freq=rate,
            new_freq=expected_sample_rate,
        )
        return samples[0].contiguous()

    return [_first_channel(path) for path in filenames]
 def decode(
    filename: str,
    nnet_output: torch.Tensor,
    HL: kaldifst.StdVectorFst,
    id2word: Dict[int, str],
 ) -> List[str]:
    """Decode one utterance with HL and return its word sequence.

    Args:
      filename:
        Name of the sound file; used only in failure messages.
      nnet_output:
        The 2-D network output of this utterance. It is wrapped in a
        CtcDecodable.
      HL:
        The decoding graph loaded from HL.fst.
      id2word:
        Map from word ID (as in words.txt) to word symbol.
    Returns:
      Return the decoded words with "<SIL>" removed. An empty list is
      returned if decoding fails.
    """
    decodable = CtcDecodable(nnet_output)
    decoder = FasterDecoder(HL, FasterDecoderOptions())
    decoder.decode(decodable)
    if not decoder.reached_final():
        logging.warning(f"failed to decode {filename}")
        return []
    ok, best_path = decoder.get_best_path()
    if not ok:
        logging.warning(f"failed to get best path for {filename}")
        return []
    (
        ok,
        _isymbols_out,
        osymbols_out,
        _total_weight,
    ) = kaldifst.get_linear_symbol_sequence(best_path)
    if not ok:
        logging.warning(f"failed to get linear symbol sequence for {filename}")
        return []
    # Output labels of HL are looked up directly in words.txt (no shift)
    words = (id2word[i] for i in osymbols_out)
    return [w for w in words if w != "<SIL>"]
@torch.no_grad()
def main():
    """Transcribe the sound files given on the command line using HL.

    The torchscript model and HL.fst are loaded on CPU, fbank features are
    computed for all files as one padded batch, and each row of the network
    output is decoded separately. The results are logged.
    """
    parser = get_parser()
    args = parser.parse_args()

    device = torch.device("cpu")
    logging.info(f"device: {device}")

    logging.info("Loading torchscript model")
    model = torch.jit.load(args.nn_model)
    model.eval()
    model.to(device)

    logging.info(f"Loading HL from {args.HL}")
    HL = kaldifst.StdVectorFst.read(args.HL)

    sample_rate = 8000

    logging.info("Constructing Fbank computer")
    opts = kaldifeat.FbankOptions()
    opts.device = device
    opts.frame_opts.dither = 0
    opts.frame_opts.snip_edges = False
    opts.frame_opts.samp_freq = sample_rate
    opts.mel_opts.num_bins = 23
    fbank = kaldifeat.Fbank(opts)

    logging.info(f"Reading sound files: {args.sound_files}")
    waves = read_sound_files(
        filenames=args.sound_files, expected_sample_rate=sample_rate
    )
    waves = [w.to(device) for w in waves]

    logging.info("Decoding started")
    features = fbank(waves)
    features = pad_sequence(features, batch_first=True, padding_value=math.log(1e-10))
    # NOTE(review): each row below still includes the frames added by
    # pad_sequence() -- presumably harmless for this recipe; confirm.
    nnet_output = model(features)

    id2word = read_words(args.words)

    hyps = []
    for i in range(nnet_output.shape[0]):
        hyp = decode(
            # Row i belongs to sound file i, so failure messages must name
            # that file and not always the first one
            filename=args.sound_files[i],
            nnet_output=nnet_output[i],
            HL=HL,
            id2word=id2word,
        )
        hyps.append(hyp)

    s = "\n"
    for filename, hyp in zip(args.sound_files, hyps):
        words = " ".join(hyp)
        s += f"{filename}:\n{words}\n\n"
    logging.info(s)

    logging.info("Decoding Done")
 if __name__ == "__main__":
    # Configure logging before calling main() so that its logging.info()
    # messages are emitted; each record shows time, level and source location.
    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    logging.basicConfig(format=formatter, level=logging.INFO)
    main()
--- a/icefall/ctc/.gitignore
+++ b/icefall/ctc/.gitignore
@ -0,0 +1,2 @@
 *.pdf
 *.gv
--- a/icefall/ctc/README.md
+++ b/icefall/ctc/README.md
@ -0,0 +1,17 @@
 # Introduction
 This folder uses [kaldifst][kaldifst] for graph construction
 and decoders from [kaldi-hmm-gmm][kaldi-hmm-gmm] for CTC decoding.
 It supports only `CPU`.
 You can use
 ```bash
 pip install kaldifst kaldi-hmm-gmm
 ```
 to install the dependencies.
 [kaldi-hmm-gmm]: https://github.com/csukuangfj/kaldi-hmm-gmm
 [kaldifst]: https://github.com/k2-fsa/kaldifst
 [k2]: https://github.com/k2-fsa/k2
--- a/icefall/ctc/init.py
+++ b/icefall/ctc/init.py
@ -0,0 +1,7 @@
 from .decodable import CtcDecodable
 from .prepare_lang import (
    Lexicon,
    make_lexicon_fst_no_silence,
    make_lexicon_fst_with_silence,
 )
 from .topo import add_disambig_self_loops, add_one, build_standard_ctc_topo
--- a/icefall/ctc/decodable.py
+++ b/icefall/ctc/decodable.py
@ -0,0 +1,30 @@
 # Copyright    2023  Xiaomi Corp.        (author: Fangjun Kuang)
 import torch
 from kaldi_hmm_gmm import DecodableInterface
 class CtcDecodable(DecodableInterface):
    """Expose a 2-D network output through the decodable interface of
    https://github.com/kaldi-asr/kaldi/blob/master/src/itf/decodable-itf.h

    The first axis of the tensor is indexed by frame and the second one by
    ``index - 1``.
    """

    def __init__(self, nnet_output: torch.Tensor):
        DecodableInterface.__init__(self)
        assert nnet_output.ndim == 2, nnet_output.shape
        self.nnet_output = nnet_output

    def log_likelihood(self, frame: int, index: int) -> float:
        # The input labels of H are incremented by one during graph
        # construction, hence the ``index - 1`` when reading the tensor.
        frame_scores = self.nnet_output[frame]
        return frame_scores[index - 1].item()

    def is_last_frame(self, frame: int) -> bool:
        last_frame = self.nnet_output.shape[0] - 1
        return frame == last_frame

    def num_frames_ready(self) -> int:
        num_frames = self.nnet_output.shape[0]
        return num_frames

    def num_indices(self) -> int:
        num_columns = self.nnet_output.shape[1]
        return num_columns
--- a/icefall/ctc/prepare_lang.py
+++ b/icefall/ctc/prepare_lang.py
@ -0,0 +1,334 @@
 # Copyright    2023  Xiaomi Corp.        (author: Fangjun Kuang)
 """
 The lang_dir should contain the following files:
 - "lexicon_disambig.txt"
 - "tokens.txt"
 - "words.txt"
 """
 import math
 import re
 from collections import defaultdict
 from pathlib import Path
 from typing import Iterator, List, Tuple, Union
 import kaldifst
 class Lexicon:
    """Lexicon, tokens and words read from a lang directory.

    Everything is loaded in the constructor. None of the methods defined
    here modifies the loaded data afterwards.
    """

    def __init__(
        self,
        lang_dir: Union[str, Path],
        disambig_pattern: Union[str, re.Pattern] = re.compile(r"^#\d+$"),
    ):
        """
        Args:
          lang_dir:
            The path to the lang directory. We expect that it contains the
            following files:
              - lexicon_disambig.txt
              - tokens.txt
              - words.txt
            The format of the above files is described below.
            (1) lexicon_disambig.txt
            Each line in the lexicon_disambig.txt has the following format:
                word token1 token2 ... tokenN
            That is, the first field is the word, the remaining fields are
            pronunciations of this word. Fields are separated by space(s).
            (2) tokens.txt
            Each line in tokens.txt has two fields separated by space(s):
                token ID
            The first field is the token symbol and the second field is the
            integer ID of the token.
            (3) words.txt
            Each line in words.txt has two fields separated by space(s):
                word ID
            The first field is the word symbol and the second field is the
            integer ID of the word.
          disambig_pattern:
            The pattern matching disambiguation symbols. It can be either a
            compiled pattern or a regular expression string.
        """
        lang_dir = Path(lang_dir)
        lexicon_txt = lang_dir / "lexicon_disambig.txt"
        tokens_txt = lang_dir / "tokens.txt"
        words_txt = lang_dir / "words.txt"
        assert lexicon_txt.is_file(), lexicon_txt
        assert tokens_txt.is_file(), tokens_txt
        assert words_txt.is_file(), words_txt
        self._read_lexicon(lexicon_txt)
        self._read_tokens(tokens_txt)
        self._read_words(words_txt)
        if isinstance(disambig_pattern, str):
            # Accept a plain regex string as well as a compiled pattern
            disambig_pattern = re.compile(disambig_pattern)
        self.disambig_pattern = disambig_pattern
        # Largest ID among the disambiguation symbols; -1 if there are none
        max_disambig_id = -1
        for s, i in self.token2id.items():
            if self.disambig_pattern.match(s) and i > max_disambig_id:
                max_disambig_id = i
        self.max_disambig_id = max_disambig_id

    def _read_lexicon(self, lexicon_txt: Union[str, Path]):
        """Read lexicon_disambig.txt into ``self.word2phones``.

        ``self.word2phones`` maps a word to a list of pronunciations; each
        pronunciation is a string of tokens joined by single spaces.
        """
        word2phones = defaultdict(list)
        with open(lexicon_txt, encoding="utf-8") as f:
            for line in f:
                word_phones = line.strip().split()
                assert len(word_phones) >= 2, (word_phones, line)
                word = word_phones[0]
                phones: str = " ".join(word_phones[1:])
                # We use a list here since a word may have multiple
                # pronunciations
                word2phones[word].append(phones)
        self.word2phones = word2phones

    def _read_tokens(self, tokens_txt: Union[str, Path]):
        """Read tokens.txt into ``self.token2id`` and ``self.id2token``."""
        token2id = dict()
        id2token = dict()
        with open(tokens_txt, encoding="utf-8") as f:
            for line in f:
                token_id = line.strip().split()
                assert len(token_id) == 2, token_id
                token = token_id[0]
                idx = int(token_id[1])
                assert token not in token2id, f"Duplicate token {line}"
                assert idx not in id2token, f"Duplicate ID {line}"
                token2id[token] = idx
                id2token[idx] = token
        self.token2id = token2id
        self.id2token = id2token

    def _read_words(self, words_txt: Union[str, Path]):
        """Read words.txt into ``self.word2id`` and ``self.id2word``."""
        word2id = dict()
        id2word = dict()
        with open(words_txt, encoding="utf-8") as f:
            for line in f:
                word_id = line.strip().split()
                assert len(word_id) == 2, word_id
                word = word_id[0]
                idx = int(word_id[1])
                assert word not in word2id, f"Duplicate word {line}"
                assert idx not in id2word, f"Duplicate ID {line}"
                word2id[word] = idx
                id2word[idx] = word
        self.word2id = word2id
        self.id2word = id2word

    def __iter__(self) -> Iterator[Tuple[str, str]]:
        """Yield ``(word, phones)`` for every pronunciation of every word.

        ``phones`` is a single string with tokens separated by spaces.
        """
        for word, phones_list in self.word2phones.items():
            for phones in phones_list:
                yield word, phones

    def __str__(self):
        return str(self.word2phones)

    @property
    def tokens(self) -> List[int]:
        """Return a sorted list of token IDs excluding those from
        disambiguation symbols.
        Caution:
          0 is not a token ID so it is excluded from the return value.
        """
        return sorted(
            i
            for s, i in self.token2id.items()
            if i != 0 and not self.disambig_pattern.match(s)
        )
 # See also
 # http://vpanayotov.blogspot.com/2012/06/kaldi-decoding-graph-construction.html
 def make_lexicon_fst_with_silence(
    lexicon: Lexicon,
    sil_prob: float = 0.5,
    sil_phone: str = "SIL",
    attach_symbol_table: bool = True,
 ) -> kaldifst.StdVectorFst:
    """Build the lexicon FST L with optional silence between words.

    The FST has three fixed states: a start state, a loop state (the only
    final state) and a silence state. From the start state and after every
    word, the path either goes directly to the loop state (cost
    ``-log(1 - sil_prob)``) or to the silence state (cost ``-log(sil_prob)``),
    from which ``sil_phone`` leads back to the loop state.

    Args:
      lexicon:
        The lexicon providing pronunciations, ``token2id`` and ``word2id``.
      sil_prob:
        Probability of silence; it must be strictly between 0 and 1.
      sil_phone:
        The silence token. It must be present in ``lexicon.token2id``.
      attach_symbol_table:
        True to attach input (token) and output (word) symbol tables to
        the returned FST.
    Returns:
      Return the lexicon FST. Input labels are token IDs and output labels
      are word IDs; 0 is used for epsilon.
    Raises:
      ValueError: If ``sil_prob`` is not strictly between 0 and 1.
    """
    phone2id = lexicon.token2id
    word2id = lexicon.word2id
    assert sil_phone in phone2id, sil_phone
    # Fail with a clear message instead of a "math domain error" from
    # math.log() below.
    if not 0.0 < sil_prob < 1.0:
        raise ValueError(
            f"sil_prob must be strictly between 0 and 1, got {sil_prob}"
        )
    sil_cost = -1 * math.log(sil_prob)
    no_sil_cost = -1 * math.log(1.0 - sil_prob)
    fst = kaldifst.StdVectorFst()
    start_state = fst.add_state()
    loop_state = fst.add_state()
    sil_state = fst.add_state()
    fst.start = start_state
    fst.set_final(state=loop_state, weight=0)
    # start -> loop: no silence at the beginning
    fst.add_arc(
        state=start_state,
        arc=kaldifst.StdArc(
            ilabel=0,
            olabel=0,
            weight=no_sil_cost,
            nextstate=loop_state,
        ),
    )
    # start -> sil: silence at the beginning
    fst.add_arc(
        state=start_state,
        arc=kaldifst.StdArc(
            ilabel=0,
            olabel=0,
            weight=sil_cost,
            nextstate=sil_state,
        ),
    )
    # sil -> loop: consume the silence token, emit nothing
    fst.add_arc(
        state=sil_state,
        arc=kaldifst.StdArc(
            ilabel=phone2id[sil_phone],
            olabel=0,
            weight=0,
            nextstate=loop_state,
        ),
    )
    for word, phones in lexicon:
        phoneseq = phones.split()
        pron_cost = 0
        cur_state = loop_state
        # All tokens but the last one: a chain of fresh states. The word ID
        # is emitted on the first arc only.
        for i in range(len(phoneseq) - 1):
            next_state = fst.add_state()
            fst.add_arc(
                state=cur_state,
                arc=kaldifst.StdArc(
                    ilabel=phone2id[phoneseq[i]],
                    olabel=word2id[word] if i == 0 else 0,
                    weight=pron_cost if i == 0 else 0,
                    nextstate=next_state,
                ),
            )
            cur_state = next_state
        i = len(phoneseq) - 1  # note: i == -1 if phoneseq is empty.
        # Last token: one arc back to the loop state (no silence after the
        # word) and one arc to the silence state (silence after the word).
        fst.add_arc(
            state=cur_state,
            arc=kaldifst.StdArc(
                ilabel=phone2id[phoneseq[i]] if i >= 0 else 0,
                olabel=word2id[word] if i <= 0 else 0,
                weight=no_sil_cost + (pron_cost if i <= 0 else 0),
                nextstate=loop_state,
            ),
        )
        fst.add_arc(
            state=cur_state,
            arc=kaldifst.StdArc(
                ilabel=phone2id[phoneseq[i]] if i >= 0 else 0,
                olabel=word2id[word] if i <= 0 else 0,
                weight=sil_cost + (pron_cost if i <= 0 else 0),
                nextstate=sil_state,
            ),
        )
    if attach_symbol_table:
        isym = kaldifst.SymbolTable()
        for p, i in phone2id.items():
            isym.add_symbol(symbol=p, key=i)
        fst.input_symbols = isym
        osym = kaldifst.SymbolTable()
        for w, i in word2id.items():
            osym.add_symbol(symbol=w, key=i)
        fst.output_symbols = osym
    return fst
 def make_lexicon_fst_no_silence(
    lexicon: Lexicon,
    attach_symbol_table: bool = True,
 ) -> kaldifst.StdVectorFst:
    """Build the lexicon FST L without any silence modelling.

    The FST has a single state that is both the start state and the only
    final state. Every pronunciation in ``lexicon`` is a path leaving that
    state and returning to it.

    Args:
      lexicon:
        The lexicon providing pronunciations, ``token2id`` and ``word2id``.
      attach_symbol_table:
        True to attach input (token) and output (word) symbol tables to
        the returned FST.
    Returns:
      Return the lexicon FST. Input labels are token IDs and output labels
      are word IDs; 0 is used where nothing is consumed or emitted.
    """
    phone2id = lexicon.token2id
    word2id = lexicon.word2id
    fst = kaldifst.StdVectorFst()
    start_state = fst.add_state()
    fst.start = start_state
    fst.set_final(state=start_state, weight=0)
    for word, phones in lexicon:
        phoneseq = phones.split()
        # Pronunciations carry no extra cost here
        pron_cost = 0
        cur_state = start_state
        # All tokens but the last one: a chain of fresh states. The word ID
        # is emitted on the first arc only.
        for i in range(len(phoneseq) - 1):
            next_state = fst.add_state()
            fst.add_arc(
                state=cur_state,
                arc=kaldifst.StdArc(
                    ilabel=phone2id[phoneseq[i]],
                    olabel=word2id[word] if i == 0 else 0,
                    weight=pron_cost if i == 0 else 0,
                    nextstate=next_state,
                ),
            )
            cur_state = next_state
        i = len(phoneseq) - 1  # note: i == -1 if phoneseq is empty.
        # Last token: close the path by returning to the start state. For a
        # single-token pronunciation (i == 0) this arc also emits the word.
        fst.add_arc(
            state=cur_state,
            arc=kaldifst.StdArc(
                ilabel=phone2id[phoneseq[i]] if i >= 0 else 0,
                olabel=word2id[word] if i <= 0 else 0,
                weight=pron_cost if i <= 0 else 0,
                nextstate=start_state,
            ),
        )
    if attach_symbol_table:
        # Input symbols are the tokens, output symbols are the words
        isym = kaldifst.SymbolTable()
        for p, i in phone2id.items():
            isym.add_symbol(symbol=p, key=i)
        fst.input_symbols = isym
        osym = kaldifst.SymbolTable()
        for w, i in word2id.items():
            osym.add_symbol(symbol=w, key=i)
        fst.output_symbols = osym
    return fst
--- a/icefall/ctc/test_ctc_topo.py
+++ b/icefall/ctc/test_ctc_topo.py
@ -0,0 +1,93 @@
 #!/usr/bin/env python3
 # Copyright      2023  Xiaomi Corp.        (authors: Fangjun Kuang)
 from pathlib import Path
 import graphviz
 import kaldifst
 from prepare_lang import Lexicon, make_lexicon_fst_with_silence
 from topo import add_disambig_self_loops, add_one, build_standard_ctc_topo
 def test_yesno():
    """Render H, H with disambig self-loops, and HL of yesno to PDF files.

    The test is skipped (with a message) if the lang directory is missing.
    """
    # NOTE(review): developer-specific absolute path; on any other machine
    # this test is silently skipped.
    lang_dir = "/Users/fangjun/open-source/icefall/egs/yesno/ASR/data/lang_phone"
    if not Path(lang_dir).is_dir():
        print(f"{lang_dir} does not exist! Skip testing")
        return
    lexicon = Lexicon(lang_dir)
    max_token_id = max(lexicon.tokens)
    H = build_standard_ctc_topo(max_token_id=max_token_id)
    isym = kaldifst.SymbolTable()
    isym.add_symbol(symbol="<blk>", key=0)
    for i in range(1, max_token_id + 1):
        isym.add_symbol(symbol=lexicon.id2token[i], key=i)
    osym = kaldifst.SymbolTable()
    osym.add_symbol(symbol="<eps>", key=0)
    for i in range(1, max_token_id + 1):
        osym.add_symbol(symbol=lexicon.id2token[i], key=i)
    H.input_symbols = isym
    H.output_symbols = osym
    fst_dot = kaldifst.draw(H, acceptor=False, portrait=True)
    source = graphviz.Source(fst_dot)
    source.render(outfile="standard_ctc_topo_yesno.pdf")
    # See the link below to visualize the above PDF
    # https://t.ly/7uXZ9
    # Now test HL
    # We need to add one to all tokens since we want to use ID 0
    # for epsilon
    add_one(H, treat_ilabel_zero_specially=False, update_olabel=True)
    # All labels were shifted by one above, so both ends of the disambig
    # range are shifted as well; otherwise the last disambig symbol
    # (shifted ID ``max_disambig_id + 1``) gets no self-loop.
    # NOTE(review): assumes ``end`` is the inclusive ID of the last disambig
    # symbol -- confirm against add_disambig_self_loops().
    add_disambig_self_loops(
        H,
        start=lexicon.token2id["#0"] + 1,
        end=lexicon.max_disambig_id + 1,
    )
    fst_dot = kaldifst.draw(H, acceptor=False, portrait=True)
    source = graphviz.Source(fst_dot)
    source.render(outfile="standard_ctc_topo_disambig_yesno.pdf")
    L = make_lexicon_fst_with_silence(lexicon)
    # We also need to change the input labels of L
    add_one(L, treat_ilabel_zero_specially=True, update_olabel=False)
    H.output_symbols = None
    kaldifst.arcsort(H, sort_type="olabel")
    kaldifst.arcsort(L, sort_type="ilabel")
    HL = kaldifst.compose(H, L)
    lexicon.id2token[0] = "<blk>"
    lexicon.token2id["<blk>"] = 0
    # Input symbols of HL are the tokens shifted by one; 0 is epsilon
    isym = kaldifst.SymbolTable()
    isym.add_symbol(symbol="<eps>", key=0)
    for i in range(0, lexicon.max_disambig_id + 1):
        isym.add_symbol(symbol=lexicon.id2token[i], key=i + 1)
    osym = kaldifst.SymbolTable()
    for i, word in lexicon.id2word.items():
        osym.add_symbol(symbol=word, key=i)
    HL.input_symbols = isym
    HL.output_symbols = osym
    fst_dot = kaldifst.draw(HL, acceptor=False, portrait=True)
    source = graphviz.Source(fst_dot)
    source.render(outfile="HL_yesno.pdf")
 def main():
    """Run the tests of this file."""
    test_yesno()
 # Run the tests only when this file is executed as a script.
 if __name__ == "__main__":
    main()
--- a/icefall/ctc/test_prepare_lang.py
+++ b/icefall/ctc/test_prepare_lang.py
@ -0,0 +1,43 @@
 #!/usr/bin/env python3
 # Copyright      2023  Xiaomi Corp.        (authors: Fangjun Kuang)
 from pathlib import Path
 import graphviz
 import kaldifst
 from prepare_lang import Lexicon, make_lexicon_fst_with_silence
 def test_yesno():
    """Render the lexicon FST (with silence) of yesno to ``L_yesno.pdf``.

    The test is skipped (with a message) if the lang directory is missing.
    """
    lang_dir = "/Users/fangjun/open-source/icefall/egs/yesno/ASR/data/lang_phone"
    if not Path(lang_dir).is_dir():
        print(f"{lang_dir} does not exist! Skip testing")
        return

    def symbol_table_from(id2symbol):
        # One entry per (ID, symbol) pair, in the iteration order of the dict
        table = kaldifst.SymbolTable()
        for key, symbol in id2symbol.items():
            table.add_symbol(symbol=symbol, key=key)
        return table

    lexicon = Lexicon(lang_dir)
    L = make_lexicon_fst_with_silence(lexicon)
    token_table = symbol_table_from(lexicon.id2token)
    word_table = symbol_table_from(lexicon.id2word)
    L.input_symbols = token_table
    L.output_symbols = word_table
    dot_text = kaldifst.draw(L, acceptor=False, portrait=True)
    graphviz.Source(dot_text).render(outfile="L_yesno.pdf")
    # See the link below to visualize the above PDF
    # https://t.ly/jMfXW
 def main():
    """Run the tests of this file."""
    test_yesno()
 # Run the tests only when this file is executed as a script.
 if __name__ == "__main__":
    main()
--- a/icefall/ctc/topo.py
+++ b/icefall/ctc/topo.py
@ -0,0 +1,138 @@
 # Copyright      2023  Xiaomi Corp.        (authors: Fangjun Kuang)
 import kaldifst
 # Note the name contains `standard`; it means there will be non-standard
 # topologies.
 def build_standard_ctc_topo(max_token_id: int) -> kaldifst.StdVectorFst:
    """Return the standard CTC topology as an FST.

    The FST has one state per token. Every state is final, state 0 is the
    start state, and there is an arc from every state to every state
    (self-loops included). An arc entering state ``k`` has input label ``k``;
    its output label is also ``k``, except on self-loops, whose output
    label is 0.

    Note that epsilon is not a token and the token with ID 0 in tokens.txt
    is not an epsilon, so input label 0 of the returned FST does not
    represent an epsilon. Use `add_one()` to modify the input/output labels
    of the returned FST.

    Args:
      max_token_id:
        Maximum valid token ID. We assume token IDs are contiguous and
        start from 0. In other words, the vocabulary size is
        ``max_token_id + 1``. We assume the ID of the blank symbol is 0.

    Returns:
      The CTC topology. Please see ./test_ctc_topo.py if you want to know
      what it looks like.
    """
    vocab_size = max_token_id + 1
    topo = kaldifst.StdVectorFst()

    # One final state per token.
    for _ in range(vocab_size):
        topo.set_final(state=topo.add_state(), weight=0)

    # The blank symbol is assumed to have ID 0; its state is the start state.
    topo.start = 0

    # Connect every state to every state.
    for src in range(vocab_size):
        for dst in range(vocab_size):
            # A self-loop (src == dst) gets output label 0.
            emitted = 0 if src == dst else dst
            transition = kaldifst.StdArc(
                ilabel=dst,
                olabel=emitted,
                weight=0,
                nextstate=dst,
            )
            topo.add_arc(state=src, arc=transition)

    return topo
 def add_one(
    fst: kaldifst.StdVectorFst,
    treat_ilabel_zero_specially: bool,
    update_olabel: bool,
 ) -> None:
    """Increase the labels of the given FST by one, in-place.

    Symbol tables attached to the FST are rebuilt to match: ``<eps>`` is
    placed at key 0 and the symbol previously at key ``i`` moves to key
    ``i + 1``. The input table is rebuilt whenever it is present; the output
    table is rebuilt only if ``update_olabel`` is True.

    Args:
      fst:
        The FST to be modified. It is changed in-place.
      treat_ilabel_zero_specially:
        If True, input label 0 is left unchanged and every other input
        label is increased by one.
        If False, every input label, including 0, is increased by one.
      update_olabel:
        If True, every non-zero output label is increased by one.
        If False, output labels are not changed.
        In either case, output label 0 is not changed.
    """

    def shifted_by_one(table):
        # Keys 0 .. num_symbols() - 1 of ``table`` are looked up one by one,
        # so ``table`` is expected to use contiguous keys starting from 0.
        shifted = kaldifst.SymbolTable()
        shifted.add_symbol(symbol="<eps>", key=0)
        for old_key in range(table.num_symbols()):
            shifted.add_symbol(symbol=table.find(old_key), key=old_key + 1)
        return shifted

    # When zero is not special, every input label is shifted.
    shift_every_ilabel = treat_ilabel_zero_specially is False

    for state in kaldifst.StateIterator(fst):
        for arc in kaldifst.ArcIterator(fst, state):
            if shift_every_ilabel or arc.ilabel != 0:
                arc.ilabel += 1

            if update_olabel and arc.olabel != 0:
                arc.olabel += 1

    if fst.input_symbols is not None:
        fst.input_symbols = shifted_by_one(fst.input_symbols)

    if update_olabel and fst.output_symbols is not None:
        fst.output_symbols = shifted_by_one(fst.output_symbols)
 def add_disambig_self_loops(fst: kaldifst.StdVectorFst, start: int, end: int):
    """Add one self-loop per disambig symbol to every state.

    Each added self-loop has input label 0, the ID of the disambig symbol as
    its output label, and weight 0. Note that input label 0 here represents
    an epsilon.

    If the FST has output symbols, the disambig symbols are also added to
    them under the names ``#0``, ``#1``, and so on.

    Args:
      fst:
        It is changed in-place.
      start:
        The ID of ``#0``.
      end:
        The ID of the last disambig symbol (inclusive). For instance, if
        there are 3 disambig symbols ``#0``, ``#1``, and ``#2``, then ``end``
        is the ID of ``#2``.
    """
    disambig_ids = range(start, end + 1)

    for state in kaldifst.StateIterator(fst):
        for disambig_id in disambig_ids:
            self_loop = kaldifst.StdArc(
                ilabel=0,
                olabel=disambig_id,
                weight=0,
                nextstate=state,
            )
            fst.add_arc(state=state, arc=self_loop)

    if not fst.output_symbols:
        return

    # The n-th disambig symbol (counting from 0) is named "#n".
    for offset, disambig_id in enumerate(disambig_ids):
        fst.output_symbols.add_symbol(symbol=f"#{offset}", key=disambig_id)
--- a/requirements-ci.txt
+++ b/requirements-ci.txt
@ -25,3 +25,4 @@ onnx
 onnxmltools
 onnxruntime
 kaldifst
 kaldi-hmm-gmm
--- a/requirements.txt
+++ b/requirements.txt
@ -1,6 +1,7 @@
 kaldifst
 kaldilm
 kaldialign
 kaldi-hmm-gmm
 sentencepiece>=0.1.96
 tensorboard
 typeguard