diff --git a/.github/scripts/run-pre-trained-conformer-ctc.sh b/.github/scripts/run-pre-trained-conformer-ctc.sh index 19cbd96fc..06efa8438 100755 --- a/.github/scripts/run-pre-trained-conformer-ctc.sh +++ b/.github/scripts/run-pre-trained-conformer-ctc.sh @@ -87,3 +87,13 @@ log "Decoding with HL on CPU with OpenFst" $repo/test_wavs/1089-134686-0001.flac \ $repo/test_wavs/1221-135766-0001.flac \ $repo/test_wavs/1221-135766-0002.flac + +log "Decoding with HLG on CPU with OpenFst" + +./conformer_ctc/jit_pretrained_decode_with_HLG.py \ + --nn-model $repo/exp/cpu_jit.pt \ + --HLG $repo/data/lang_bpe_500/HLG.fst \ + --words $repo/data/lang_bpe_500/words.txt \ + $repo/test_wavs/1089-134686-0001.flac \ + $repo/test_wavs/1221-135766-0001.flac \ + $repo/test_wavs/1221-135766-0002.flac diff --git a/egs/librispeech/ASR/conformer_ctc/jit_pretrained_decode_with_HL.py b/egs/librispeech/ASR/conformer_ctc/jit_pretrained_decode_with_HL.py index f0326ccdf..3420c4da3 100755 --- a/egs/librispeech/ASR/conformer_ctc/jit_pretrained_decode_with_HL.py +++ b/egs/librispeech/ASR/conformer_ctc/jit_pretrained_decode_with_HL.py @@ -2,12 +2,12 @@ # Copyright 2023 Xiaomi Corp. (authors: Fangjun Kuang) """ -This file shows how to use a torchscript model for decoding with H +This file shows how to use a torchscript model for decoding with HL on CPU using OpenFST and decoders from kaldi. Usage: - ./conformer_ctc/jit_pretrained_decode_with_H.py \ + ./conformer_ctc/jit_pretrained_decode_with_HL.py \ --nn-model ./conformer_ctc/exp/cpu_jit.pt \ --HL ./data/lang_bpe_500/HL.fst \ --words ./data/lang_bpe_500/words.txt \ diff --git a/egs/librispeech/ASR/conformer_ctc/jit_pretrained_decode_with_HLG.py b/egs/librispeech/ASR/conformer_ctc/jit_pretrained_decode_with_HLG.py index f0326ccdf..42129f073 100755 --- a/egs/librispeech/ASR/conformer_ctc/jit_pretrained_decode_with_HLG.py +++ b/egs/librispeech/ASR/conformer_ctc/jit_pretrained_decode_with_HLG.py @@ -2,14 +2,14 @@ # Copyright 2023 Xiaomi Corp. (authors: Fangjun Kuang) """ -This file shows how to use a torchscript model for decoding with H +This file shows how to use a torchscript model for decoding with HLG on CPU using OpenFST and decoders from kaldi. Usage: - ./conformer_ctc/jit_pretrained_decode_with_H.py \ + ./conformer_ctc/jit_pretrained_decode_with_HLG.py \ --nn-model ./conformer_ctc/exp/cpu_jit.pt \ - --HL ./data/lang_bpe_500/HL.fst \ + --HLG ./data/lang_bpe_500/HLG.fst \ --words ./data/lang_bpe_500/words.txt \ ./download/LibriSpeech/test-clean/1089/134686/1089-134686-0002.flac \ ./download/LibriSpeech/test-clean/1221/135766/1221-135766-0001.flac @@ -54,7 +54,7 @@ def get_parser(): help="Path to words.txt", ) - parser.add_argument("--HL", type=str, required=True, help="Path to HL.fst") + parser.add_argument("--HLG", type=str, required=True, help="Path to HLG.fst") parser.add_argument( "sound_files", @@ -108,7 +108,7 @@ def read_sound_files( def decode( filename: str, nnet_output: torch.Tensor, - HL: kaldifst, + HLG: kaldifst, id2word: Dict[int, str], ) -> List[str]: """ @@ -118,8 +118,8 @@ def decode( nnet_output: A 2-D float32 tensor of shape (num_frames, vocab_size). It contains output from log_softmax. - HL: - The HL graph. + HLG: + The HLG graph. word2token: A map mapping token ID to word string. Returns: @@ -129,7 +129,7 @@ def decode( decodable = DecodableCtc(nnet_output.cpu()) decoder_opts = FasterDecoderOptions(max_active=3000) - decoder = FasterDecoder(HL, decoder_opts) + decoder = FasterDecoder(HLG, decoder_opts) decoder.decode(decodable) if not decoder.reached_final(): @@ -168,8 +168,8 @@ def main(): model.eval() model.to(device) - logging.info(f"Loading HL from {args.HL}") - HL = kaldifst.StdVectorFst.read(args.HL) + logging.info(f"Loading HLG from {args.HLG}") + HLG = kaldifst.StdVectorFst.read(args.HLG) sample_rate = 16000 @@ -211,7 +211,7 @@ def main(): hyp = decode( filename=args.sound_files[i], nnet_output=nnet_output[i, : feature_lengths[i]], - HL=HL, + HLG=HLG, id2word=id2word, ) hyps.append(hyp) diff --git a/egs/librispeech/ASR/local/prepare_lang_fst.py b/egs/librispeech/ASR/local/prepare_lang_fst.py index e8401123f..7e3518dae 100755 --- a/egs/librispeech/ASR/local/prepare_lang_fst.py +++ b/egs/librispeech/ASR/local/prepare_lang_fst.py @@ -8,6 +8,7 @@ tokens.txt, and words.txt and generates the following files: - H.fst - HL.fst + - HLG.fst Note that saved files are in OpenFst binary format. @@ -56,9 +57,109 @@ def get_args(): help="True if the lexicon has silence.", ) + parser.add_argument( + "--ngram-G", + type=str, + help="""If not empty, it is the filename of G used to build HLG. + For instance, --ngram-G=./data/lm/G_3_fst.txt + """, + ) + return parser.parse_args() +def build_HL( + H: kaldifst.StdVectorFst, + L: kaldifst.StdVectorFst, + has_silence: bool, + lexicon: Lexicon, +) -> kaldifst.StdVectorFst: + if has_silence: + # We also need to change the input labels of L + add_one(L, treat_ilabel_zero_specially=True, update_olabel=False) + else: + add_one(L, treat_ilabel_zero_specially=False, update_olabel=False) + + # Invoke add_disambig_self_loops() so that it eats the disambig symbols + # from L after composition + add_disambig_self_loops( + H, + start=lexicon.token2id["#0"] + 1, + end=lexicon.max_disambig_id + 1, + ) + + kaldifst.arcsort(H, sort_type="olabel") + kaldifst.arcsort(L, sort_type="ilabel") + + HL = kaldifst.compose(H, L) + kaldifst.determinize_star(HL) + + disambig0 = lexicon.token2id["#0"] + 1 + max_disambig = lexicon.max_disambig_id + 1 + for state in kaldifst.StateIterator(HL): + for arc in kaldifst.ArcIterator(HL, state): + # If treat_ilabel_zero_specially is False, we always change it + # Otherwise, we only change non-zero input labels + if disambig0 <= arc.ilabel <= max_disambig: + arc.ilabel = 0 + + # Note: We are not composing L with G, so there is no need to add + # self-loops to L to handle #0 + + return HL + + +def build_HLG( + H: kaldifst.StdVectorFst, + L: kaldifst.StdVectorFst, + G: kaldifst.StdVectorFst, + has_silence: bool, + lexicon: Lexicon, +) -> kaldifst.StdVectorFst: + if has_silence: + # We also need to change the input labels of L + add_one(L, treat_ilabel_zero_specially=True, update_olabel=False) + else: + add_one(L, treat_ilabel_zero_specially=False, update_olabel=False) + + # add-self-loops + token_disambig0 = lexicon.token2id["#0"] + 1 + word_disambig0 = lexicon.word2id["#0"] + + kaldifst.add_self_loops(L, isyms=[token_disambig0], osyms=[word_disambig0]) + + kaldifst.arcsort(L, sort_type="olabel") + kaldifst.arcsort(G, sort_type="ilabel") + LG = kaldifst.compose(L, G) + kaldifst.determinize_star(LG) + kaldifst.minimize_encoded(LG) + + kaldifst.arcsort(LG, sort_type="ilabel") + + # Invoke add_disambig_self_loops() so that it eats the disambig symbols + # from L after composition + add_disambig_self_loops( + H, + start=lexicon.token2id["#0"] + 1, + end=lexicon.max_disambig_id + 1, + ) + + kaldifst.arcsort(H, sort_type="olabel") + + HLG = kaldifst.compose(H, LG) + kaldifst.determinize_star(HLG) + + disambig0 = lexicon.token2id["#0"] + 1 + max_disambig = lexicon.max_disambig_id + 1 + for state in kaldifst.StateIterator(HLG): + for arc in kaldifst.ArcIterator(HLG, state): + # If treat_ilabel_zero_specially is False, we always change it + # Otherwise, we only change non-zero input labels + if disambig0 <= arc.ilabel <= max_disambig: + arc.ilabel = 0 + return HLG + + def main(): args = get_args() lang_dir = args.lang_dir @@ -82,43 +183,29 @@ def main(): else: L = make_lexicon_fst_no_silence(lexicon, attach_symbol_table=False) - if args.has_silence: - # We also need to change the input labels of L - add_one(L, treat_ilabel_zero_specially=True, update_olabel=False) - else: - add_one(L, treat_ilabel_zero_specially=False, update_olabel=False) - - # Invoke add_disambig_self_loops() so that it eats the disambig symbols - # from L after composition - add_disambig_self_loops( - H, - start=lexicon.token2id["#0"] + 1, - end=lexicon.max_disambig_id + 1, - ) - with open("H_1.fst.txt", "w") as f: - print(H, file=f) - - kaldifst.arcsort(H, sort_type="olabel") - kaldifst.arcsort(L, sort_type="ilabel") - logging.info("Building HL") - HL = kaldifst.compose(H, L) - kaldifst.determinize_star(HL) - - disambig0 = lexicon.token2id["#0"] + 1 - max_disambig = lexicon.max_disambig_id + 1 - for state in kaldifst.StateIterator(HL): - for arc in kaldifst.ArcIterator(HL, state): - # If treat_ilabel_zero_specially is False, we always change it - # Otherwise, we only change non-zero input labels - if disambig0 <= arc.ilabel <= max_disambig: - arc.ilabel = 0 - - # Note: We are not composing L with G, so there is no need to add - # self-loops to L to handle #0 - + HL = build_HL( + H=H.copy(), + L=L.copy(), + has_silence=args.has_silence, + lexicon=lexicon, + ) HL.write(f"{lang_dir}/HL.fst") + if not args.ngram_G: + logging.info("Skip building HLG") + return + + logging.info("Building HLG") + with open(args.ngram_G) as f: + G = kaldifst.compile( + s=f.read(), + acceptor=False, + ) + + HLG = build_HLG(H=H, L=L, G=G, has_silence=args.has_silence, lexicon=lexicon) + HLG.write(f"{lang_dir}/HLG.fst") + if __name__ == "__main__": formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"