diff --git a/.github/scripts/run-pre-trained-conformer-ctc.sh b/.github/scripts/run-pre-trained-conformer-ctc.sh index a82d85fb2..ea400c628 100755 --- a/.github/scripts/run-pre-trained-conformer-ctc.sh +++ b/.github/scripts/run-pre-trained-conformer-ctc.sh @@ -8,7 +8,7 @@ log() { echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" } -cd egs/librispeech/ASR +pushd egs/librispeech/ASR # repo_url=https://github.com/csukuangfj/icefall-asr-conformer-ctc-bpe-500 repo_url=https://huggingface.co/csukuangfj/icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09 @@ -112,3 +112,81 @@ log "Decoding with HLG on CPU with OpenFst" $repo/test_wavs/1089-134686-0001.wav \ $repo/test_wavs/1221-135766-0001.wav \ $repo/test_wavs/1221-135766-0002.wav + +rm -rf $repo + +popd + +log "Test aishell" + +pushd egs/aishell/ASR + +repo_url=https://huggingface.co/csukuangfj/icefall_asr_aishell_conformer_ctc +log "Downloading pre-trained model from $repo_url" +GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url +repo=$(basename $repo_url) +pushd $repo + +git lfs pull --include "exp/pretrained.pt" +git lfs pull --include "data/lm/G_3_gram_char.fst.txt" + +popd + +log "Display test files" +tree $repo/ +ls -lh $repo/test_wavs/*.wav + +log "CTC decoding" + +log "Exporting model with torchscript" + +pushd $repo/exp +ln -s pretrained.pt epoch-99.pt +popd + +./conformer_ctc/export.py \ + --epoch 99 \ + --avg 1 \ + --exp-dir $repo/exp \ + --tokens $repo/data/lang_char/tokens.txt \ + --jit 1 + +ls -lh $repo/exp + +log "Generating H.fst, HL.fst" + +./local/prepare_lang_fst.py --lang-dir $repo/data/lang_char --ngram-G $repo/data/lm/G_3_gram_char.fst.txt + +ls -lh $repo/data/lang_char + +log "Decoding with H on CPU with OpenFst" + +./conformer_ctc/jit_pretrained_decode_with_H.py \ + --nn-model $repo/exp/cpu_jit.pt \ + --H $repo/data/lang_char/H.fst \ + --tokens $repo/data/lang_char/tokens.txt \ + $repo/test_wavs/0.wav \ + $repo/test_wavs/1.wav \ + $repo/test_wavs/2.wav + +log 
"Decoding with HL on CPU with OpenFst" + +./conformer_ctc/jit_pretrained_decode_with_HL.py \ + --nn-model $repo/exp/cpu_jit.pt \ + --HL $repo/data/lang_char/HL.fst \ + --words $repo/data/lang_char/words.txt \ + $repo/test_wavs/0.wav \ + $repo/test_wavs/1.wav \ + $repo/test_wavs/2.wav + +log "Decoding with HLG on CPU with OpenFst" + +./conformer_ctc/jit_pretrained_decode_with_HLG.py \ + --nn-model $repo/exp/cpu_jit.pt \ + --HLG $repo/data/lang_char/HLG.fst \ + --words $repo/data/lang_char/words.txt \ + $repo/test_wavs/0.wav \ + $repo/test_wavs/1.wav \ + $repo/test_wavs/2.wav + +rm -rf $repo diff --git a/.github/workflows/run-yesno-recipe.yml b/.github/workflows/run-yesno-recipe.yml index 400595749..7d55a50e1 100644 --- a/.github/workflows/run-yesno-recipe.yml +++ b/.github/workflows/run-yesno-recipe.yml @@ -60,7 +60,7 @@ jobs: - name: Install Python dependencies run: | - grep -v '^#' ./requirements-ci.txt | grep -v kaldifst | xargs -n 1 -L 1 pip install + grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install pip uninstall -y protobuf pip install --no-binary protobuf protobuf==3.20.* diff --git a/egs/aishell/ASR/conformer_ctc/export.py b/egs/aishell/ASR/conformer_ctc/export.py old mode 100644 new mode 100755 index 1df3cfdc2..49871d437 --- a/egs/aishell/ASR/conformer_ctc/export.py +++ b/egs/aishell/ASR/conformer_ctc/export.py @@ -23,12 +23,12 @@ import argparse import logging from pathlib import Path +import k2 import torch from conformer import Conformer from icefall.checkpoint import average_checkpoints, load_checkpoint -from icefall.lexicon import Lexicon -from icefall.utils import AttributeDict, str2bool +from icefall.utils import AttributeDict, num_tokens, str2bool def get_parser(): @@ -63,11 +63,10 @@ def get_parser(): ) parser.add_argument( - "--lang-dir", + "--tokens", type=str, - default="data/lang_char", - help="""It contains language related input files such as "lexicon.txt" - """, + required=True, + help="Path to the tokens.txt.", ) 
parser.add_argument( @@ -98,16 +97,16 @@ def get_params() -> AttributeDict: def main(): args = get_parser().parse_args() args.exp_dir = Path(args.exp_dir) - args.lang_dir = Path(args.lang_dir) params = get_params() params.update(vars(args)) - logging.info(params) + # Load tokens.txt here + token_table = k2.SymbolTable.from_file(params.tokens) - lexicon = Lexicon(params.lang_dir) - max_token_id = max(lexicon.tokens) - num_classes = max_token_id + 1 # +1 for the blank + num_classes = num_tokens(token_table) + 1 # +1 for the blank + + logging.info(params) device = torch.device("cpu") if torch.cuda.is_available(): diff --git a/egs/aishell/ASR/conformer_ctc/jit_pretrained_decode_with_H.py b/egs/aishell/ASR/conformer_ctc/jit_pretrained_decode_with_H.py new file mode 120000 index 000000000..896b78aef --- /dev/null +++ b/egs/aishell/ASR/conformer_ctc/jit_pretrained_decode_with_H.py @@ -0,0 +1 @@ +../../../librispeech/ASR/conformer_ctc/jit_pretrained_decode_with_H.py \ No newline at end of file diff --git a/egs/aishell/ASR/conformer_ctc/jit_pretrained_decode_with_HL.py b/egs/aishell/ASR/conformer_ctc/jit_pretrained_decode_with_HL.py new file mode 120000 index 000000000..aa1b6073d --- /dev/null +++ b/egs/aishell/ASR/conformer_ctc/jit_pretrained_decode_with_HL.py @@ -0,0 +1 @@ +../../../librispeech/ASR/conformer_ctc/jit_pretrained_decode_with_HL.py \ No newline at end of file diff --git a/egs/aishell/ASR/conformer_ctc/jit_pretrained_decode_with_HLG.py b/egs/aishell/ASR/conformer_ctc/jit_pretrained_decode_with_HLG.py new file mode 120000 index 000000000..0cf42ce30 --- /dev/null +++ b/egs/aishell/ASR/conformer_ctc/jit_pretrained_decode_with_HLG.py @@ -0,0 +1 @@ +../../../librispeech/ASR/conformer_ctc/jit_pretrained_decode_with_HLG.py \ No newline at end of file diff --git a/egs/aishell/ASR/conformer_ctc/test_transformer.py b/egs/aishell/ASR/conformer_ctc/test_transformer.py old mode 100644 new mode 100755 diff --git a/egs/aishell/ASR/local/prepare_lang_fst.py 
b/egs/aishell/ASR/local/prepare_lang_fst.py new file mode 120000 index 000000000..c5787c534 --- /dev/null +++ b/egs/aishell/ASR/local/prepare_lang_fst.py @@ -0,0 +1 @@ +../../../librispeech/ASR/local/prepare_lang_fst.py \ No newline at end of file diff --git a/egs/aishell/ASR/prepare.sh b/egs/aishell/ASR/prepare.sh index ff8e1301d..9de060e73 100755 --- a/egs/aishell/ASR/prepare.sh +++ b/egs/aishell/ASR/prepare.sh @@ -143,6 +143,7 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then ./local/prepare_lang.py --lang-dir $lang_phone_dir fi + # Train a bigram P for MMI training if [ ! -f $lang_phone_dir/transcript_words.txt ]; then log "Generate data to train phone based bigram P" @@ -203,6 +204,10 @@ if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then if [ ! -f $lang_char_dir/L_disambig.pt ]; then ./local/prepare_char.py --lang-dir $lang_char_dir fi + + if [ ! -f $lang_char_dir/HLG.fst ]; then + ./local/prepare_lang_fst.py --lang-dir $lang_char_dir --ngram-G ./data/lm/G_3_gram.fst.txt + fi fi if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then diff --git a/egs/librispeech/ASR/conformer_ctc/jit_pretrained_decode_with_H.py b/egs/librispeech/ASR/conformer_ctc/jit_pretrained_decode_with_H.py index b52c7cfed..8dd856a4e 100755 --- a/egs/librispeech/ASR/conformer_ctc/jit_pretrained_decode_with_H.py +++ b/egs/librispeech/ASR/conformer_ctc/jit_pretrained_decode_with_H.py @@ -7,6 +7,8 @@ on CPU using OpenFST and decoders from kaldi.
Usage: +(1) LibriSpeech conformer_ctc + ./conformer_ctc/jit_pretrained_decode_with_H.py \ --nn-model ./conformer_ctc/exp/cpu_jit.pt \ --H ./data/lang_bpe_500/H.fst \ @@ -14,6 +16,17 @@ Usage: ./download/LibriSpeech/test-clean/1089/134686/1089-134686-0002.flac \ ./download/LibriSpeech/test-clean/1221/135766/1221-135766-0001.flac + +(2) AIShell conformer_ctc + + ./conformer_ctc/jit_pretrained_decode_with_H.py \ + --nn-model ./conformer_ctc/exp/cpu_jit.pt \ + --H ./data/lang_char/H.fst \ + --tokens ./data/lang_char/tokens.txt \ + ./BAC009S0764W0121.wav \ + ./BAC009S0764W0122.wav \ + ./BAC009S0764W0123.wav + Note that to generate ./conformer_ctc/exp/cpu_jit.pt, you can use ./export.py --jit 1 """ @@ -23,12 +36,11 @@ import logging import math from typing import Dict, List -import kaldi_hmm_gmm import kaldifeat import kaldifst import torch import torchaudio -from kaldi_hmm_gmm import DecodableCtc, FasterDecoder, FasterDecoderOptions +from kaldi_decoder import DecodableCtc, FasterDecoder, FasterDecoderOptions from torch.nn.utils.rnn import pad_sequence diff --git a/egs/librispeech/ASR/conformer_ctc/jit_pretrained_decode_with_HL.py b/egs/librispeech/ASR/conformer_ctc/jit_pretrained_decode_with_HL.py index 3420c4da3..796e19661 100755 --- a/egs/librispeech/ASR/conformer_ctc/jit_pretrained_decode_with_HL.py +++ b/egs/librispeech/ASR/conformer_ctc/jit_pretrained_decode_with_HL.py @@ -7,6 +7,8 @@ on CPU using OpenFST and decoders from kaldi. 
Usage: +(1) LibriSpeech conformer_ctc + ./conformer_ctc/jit_pretrained_decode_with_HL.py \ --nn-model ./conformer_ctc/exp/cpu_jit.pt \ --HL ./data/lang_bpe_500/HL.fst \ @@ -14,6 +16,17 @@ Usage: ./download/LibriSpeech/test-clean/1089/134686/1089-134686-0002.flac \ ./download/LibriSpeech/test-clean/1221/135766/1221-135766-0001.flac +(2) AIShell conformer_ctc + + ./conformer_ctc/jit_pretrained_decode_with_HL.py \ + --nn-model ./conformer_ctc/exp/cpu_jit.pt \ + --HL ./data/lang_char/HL.fst \ + --words ./data/lang_char/words.txt \ + ./BAC009S0764W0121.wav \ + ./BAC009S0764W0122.wav \ + ./BAC009S0764W0123.wav + + Note that to generate ./conformer_ctc/exp/cpu_jit.pt, you can use ./export.py --jit 1 """ @@ -23,12 +36,11 @@ import logging import math from typing import Dict, List -import kaldi_hmm_gmm import kaldifeat import kaldifst import torch import torchaudio -from kaldi_hmm_gmm import DecodableCtc, FasterDecoder, FasterDecoderOptions +from kaldi_decoder import DecodableCtc, FasterDecoder, FasterDecoderOptions from torch.nn.utils.rnn import pad_sequence diff --git a/egs/librispeech/ASR/conformer_ctc/jit_pretrained_decode_with_HLG.py b/egs/librispeech/ASR/conformer_ctc/jit_pretrained_decode_with_HLG.py index 42129f073..0024d5c9c 100755 --- a/egs/librispeech/ASR/conformer_ctc/jit_pretrained_decode_with_HLG.py +++ b/egs/librispeech/ASR/conformer_ctc/jit_pretrained_decode_with_HLG.py @@ -7,6 +7,8 @@ on CPU using OpenFST and decoders from kaldi. 
Usage: +(1) LibriSpeech conformer_ctc + ./conformer_ctc/jit_pretrained_decode_with_HLG.py \ --nn-model ./conformer_ctc/exp/cpu_jit.pt \ --HLG ./data/lang_bpe_500/HLG.fst \ @@ -14,6 +16,16 @@ Usage: ./download/LibriSpeech/test-clean/1089/134686/1089-134686-0002.flac \ ./download/LibriSpeech/test-clean/1221/135766/1221-135766-0001.flac +(2) AIShell conformer_ctc + + ./conformer_ctc/jit_pretrained_decode_with_HLG.py \ + --nn-model ./conformer_ctc/exp/cpu_jit.pt \ + --HLG ./data/lang_char/HLG.fst \ + --words ./data/lang_char/words.txt \ + ./BAC009S0764W0121.wav \ + ./BAC009S0764W0122.wav \ + ./BAC009S0764W0123.wav + Note that to generate ./conformer_ctc/exp/cpu_jit.pt, you can use ./export.py --jit 1 """ @@ -23,12 +35,11 @@ import logging import math from typing import Dict, List -import kaldi_hmm_gmm import kaldifeat import kaldifst import torch import torchaudio -from kaldi_hmm_gmm import DecodableCtc, FasterDecoder, FasterDecoderOptions +from kaldi_decoder import DecodableCtc, FasterDecoder, FasterDecoderOptions from torch.nn.utils.rnn import pad_sequence diff --git a/egs/yesno/ASR/tdnn/jit_pretrained_decode_with_H.py b/egs/yesno/ASR/tdnn/jit_pretrained_decode_with_H.py index 209ab477a..ff8c742af 100755 --- a/egs/yesno/ASR/tdnn/jit_pretrained_decode_with_H.py +++ b/egs/yesno/ASR/tdnn/jit_pretrained_decode_with_H.py @@ -28,7 +28,7 @@ import kaldifeat import kaldifst import torch import torchaudio -from kaldi_hmm_gmm import DecodableCtc, FasterDecoder, FasterDecoderOptions +from kaldi_decoder import DecodableCtc, FasterDecoder, FasterDecoderOptions from torch.nn.utils.rnn import pad_sequence diff --git a/egs/yesno/ASR/tdnn/jit_pretrained_decode_with_HL.py b/egs/yesno/ASR/tdnn/jit_pretrained_decode_with_HL.py index 74864e17d..05ba74f9a 100755 --- a/egs/yesno/ASR/tdnn/jit_pretrained_decode_with_HL.py +++ b/egs/yesno/ASR/tdnn/jit_pretrained_decode_with_HL.py @@ -28,7 +28,7 @@ import kaldifeat import kaldifst import torch import torchaudio -from kaldi_hmm_gmm import 
DecodableCtc, FasterDecoder, FasterDecoderOptions +from kaldi_decoder import DecodableCtc, FasterDecoder, FasterDecoderOptions from torch.nn.utils.rnn import pad_sequence diff --git a/icefall/ctc/README.md b/icefall/ctc/README.md index 07b0ff8cd..0096bc096 100644 --- a/icefall/ctc/README.md +++ b/icefall/ctc/README.md @@ -1,17 +1,17 @@ # Introduction This folder uses [kaldifst][kaldifst] for graph construction -and decoders from [kaldi-hmm-gmm][kaldi-hmm-gmm] for CTC decoding. +and decoders from [kaldi-decoder][kaldi-decoder] for CTC decoding. It supports only `CPU`. You can use ```bash -pip install kaldifst kaldi-hmm-gmm +pip install kaldifst kaldi-decoder ``` to install the dependencies. -[kaldi-hmm-gmm]: https://github.com/csukuangfj/kaldi-hmm-gmm +[kaldi-decoder]: https://github.com/i2-fsa/kaldi-decoder [kaldifst]: https://github.com/k2-fsa/kaldifst [k2]: https://github.com/k2-fsa/k2 diff --git a/requirements.txt b/requirements.txt index c031d683c..5a8326619 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ kaldifst kaldilm kaldialign -kaldi-hmm-gmm +kaldi-decoder sentencepiece>=0.1.96 tensorboard typeguard