From 21096e99d8f0cd5ddd9dd03a35b44c7334848687 Mon Sep 17 00:00:00 2001
From: Fangjun Kuang
Date: Wed, 10 Nov 2021 14:32:52 +0800
Subject: [PATCH] Update result for the librispeech recipe using vocab size
 500 and att rate 0.8 (#113)

* Update RESULTS using vocab size 500, att rate 0.8

* Update README.

* Refactoring.

Since FSAs in an Nbest object are linear in structure, we can
sum the scores along each path to compute its total score.

* Update documentation.

* Change default vocab size from 5000 to 500.
---
 README.md                                     |  13 +-
 .../recipes/librispeech/conformer_ctc.rst     | 603 ++++++++++++------
 egs/librispeech/ASR/RESULTS.md                |  59 ++
 egs/librispeech/ASR/conformer_ctc/ali.py      |   2 +-
 egs/librispeech/ASR/conformer_ctc/decode.py   |  25 +-
 egs/librispeech/ASR/conformer_ctc/export.py   |   2 +-
 .../ASR/conformer_ctc/pretrained.py           |   2 +-
 egs/librispeech/ASR/conformer_ctc/train.py    |  14 +-
 .../ASR/conformer_ctc/transformer.py          |  14 +-
 icefall/decode.py                             |  49 +-
 10 files changed, 525 insertions(+), 258 deletions(-)

diff --git a/README.md b/README.md
index 298feca2e..51c0cee32 100644
--- a/README.md
+++ b/README.md
@@ -39,9 +39,10 @@ and [TDNN LSTM CTC model][LibriSpeech_tdnn_lstm_ctc].

 The best WER we currently have is:

-||test-clean|test-other|
-|--|--|--|
-|WER| 2.57% | 5.94% |
+| | test-clean | test-other |
+|-----|------------|------------|
+| WER | 2.42% | 5.73% |
+

 We provide a Colab notebook to run a pre-trained conformer CTC model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1huyupXAcHsUrKaWfI83iMEJ6J0Nh0213?usp=sharing)

@@ -49,9 +50,9 @@ We provide a Colab notebook to run a pre-trained conformer CTC model: [![Open In

 The WER for this model is:

-||test-clean|test-other|
-|--|--|--|
-|WER| 6.59% | 17.69% |
+| | test-clean | test-other |
+|-----|------------|------------|
+| WER | 6.59% | 17.69% |

 We provide a Colab notebook to run a pre-trained TDNN LSTM CTC model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1kNmDXNMwREi0rZGAOIAOJo93REBuOTcd?usp=sharing)

diff --git a/docs/source/recipes/librispeech/conformer_ctc.rst b/docs/source/recipes/librispeech/conformer_ctc.rst
index 0c3c76c23..5acc4092b 100644
--- a/docs/source/recipes/librispeech/conformer_ctc.rst
+++ b/docs/source/recipes/librispeech/conformer_ctc.rst
@@ -304,9 +304,6 @@ The commonly used options are:

       $ cd egs/librispeech/ASR
       $ ./conformer_ctc/decode.py --method ctc-decoding --max-duration 300
       # Caution: The above command is tested with a model with vocab size 500.
-      # The default settings in the master will not work.
-      # Please see https://github.com/k2-fsa/icefall/issues/103
-      # We will fix it later and delete this note.

 And the following command uses attention decoder for rescoring:

@@ -386,7 +383,7 @@ Pre-trained Model
 -----------------

 We have uploaded a pre-trained model to
-`<https://huggingface.co/pkufool/icefall_asr_librispeech_conformer_ctc>`_.
+`<https://huggingface.co/csukuangfj/icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09>`_

 We describe how to use the pre-trained model to transcribe a sound file
 or multiple sound files in the following.

@@ -408,14 +405,13 @@ The following commands describe how to download the pre-trained model:

 .. code-block:: bash

    $ cd egs/librispeech/ASR
-   $ mkdir tmp
-   $ cd tmp
-   $ git lfs install
-   $ git clone https://huggingface.co/pkufool/icefall_asr_librispeech_conformer_ctc
+   $ git clone https://huggingface.co/csukuangfj/icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09
+   $ cd icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09
+   $ git lfs pull

.. CAUTION::
-   You have to use ``git lfs`` to download the pre-trained model.
+   You have to use ``git lfs pull`` to download the pre-trained model.
    Otherwise, you will have the following issue when running ``decode.py``:

    .. code-block::

@@ -426,10 +422,9 @@ The following commands describe how to download the pre-trained model:

    .. code-block:: bash

-     cd icefall_asr_librispeech_conformer_ctc
+     cd icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09
      git lfs pull

-
 .. CAUTION::

    In order to use this pre-trained model, your k2 version has to be v1.9 or later.

@@ -439,46 +434,52 @@ After downloading, you will have the following files:

 .. code-block:: bash

    $ cd egs/librispeech/ASR
-   $ tree tmp
+   $ tree icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09

 .. code-block:: bash

-   tmp
-   `-- icefall_asr_librispeech_conformer_ctc
-       |-- README.md
-       |-- data
-       |   |-- lang_bpe
-       |   |   |-- HLG.pt
-       |   |   |-- bpe.model
-       |   |   |-- tokens.txt
-       |   |   `-- words.txt
-       |   `-- lm
-       |       `-- G_4_gram.pt
-       |-- exp
-       |   `-- pretrained.pt
-       `-- test_wavs
-           |-- 1089-134686-0001.flac
-           |-- 1221-135766-0001.flac
-           |-- 1221-135766-0002.flac
-           `-- trans.txt
+   icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09
+   |-- README.md
+   |-- data
+   |   |-- lang_bpe_500
+   |   |   |-- HLG.pt
+   |   |   |-- HLG_modified.pt
+   |   |   |-- bpe.model
+   |   |   |-- tokens.txt
+   |   |   `-- words.txt
+   |   `-- lm
+   |       `-- G_4_gram.pt
+   |-- exp
+   |   |-- cpu_jit.pt
+   |   `-- pretrained.pt
+   |-- log
+   |   `-- log-decode-2021-11-09-17-38-28
+   `-- test_wavs
+       |-- 1089-134686-0001.wav
+       |-- 1221-135766-0001.wav
+       |-- 1221-135766-0002.wav
+       `-- trans.txt

-   6 directories, 11 files

 **File descriptions**:

-   - ``data/lang_bpe/HLG.pt``
+   - ``data/lang_bpe_500/HLG.pt``

     It is the decoding graph.

+   - ``data/lang_bpe_500/HLG_modified.pt``
+
+     It is a decoding graph built with a modified CTC topology.
+
-   - ``data/lang_bpe/bpe.model``
+   - ``data/lang_bpe_500/bpe.model``

     It is a sentencepiece model. You can use it to reproduce our results.

-   - ``data/lang_bpe/tokens.txt``
+   - ``data/lang_bpe_500/tokens.txt``

     It contains tokens and their IDs, generated from ``bpe.model``.
     Provided only for convenience so that you can look up the SOS/EOS ID easily.

-   - ``data/lang_bpe/words.txt``
+   - ``data/lang_bpe_500/words.txt``

     It contains words and their IDs.

@@ -489,49 +490,55 @@

   - ``exp/pretrained.pt``

     It contains pre-trained model parameters, obtained by averaging
-    checkpoints from ``epoch-15.pt`` to ``epoch-34.pt``.
+    checkpoints from ``epoch-23.pt`` to ``epoch-77.pt``.
     Note: We have removed optimizer ``state_dict`` to reduce file size.

-  - ``test_waves/*.flac``
+  - ``exp/cpu_jit.pt``
+
+    It contains a torch-scripted model that can be deployed in C++.
+
+  - ``test_wavs/*.wav``

     It contains some test sound files from LibriSpeech ``test-clean`` dataset.

-  - ``test_waves/trans.txt``
+  - ``test_wavs/trans.txt``

-    It contains the reference transcripts for the sound files in ``test_waves/``.
+    It contains the reference transcripts for the sound files in ``test_wavs/``.
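If you want to look up the SOS/EOS ID programmatically rather than reading
``tokens.txt``, a small sketch like the following works. It assumes the
``sentencepiece`` Python package is installed; ``<sos/eos>`` is the symbol
this recipe's BPE models use for both SOS and EOS (check ``tokens.txt`` if
in doubt):

.. code-block:: python

   import sentencepiece as spm

   sp = spm.SentencePieceProcessor()
   sp.load("icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lang_bpe_500/bpe.model")

   # The same symbol serves as both SOS and EOS in this recipe.
   sos_eos_id = sp.piece_to_id("<sos/eos>")
   print(sos_eos_id)  # 1 for this model, matching --sos-id/--eos-id below

The information of the test sound files is listed below:

..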
code-block:: bash - $ soxi tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/*.flac + $ soxi icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/*.wav - Input File : 'tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1089-134686-0001.flac' + Input File : 'icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1089-134686-0001.wav' Channels : 1 Sample Rate : 16000 Precision : 16-bit Duration : 00:00:06.62 = 106000 samples ~ 496.875 CDDA sectors - File Size : 116k - Bit Rate : 140k - Sample Encoding: 16-bit FLAC + File Size : 212k + Bit Rate : 256k + Sample Encoding: 16-bit Signed Integer PCM - Input File : 'tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1221-135766-0001.flac' + + Input File : 'icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0001.wav' Channels : 1 Sample Rate : 16000 Precision : 16-bit Duration : 00:00:16.71 = 267440 samples ~ 1253.62 CDDA sectors - File Size : 343k - Bit Rate : 164k - Sample Encoding: 16-bit FLAC + File Size : 535k + Bit Rate : 256k + Sample Encoding: 16-bit Signed Integer PCM - Input File : 'tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1221-135766-0002.flac' + + Input File : 'icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0002.wav' Channels : 1 Sample Rate : 16000 Precision : 16-bit Duration : 00:00:04.83 = 77200 samples ~ 361.875 CDDA sectors - File Size : 105k - Bit Rate : 174k - Sample Encoding: 16-bit FLAC + File Size : 154k + Bit Rate : 256k + Sample Encoding: 16-bit Signed Integer PCM Total Duration of 3 files: 00:00:28.16 @@ -564,38 +571,37 @@ The command to run CTC decoding is: $ cd egs/librispeech/ASR $ ./conformer_ctc/pretrained.py \ - --checkpoint ./tmp/icefall_asr_librispeech_conformer_ctc/exp/pretrained.pt \ - --bpe-model ./tmp/icefall_asr_librispeech_conformer_ctc/data/lang_bpe/bpe.model \ - --method ctc-decoding \ - ./tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1089-134686-0001.flac \ - ./tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1221-135766-0001.flac \ - ./tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1221-135766-0002.flac + --checkpoint ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/exp/pretrained.pt \ + --bpe-model ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lang_bpe_500/bpe.model \ + --method ctc-decoding \ + --num-classes 500 \ + ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1089-134686-0001.wav \ + ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0001.wav \ + ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0002.wav The output is given below: .. 
code-block:: - 2021-10-13 11:21:50,896 INFO [pretrained.py:236] device: cuda:0 - 2021-10-13 11:21:50,896 INFO [pretrained.py:238] Creating model - 2021-10-13 11:21:56,669 INFO [pretrained.py:255] Constructing Fbank computer - 2021-10-13 11:21:56,670 INFO [pretrained.py:265] Reading sound files: ['./tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1089-134686-0001.flac', './tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1221-135766-0001.flac', './tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1221-135766-0002.flac'] - 2021-10-13 11:21:56,683 INFO [pretrained.py:271] Decoding started - 2021-10-13 11:21:57,341 INFO [pretrained.py:290] Building CTC topology - 2021-10-13 11:21:57,625 INFO [lexicon.py:113] Loading pre-compiled tmp/icefall_asr_librispeech_conformer_ctc/data/lang_bpe/Linv.pt - 2021-10-13 11:21:57,679 INFO [pretrained.py:299] Loading BPE model - 2021-10-13 11:22:00,076 INFO [pretrained.py:314] Use CTC decoding - 2021-10-13 11:22:00,087 INFO [pretrained.py:400] - ./tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1089-134686-0001.flac: - AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS + 2021-11-10 12:12:29,554 INFO [pretrained.py:260] {'sample_rate': 16000, 'subsampling_factor': 4, 'vgg_frontend': False, 'use_feat_batchnorm': True, 'feature_dim': 80, 'nhead': 8, 'attention_dim': 512, 'num_decoder_layers': 0, 'search_beam': 20, 'output_beam': 8, 'min_active_states': 30, 'max_active_states': 10000, 'use_double_scores': True, 'checkpoint': './icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/exp/pretrained.pt', 'words_file': None, 'HLG': None, 'bpe_model': './icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lang_bpe_500/bpe.model', 'method': 'ctc-decoding', 'G': None, 'num_paths': 100, 'ngram_lm_scale': 1.3, 'attention_decoder_scale': 1.2, 'nbest_scale': 0.5, 'sos_id': 1, 'num_classes': 500, 'eos_id': 1, 'sound_files': ['./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1089-134686-0001.wav', './icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0001.wav', './icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0002.wav'], 'env_info': {'k2-version': '1.9', 'k2-build-type': 'Release', 'k2-with-cuda': True, 'k2-git-sha1': '7178d67e594bc7fa89c2b331ad7bd1c62a6a9eb4', 'k2-git-date': 'Tue Oct 26 22:12:54 2021', 'lhotse-version': '0.11.0.dev+missing.version.file', 'torch-cuda-available': True, 'torch-cuda-version': '10.1', 'python-version': '3.8', 'icefall-git-branch': 'bpe-500', 'icefall-git-sha1': '8d93169-dirty', 'icefall-git-date': 'Wed Nov 10 11:52:44 2021', 'icefall-path': '/ceph-fj/fangjun/open-source-2/icefall-fix', 'k2-path': '/ceph-fj/fangjun/open-source-2/k2-bpe-500/k2/python/k2/__init__.py', 'lhotse-path': '/ceph-fj/fangjun/open-source-2/lhotse-bpe-500/lhotse/__init__.py'}} + 2021-11-10 12:12:29,554 INFO [pretrained.py:266] device: cuda:0 + 2021-11-10 12:12:29,554 INFO [pretrained.py:268] Creating model + 2021-11-10 12:12:35,600 INFO [pretrained.py:285] Constructing Fbank computer + 2021-11-10 12:12:35,601 INFO [pretrained.py:295] Reading sound files: ['./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1089-134686-0001.wav', './icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0001.wav', './icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0002.wav'] + 2021-11-10 12:12:35,758 INFO [pretrained.py:301] Decoding 
started + 2021-11-10 12:12:36,025 INFO [pretrained.py:319] Use CTC decoding + 2021-11-10 12:12:36,204 INFO [pretrained.py:425] + ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1089-134686-0001.wav: + AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROFFELS - ./tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1221-135766-0001.flac: - GOD AS A DIRECT CONSEQUENCE OF THE SIN WHICH MAN THUS PUNISHED HAD GIVEN HER A LOVELY CHILD WHOSE PLACE WAS ON THAT SAME DISHONOURED - BOSOM TO CONNECT HER PARENT FOR EVER WITH THE RACE AND DESCENT OF MORTALS AND TO BE FINALLY A BLESSED SOUL IN HEAVEN + ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0001.wav: + GOD AS A DIRECT CONSEQUENCE OF THE SIN WHICH MAN THUS PUNISHED HAD GIVEN HER A LOVELY CHILD WHOSE PLACE WAS ON THAT SAME DISHONORED B + OSOM TO CONNECT HER PARENT FOREVER WITH THE RACE AND DESCENT OF MORTALS AND TO BE FINALLY A BLESSED SOUL IN HEAVEN - ./tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1221-135766-0002.flac: + ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0002.wav: YET THESE THOUGHTS AFFECTED HESTER PRYNNE LESS WITH HOPE THAN APPREHENSION - 2021-10-13 11:22:00,087 INFO [pretrained.py:402] Decoding Done + 2021-11-10 12:12:36,204 INFO [pretrained.py:427] Decoding Done HLG decoding ^^^^^^^^^^^^ @@ -608,36 +614,39 @@ The command to run HLG decoding is: $ cd egs/librispeech/ASR $ ./conformer_ctc/pretrained.py \ - --checkpoint ./tmp/icefall_asr_librispeech_conformer_ctc/exp/pretrained.pt \ - --words-file ./tmp/icefall_asr_librispeech_conformer_ctc/data/lang_bpe/words.txt \ - --HLG ./tmp/icefall_asr_librispeech_conformer_ctc/data/lang_bpe/HLG.pt \ - ./tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1089-134686-0001.flac \ - ./tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1221-135766-0001.flac \ - ./tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1221-135766-0002.flac + --checkpoint ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/exp/pretrained.pt \ + --words-file ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lang_bpe_500/words.txt \ + --method 1best \ + --num-classes 500 \ + --HLG ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lang_bpe_500/HLG.pt \ + ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1089-134686-0001.wav \ + ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0001.wav \ + ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0002.wav The output is given below: .. 
code-block:: - 2021-10-13 11:25:19,458 INFO [pretrained.py:236] device: cuda:0 - 2021-10-13 11:25:19,458 INFO [pretrained.py:238] Creating model - 2021-10-13 11:25:25,342 INFO [pretrained.py:255] Constructing Fbank computer - 2021-10-13 11:25:25,343 INFO [pretrained.py:265] Reading sound files: ['./tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1089-134686-0001.flac', './tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1221-135766-0001.flac', './tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1221-135766-0002.flac'] - 2021-10-13 11:25:25,356 INFO [pretrained.py:271] Decoding started - 2021-10-13 11:25:26,026 INFO [pretrained.py:327] Loading HLG from ./tmp/icefall_asr_librispeech_conformer_ctc/data/lang_bpe/HLG.pt - 2021-10-13 11:25:33,735 INFO [pretrained.py:359] Use HLG decoding - 2021-10-13 11:25:34,013 INFO [pretrained.py:400] - ./tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1089-134686-0001.flac: + 2021-11-10 13:33:03,723 INFO [pretrained.py:260] {'sample_rate': 16000, 'subsampling_factor': 4, 'vgg_frontend': False, 'use_feat_batchnorm': True, 'feature_dim': 80, 'nhead': 8, 'attention_dim': 512, 'num_decoder_layers': 0, 'search_beam': 20, 'output_beam': 8, 'min_active_states': 30, 'max_active_states': 10000, 'use_double_scores': True, 'checkpoint': './icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/exp/pretrained.pt', 'words_file': './icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lang_bpe_500/words.txt', 'HLG': './icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lang_bpe_500/HLG.pt', 'bpe_model': None, 'method': '1best', 'G': None, 'num_paths': 100, 'ngram_lm_scale': 1.3, 'attention_decoder_scale': 1.2, 'nbest_scale': 0.5, 'sos_id': 1, 'num_classes': 500, 'eos_id': 1, 'sound_files': ['./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1089-134686-0001.wav', './icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0001.wav', './icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0002.wav'], 'env_info': {'k2-version': '1.9', 'k2-build-type': 'Release', 'k2-with-cuda': True, 'k2-git-sha1': '7178d67e594bc7fa89c2b331ad7bd1c62a6a9eb4', 'k2-git-date': 'Tue Oct 26 22:12:54 2021', 'lhotse-version': '0.11.0.dev+missing.version.file', 'torch-cuda-available': True, 'torch-cuda-version': '10.1', 'python-version': '3.8', 'icefall-git-branch': 'bpe-500', 'icefall-git-sha1': '8d93169-dirty', 'icefall-git-date': 'Wed Nov 10 11:52:44 2021', 'icefall-path': '/ceph-fj/fangjun/open-source-2/icefall-fix', 'k2-path': '/ceph-fj/fangjun/open-source-2/k2-bpe-500/k2/python/k2/__init__.py', 'lhotse-path': '/ceph-fj/fangjun/open-source-2/lhotse-bpe-500/lhotse/__init__.py'}} + 2021-11-10 13:33:03,723 INFO [pretrained.py:266] device: cuda:0 + 2021-11-10 13:33:03,723 INFO [pretrained.py:268] Creating model + 2021-11-10 13:33:09,775 INFO [pretrained.py:285] Constructing Fbank computer + 2021-11-10 13:33:09,776 INFO [pretrained.py:295] Reading sound files: ['./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1089-134686-0001.wav', './icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0001.wav', './icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0002.wav'] + 2021-11-10 13:33:09,881 INFO [pretrained.py:301] Decoding started + 2021-11-10 13:33:09,951 INFO [pretrained.py:352] Loading HLG from ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lang_bpe_500/HLG.pt + 
2021-11-10 13:33:13,234 INFO [pretrained.py:384] Use HLG decoding
+ 2021-11-10 13:33:13,571 INFO [pretrained.py:425]
+ ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1089-134686-0001.wav:
 AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS

- ./tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1221-135766-0001.flac:
- GOD AS A DIRECT CONSEQUENCE OF THE SIN WHICH MAN THUS PUNISHED HAD GIVEN HER A LOVELY CHILD WHOSE PLACE WAS ON THAT SAME DISHONOURED
- BOSOM TO CONNECT HER PARENT FOR EVER WITH THE RACE AND DESCENT OF MORTALS AND TO BE FINALLY A BLESSED SOUL IN HEAVEN
+ ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0001.wav:
+ GOD AS A DIRECT CONSEQUENCE OF THE SIN WHICH MAN THUS PUNISHED HAD GIVEN HER A LOVELY CHILD WHOSE PLACE WAS ON THAT SAME DISHONORED BOSOM TO CONNECT HER PARENT FOREVER WITH THE RACE AND DESCENT OF MORTALS AND TO BE FINALLY A BLESSED SOUL IN HEAVEN

- ./tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1221-135766-0002.flac:
+ ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0002.wav:
 YET THESE THOUGHTS AFFECTED HESTER PRYNNE LESS WITH HOPE THAN APPREHENSION

- 2021-10-13 11:25:34,014 INFO [pretrained.py:402] Decoding Done
+ 2021-11-10 13:33:13,571 INFO [pretrained.py:427] Decoding Done
+

 HLG decoding + LM rescoring
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^

@@ -650,41 +659,43 @@ The command to run HLG decoding + LM rescoring is:

 .. code-block:: bash

    $ cd egs/librispeech/ASR
-   $ ./conformer_ctc/pretrained.py \
-     --checkpoint ./tmp/icefall_asr_librispeech_conformer_ctc/exp/pretrained.pt \
-     --words-file ./tmp/icefall_asr_librispeech_conformer_ctc/data/lang_bpe/words.txt \
-     --HLG ./tmp/icefall_asr_librispeech_conformer_ctc/data/lang_bpe/HLG.pt \
-     --method whole-lattice-rescoring \
-     --G ./tmp/icefall_asr_librispeech_conformer_ctc/data/lm/G_4_gram.pt \
-     --ngram-lm-scale 0.8 \
-     ./tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1089-134686-0001.flac \
-     ./tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1221-135766-0001.flac \
-     ./tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1221-135766-0002.flac
+   $ ./conformer_ctc/pretrained.py \
+     --checkpoint ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/exp/pretrained.pt \
+     --words-file ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lang_bpe_500/words.txt \
+     --method whole-lattice-rescoring \
+     --num-classes 500 \
+     --HLG ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lang_bpe_500/HLG.pt \
+     --G ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lm/G_4_gram.pt \
+     --ngram-lm-scale 1.0 \
+     ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1089-134686-0001.wav \
+     ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0001.wav \
+     ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0002.wav
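Before looking at the output, it may help to see what ``--ngram-lm-scale``
controls. Conceptually, whole-lattice rescoring replaces the lattice's LM
scores with scores from ``G_4_gram.pt`` and combines them with the acoustic
scores; the following is a hedged sketch of that combination (illustrative
names, not the exact icefall internals):

.. code-block:: python

   import torch

   def rescored_path_scores(
       am_scores: torch.Tensor,        # acoustic scores of candidate paths
       ngram_lm_scores: torch.Tensor,  # 4-gram LM scores of the same paths
       ngram_lm_scale: float = 1.0,    # the --ngram-lm-scale option
   ) -> torch.Tensor:
       """Total score used to rank paths after rescoring."""
       return am_scores + ngram_lm_scale * ngram_lm_scores

   scores = rescored_path_scores(
       torch.tensor([-12.3, -11.8]), torch.tensor([-35.1, -36.0])
   )
   print(int(scores.argmax()))  # index of the best path

Its output is: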
.. code-block::

- 2021-10-13 11:28:19,129 INFO [pretrained.py:236] device: cuda:0
- 2021-10-13 11:28:19,129 INFO [pretrained.py:238] Creating model
- 2021-10-13 11:28:23,531 INFO [pretrained.py:255] Constructing Fbank computer
- 2021-10-13 11:28:23,532 INFO [pretrained.py:265] Reading sound files: ['./tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1089-134686-0001.flac', './tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1221-135766-0001.flac', './tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1221-135766-0002.flac']
- 2021-10-13 11:28:23,544 INFO [pretrained.py:271] Decoding started
- 2021-10-13 11:28:24,141 INFO [pretrained.py:327] Loading HLG from ./tmp/icefall_asr_librispeech_conformer_ctc/data/lang_bpe/HLG.pt
- 2021-10-13 11:28:30,752 INFO [pretrained.py:338] Loading G from ./tmp/icefall_asr_librispeech_conformer_ctc/data/lm/G_4_gram.pt
- 2021-10-13 11:28:48,308 INFO [pretrained.py:364] Use HLG decoding + LM rescoring
- 2021-10-13 11:28:48,815 INFO [pretrained.py:400]
- ./tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1089-134686-0001.flac:
+ 2021-11-10 13:39:55,857 INFO [pretrained.py:260] {'sample_rate': 16000, 'subsampling_factor': 4, 'vgg_frontend': False, 'use_feat_batchnorm': True, 'feature_dim': 80, 'nhead': 8, 'attention_dim': 512, 'num_decoder_layers': 0, 'search_beam': 20, 'output_beam': 8, 'min_active_states': 30, 'max_active_states': 10000, 'use_double_scores': True, 'checkpoint': './icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/exp/pretrained.pt', 'words_file': './icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lang_bpe_500/words.txt', 'HLG': './icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lang_bpe_500/HLG.pt', 'bpe_model': None, 'method': 'whole-lattice-rescoring', 'G': './icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lm/G_4_gram.pt', 'num_paths': 100, 'ngram_lm_scale': 1.0, 'attention_decoder_scale': 1.2, 'nbest_scale': 0.5, 'sos_id': 1, 'num_classes': 500, 'eos_id': 1, 'sound_files': ['./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1089-134686-0001.wav', './icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0001.wav', './icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0002.wav'], 'env_info': {'k2-version': '1.9', 'k2-build-type': 'Release', 'k2-with-cuda': True, 'k2-git-sha1': '7178d67e594bc7fa89c2b331ad7bd1c62a6a9eb4', 'k2-git-date': 'Tue Oct 26 22:12:54 2021', 'lhotse-version': '0.11.0.dev+missing.version.file', 'torch-cuda-available': True, 'torch-cuda-version': '10.1', 'python-version': '3.8', 'icefall-git-branch': 'bpe-500', 'icefall-git-sha1': '8d93169-dirty', 'icefall-git-date': 'Wed Nov 10 11:52:44 2021', 'icefall-path': '/ceph-fj/fangjun/open-source-2/icefall-fix', 'k2-path': '/ceph-fj/fangjun/open-source-2/k2-bpe-500/k2/python/k2/__init__.py', 'lhotse-path': '/ceph-fj/fangjun/open-source-2/lhotse-bpe-500/lhotse/__init__.py'}}
+ 2021-11-10 13:39:55,858 INFO [pretrained.py:266] device: cuda:0
+ 2021-11-10 13:39:55,858 INFO [pretrained.py:268] Creating model
+ 2021-11-10 13:40:01,979 INFO [pretrained.py:285] Constructing Fbank computer
+ 2021-11-10 13:40:01,980 INFO [pretrained.py:295] Reading sound files: ['./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1089-134686-0001.wav', './icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0001.wav',
'./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0002.wav'] + 2021-11-10 13:40:02,055 INFO [pretrained.py:301] Decoding started + 2021-11-10 13:40:02,117 INFO [pretrained.py:352] Loading HLG from ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lang_bpe_500/HLG.pt + 2021-11-10 13:40:05,051 INFO [pretrained.py:363] Loading G from ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lm/G_4_gram.pt + 2021-11-10 13:40:18,959 INFO [pretrained.py:389] Use HLG decoding + LM rescoring + 2021-11-10 13:40:19,546 INFO [pretrained.py:425] + ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1089-134686-0001.wav: AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS - ./tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1221-135766-0001.flac: - GOD AS A DIRECT CONSEQUENCE OF THE SIN WHICH MAN THUS PUNISHED HAD GIVEN HER A LOVELY CHILD WHOSE PLACE WAS ON THAT SAME DISHONOURED - BOSOM TO CONNECT HER PARENT FOR EVER WITH THE RACE AND DESCENT OF MORTALS AND TO BE FINALLY A BLESSED SOUL IN HEAVEN + ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0001.wav: + GOD AS A DIRECT CONSEQUENCE OF THE SIN WHICH MAN THUS PUNISHED HAD GIVEN HER A LOVELY CHILD WHOSE PLACE WAS ON THAT SAME DISHONORED BOSOM TO CONNECT HER PARENT FOREVER WITH THE RACE AND DESCENT OF MORTALS AND TO BE FINALLY A BLESSED SOUL IN HEAVEN - ./tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1221-135766-0002.flac: + ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0002.wav: YET THESE THOUGHTS AFFECTED HESTER PRYNNE LESS WITH HOPE THAN APPREHENSION - 2021-10-13 11:28:48,815 INFO [pretrained.py:402] Decoding Done + 2021-11-10 13:40:19,546 INFO [pretrained.py:427] Decoding Done + HLG decoding + LM rescoring + attention decoder rescoring ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -699,45 +710,72 @@ The command to run HLG decoding + LM rescoring + attention decoder rescoring is: $ cd egs/librispeech/ASR $ ./conformer_ctc/pretrained.py \ - --checkpoint ./tmp/icefall_asr_librispeech_conformer_ctc/exp/pretrained.pt \ - --words-file ./tmp/icefall_asr_librispeech_conformer_ctc/data/lang_bpe/words.txt \ - --HLG ./tmp/icefall_asr_librispeech_conformer_ctc/data/lang_bpe/HLG.pt \ - --method attention-decoder \ - --G ./tmp/icefall_asr_librispeech_conformer_ctc/data/lm/G_4_gram.pt \ - --ngram-lm-scale 1.3 \ - --attention-decoder-scale 1.2 \ - --nbest-scale 0.5 \ - --num-paths 100 \ - --sos-id 1 \ - --eos-id 1 \ - ./tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1089-134686-0001.flac \ - ./tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1221-135766-0001.flac \ - ./tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1221-135766-0002.flac + --checkpoint ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/exp/pretrained.pt \ + --words-file ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lang_bpe_500/words.txt \ + --method attention-decoder \ + --num-classes 500 \ + --HLG ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lang_bpe_500/HLG.pt \ + --G ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lm/G_4_gram.pt \ + --ngram-lm-scale 2.0 \ + --attention-decoder-scale 2.0 \ + --nbest-scale 0.5 \ + --num-paths 100 \ + --sos-id 1 \ + --eos-id 1 \ + ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1089-134686-0001.wav \ + 
./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0001.wav \ + ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0002.wav The output is below: .. code-block:: - 2021-10-13 11:29:50,106 INFO [pretrained.py:236] device: cuda:0 - 2021-10-13 11:29:50,106 INFO [pretrained.py:238] Creating model - 2021-10-13 11:29:56,063 INFO [pretrained.py:255] Constructing Fbank computer - 2021-10-13 11:29:56,063 INFO [pretrained.py:265] Reading sound files: ['./tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1089-134686-0001.flac', './tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1221-135766-0001.flac', './tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1221-135766-0002.flac'] - 2021-10-13 11:29:56,077 INFO [pretrained.py:271] Decoding started - 2021-10-13 11:29:56,770 INFO [pretrained.py:327] Loading HLG from ./tmp/icefall_asr_librispeech_conformer_ctc/data/lang_bpe/HLG.pt - 2021-10-13 11:30:04,023 INFO [pretrained.py:338] Loading G from ./tmp/icefall_asr_librispeech_conformer_ctc/data/lm/G_4_gram.pt - 2021-10-13 11:30:18,163 INFO [pretrained.py:372] Use HLG + LM rescoring + attention decoder rescoring - 2021-10-13 11:30:19,367 INFO [pretrained.py:400] - ./tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1089-134686-0001.flac: + 2021-11-10 13:43:45,598 INFO [pretrained.py:260] {'sample_rate': 16000, 'subsampling_factor': 4, 'vgg_frontend': False, 'use_feat_batchnorm': True, 'feature_dim': 80, 'nhead': 8, 'attention_dim': 512, 'num_decoder_layers': 6, 'search_beam': 20, 'output_beam': 8, 'min_active_states': 30, 'max_active_states': 10000, 'use_double_scores': True, 'checkpoint': './icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/exp/pretrained.pt', 'words_file': './icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lang_bpe_500/words.txt', 'HLG': './icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lang_bpe_500/HLG.pt', 'bpe_model': None, 'method': 'attention-decoder', 'G': './icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lm/G_4_gram.pt', 'num_paths': 100, 'ngram_lm_scale': 2.0, 'attention_decoder_scale': 2.0, 'nbest_scale': 0.5, 'sos_id': 1, 'num_classes': 500, 'eos_id': 1, 'sound_files': ['./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1089-134686-0001.wav', './icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0001.wav', './icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0002.wav'], 'env_info': {'k2-version': '1.9', 'k2-build-type': 'Release', 'k2-with-cuda': True, 'k2-git-sha1': '7178d67e594bc7fa89c2b331ad7bd1c62a6a9eb4', 'k2-git-date': 'Tue Oct 26 22:12:54 2021', 'lhotse-version': '0.11.0.dev+missing.version.file', 'torch-cuda-available': True, 'torch-cuda-version': '10.1', 'python-version': '3.8', 'icefall-git-branch': 'bpe-500', 'icefall-git-sha1': '8d93169-dirty', 'icefall-git-date': 'Wed Nov 10 11:52:44 2021', 'icefall-path': '/ceph-fj/fangjun/open-source-2/icefall-fix', 'k2-path': '/ceph-fj/fangjun/open-source-2/k2-bpe-500/k2/python/k2/__init__.py', 'lhotse-path': '/ceph-fj/fangjun/open-source-2/lhotse-bpe-500/lhotse/__init__.py'}} + 2021-11-10 13:43:45,599 INFO [pretrained.py:266] device: cuda:0 + 2021-11-10 13:43:45,599 INFO [pretrained.py:268] Creating model + 2021-11-10 13:43:51,833 INFO [pretrained.py:285] Constructing Fbank computer + 2021-11-10 13:43:51,834 INFO [pretrained.py:295] Reading sound files: 
['./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1089-134686-0001.wav', './icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0001.wav', './icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0002.wav'] + 2021-11-10 13:43:51,915 INFO [pretrained.py:301] Decoding started + 2021-11-10 13:43:52,076 INFO [pretrained.py:352] Loading HLG from ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lang_bpe_500/HLG.pt + 2021-11-10 13:43:55,110 INFO [pretrained.py:363] Loading G from ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lm/G_4_gram.pt + 2021-11-10 13:44:09,329 INFO [pretrained.py:397] Use HLG + LM rescoring + attention decoder rescoring + 2021-11-10 13:44:10,192 INFO [pretrained.py:425] + ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1089-134686-0001.wav: AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS - ./tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1221-135766-0001.flac: - GOD AS A DIRECT CONSEQUENCE OF THE SIN WHICH MAN THUS PUNISHED HAD GIVEN HER A LOVELY CHILD WHOSE PLACE WAS ON THAT SAME DISHONOURED - BOSOM TO CONNECT HER PARENT FOR EVER WITH THE RACE AND DESCENT OF MORTALS AND TO BE FINALLY A BLESSED SOUL IN HEAVEN + ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0001.wav: + GOD AS A DIRECT CONSEQUENCE OF THE SIN WHICH MAN THUS PUNISHED HAD GIVEN HER A LOVELY CHILD WHOSE PLACE WAS ON THAT SAME DISHONORED BOSOM TO CONNECT HER PARENT FOREVER WITH THE RACE AND DESCENT OF MORTALS AND TO BE FINALLY A BLESSED SOUL IN HEAVEN - ./tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1221-135766-0002.flac: + ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0002.wav: YET THESE THOUGHTS AFFECTED HESTER PRYNNE LESS WITH HOPE THAN APPREHENSION - 2021-10-13 11:30:19,367 INFO [pretrained.py:402] Decoding Done + 2021-11-10 13:44:10,192 INFO [pretrained.py:427] Decoding Done + + +Compute WER with the pre-trained model +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +To check the WER of the pre-trained model on the test datasets, run: + +.. code-block:: bash + + $ cd egs/librispeech/ASR + $ cd icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/exp/ + $ ln -s pretrained.pt epoch-999.pt + $ cd ../.. + $ ./conformer_ctc/decode.py \ + --exp-dir ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/exp \ + --lang-dir ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lang_bpe_500 \ + --lm-dir ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lm \ + --epoch 999 \ + --avg 1 \ + --concatenate-cuts 0 \ + --bucketing-sampler 1 \ + --max-duration 30 \ + --num-paths 1000 \ + --method attention-decoder \ + --nbest-scale 0.5 + Colab notebook -------------- @@ -756,7 +794,7 @@ We do provide a colab notebook for this recipe showing how to use a pre-trained ``HLG decoding + LM rescoring + attention decoder rescoring``. Otherwise, you can only run ``HLG decoding`` with Colab. -**Congratulations!** You have finished the librispeech ASR recipe with +**Congratulations!** You have finished the LibriSpeech ASR recipe with conformer CTC models in ``icefall``. If you want to deploy your trained model in C++, please read the following section. 
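Before moving on to C++, you can sanity-check the exported torch-scripted
model from Python first; ``torch.jit.load`` is standard PyTorch, and the path
below matches the pre-trained repository downloaded above:

.. code-block:: python

   import torch

   # Load the torch-scripted conformer on CPU; no icefall code is needed.
   model = torch.jit.load(
       "icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/exp/cpu_jit.pt",
       map_location="cpu",
   )
   model.eval()
   print(type(model))  # a RecursiveScriptModule wrapping the model
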
@@ -764,34 +802,14 @@ If you want to deploy your trained model in C++, please read the following secti Deployment with C++ ------------------- -This section describes how to deploy your trained model in C++, without +This section describes how to deploy the pre-trained model in C++, without Python dependencies. -We assume you have run ``./prepare.sh`` and have the following directories available: +.. HINT:: -.. code-block:: bash + At present, it does NOT support streaming decoding. - data - |-- lang_bpe - -Also, we assume your checkpoints are saved in ``conformer_ctc/exp``. - -If you know that averaging 20 checkpoints starting from ``epoch-30.pt`` yields the -lowest WER, you can run the following commands - -.. code-block:: - - $ cd egs/librispeech/ASR - $ ./conformer_ctc/export.py \ - --epoch 30 \ - --avg 20 \ - --jit 1 \ - --lang-dir data/lang_bpe \ - --exp-dir conformer_ctc/exp - -to get a torch scripted model saved in ``conformer_ctc/exp/cpu_jit.pt``. - -Now you have all needed files ready. Let us compile k2 from source: +First, let us compile k2 from source: .. code-block:: bash @@ -809,67 +827,232 @@ Now you have all needed files ready. Let us compile k2 from source: $ mkdir build-release $ cd build-release $ cmake -DCMAKE_BUILD_TYPE=Release .. - $ make -j decode - # You will find an executable: `./bin/decode` + $ make -j ctc_decode hlg_decode ngram_lm_rescore attention_rescore + + # You will find four binaries in `./bin`, i.e., + # ./bin/ctc_decode, ./bin/hlg_decode, + # ./bin/ngram_lm_rescore, and ./bin/attention_rescore Now you are ready to go! -To view the usage of ``./bin/decode``, run: +Assume you have run: + + .. code-block:: bash + + $ cd k2/build-release + $ ln -s /path/to/icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09 ./ + +To view the usage of ``./bin/ctc_decode``, run: .. code-block:: - $ ./bin/decode + $ ./bin/ctc_decode It will show you the following message: -.. code-block:: +.. code-block:: bash - Please provide --jit_pt + Please provide --nn_model - (1) CTC decoding - ./bin/decode \ - --use_ctc_decoding true \ - --jit_pt \ - --bpe_model \ - /path/to/foo.wav \ - /path/to/bar.wav \ - - (2) HLG decoding - ./bin/decode \ - --use_ctc_decoding false \ - --jit_pt \ - --hlg \ - --word-table \ - /path/to/foo.wav \ - /path/to/bar.wav \ - + This file implements decoding with a CTC topology, without any + kinds of LM or lexicons. - --use_gpu false to use CPU - --use_gpu true to use GPU + Usage: + ./bin/ctc_decode \ + --use_gpu true \ + --nn_model \ + --bpe_model \ + \ + \ + + + To see all possible options, use + ./bin/ctc_decode --help + + Caution: + - Only sound files (*.wav) with single channel are supported. + - It assumes the model is conformer_ctc/transformer.py from icefall. + If you use a different model, you have to change the code + related to `model.forward` in this file. -``./bin/decode`` supports two types of decoding at present: CTC decoding and HLG decoding. CTC decoding ^^^^^^^^^^^^ -You need to provide: +.. code-block:: bash - - ``--jit_pt``, this is the file generated by ``conformer_ctc/export.py``. You can find it - in ``conformer_ctc/exp/cpu_jit.pt``. - - ``--bpe_model``, this is a sentence piece model generated by ``prepare.sh``. You can find - it in ``data/lang_bpe/bpe.model``. 
+ ./bin/ctc_decode \ + --use_gpu true \ + --nn_model ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/exp/cpu_jit.pt \ + --bpe_model ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lang_bpe_500/bpe.model \ + ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1089-134686-0001.wav \ + ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0001.wav \ + ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0002.wav +Its output is: + +.. code-block:: + + 2021-11-10 13:57:55.316 [I] k2/torch/bin/ctc_decode.cu:105:int main(int, char**) Use GPU + 2021-11-10 13:57:55.316 [I] k2/torch/bin/ctc_decode.cu:109:int main(int, char**) Device: cuda:0 + 2021-11-10 13:57:55.316 [I] k2/torch/bin/ctc_decode.cu:118:int main(int, char**) Load wave files + 2021-11-10 13:58:01.221 [I] k2/torch/bin/ctc_decode.cu:125:int main(int, char**) Build Fbank computer + 2021-11-10 13:58:01.222 [I] k2/torch/bin/ctc_decode.cu:136:int main(int, char**) Compute features + 2021-11-10 13:58:01.228 [I] k2/torch/bin/ctc_decode.cu:144:int main(int, char**) Load neural network model + 2021-11-10 13:58:02.19 [I] k2/torch/bin/ctc_decode.cu:159:int main(int, char**) Compute nnet_output + 2021-11-10 13:58:02.543 [I] k2/torch/bin/ctc_decode.cu:174:int main(int, char**) Build CTC topo + 2021-11-10 13:58:02.547 [I] k2/torch/bin/ctc_decode.cu:177:int main(int, char**) Decoding + 2021-11-10 13:58:02.708 [I] k2/torch/bin/ctc_decode.cu:207:int main(int, char**) + Decoding result: + + ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1089-134686-0001.wav + AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROFFELS + + ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0001.wav + GOD AS A DIRECT CONSEQUENCE OF THE SIN WHICH MAN THUS PUNISHED HAD GIVEN HER A LOVELY CHILD WHOSE PLACE WAS ON THAT SAME DISHONORED BOSOM TO CONNECT HER PARENT FOREVER WITH THE RACE AND DESCENT OF MORTALS AND TO BE FINALLY A BLESSED SOUL IN HEAVEN + + ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0002.wav + YET THESE THOUGHTS AFFECTED HESTER PRYNNE LESS WITH HOPE THAN APPREHENSION HLG decoding ^^^^^^^^^^^^ -You need to provide: +.. code-block:: bash - - ``--jit_pt``, this is the same file as in CTC decoding. - - ``--hlg``, this file is generated by ``prepare.sh``. You can find it in ``data/lang_bpe/HLG.pt``. - - ``--word-table``, this file is generated by ``prepare.sh``. You can find it in ``data/lang_bpe/words.txt``. + ./bin/hlg_decode \ + --use_gpu true \ + --nn_model ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/exp/cpu_jit.pt \ + --hlg ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lang_bpe_500/HLG.pt \ + --word_table ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lang_bpe_500/words.txt \ + ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1089-134686-0001.wav \ + ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0001.wav \ + ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0002.wav -We do provide a Colab notebook, showing you how to run a torch scripted model in C++. +The output is: + +.. 
code-block:: + + 2021-11-10 13:59:04.729 [I] k2/torch/bin/hlg_decode.cu:111:int main(int, char**) Use GPU + 2021-11-10 13:59:04.729 [I] k2/torch/bin/hlg_decode.cu:115:int main(int, char**) Device: cuda:0 + 2021-11-10 13:59:04.729 [I] k2/torch/bin/hlg_decode.cu:124:int main(int, char**) Load wave files + 2021-11-10 13:59:10.702 [I] k2/torch/bin/hlg_decode.cu:131:int main(int, char**) Build Fbank computer + 2021-11-10 13:59:10.703 [I] k2/torch/bin/hlg_decode.cu:142:int main(int, char**) Compute features + 2021-11-10 13:59:10.707 [I] k2/torch/bin/hlg_decode.cu:150:int main(int, char**) Load neural network model + 2021-11-10 13:59:11.545 [I] k2/torch/bin/hlg_decode.cu:165:int main(int, char**) Compute nnet_output + 2021-11-10 13:59:12.72 [I] k2/torch/bin/hlg_decode.cu:180:int main(int, char**) Load ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lang_bpe_500/HLG.pt + 2021-11-10 13:59:12.994 [I] k2/torch/bin/hlg_decode.cu:185:int main(int, char**) Decoding + 2021-11-10 13:59:13.268 [I] k2/torch/bin/hlg_decode.cu:216:int main(int, char**) + Decoding result: + + ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1089-134686-0001.wav + AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS + + ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0001.wav + GOD AS A DIRECT CONSEQUENCE OF THE SIN WHICH MAN THUS PUNISHED HAD GIVEN HER A LOVELY CHILD WHOSE PLACE WAS ON THAT SAME DISHONORED BOSOM TO CONNECT HER PARENT FOREVER WITH THE RACE AND DESCENT OF MORTALS AND TO BE FINALLY A BLESSED SOUL IN HEAVEN + + ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0002.wav + YET THESE THOUGHTS AFFECTED HESTER PRYNNE LESS WITH HOPE THAN APPREHENSION + + +HLG decoding + n-gram LM rescoring +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: bash + + ./bin/ngram_lm_rescore \ + --use_gpu true \ + --nn_model ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/exp/cpu_jit.pt \ + --hlg ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lang_bpe_500/HLG.pt \ + --g ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lm/G_4_gram.pt \ + --ngram_lm_scale 1.0 \ + --word_table ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lang_bpe_500/words.txt \ + ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1089-134686-0001.wav \ + ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0001.wav \ + ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0002.wav + +The output is: + +.. 
code-block:: + + 2021-11-10 14:00:55.279 [I] k2/torch/bin/ngram_lm_rescore.cu:122:int main(int, char**) Use GPU + 2021-11-10 14:00:55.280 [I] k2/torch/bin/ngram_lm_rescore.cu:126:int main(int, char**) Device: cuda:0 + 2021-11-10 14:00:55.280 [I] k2/torch/bin/ngram_lm_rescore.cu:135:int main(int, char**) Load wave files + 2021-11-10 14:01:01.214 [I] k2/torch/bin/ngram_lm_rescore.cu:142:int main(int, char**) Build Fbank computer + 2021-11-10 14:01:01.215 [I] k2/torch/bin/ngram_lm_rescore.cu:153:int main(int, char**) Compute features + 2021-11-10 14:01:01.219 [I] k2/torch/bin/ngram_lm_rescore.cu:161:int main(int, char**) Load neural network model + 2021-11-10 14:01:01.945 [I] k2/torch/bin/ngram_lm_rescore.cu:176:int main(int, char**) Compute nnet_output + 2021-11-10 14:01:02.475 [I] k2/torch/bin/ngram_lm_rescore.cu:191:int main(int, char**) Load ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lang_bpe_500/HLG.pt + 2021-11-10 14:01:03.398 [I] k2/torch/bin/ngram_lm_rescore.cu:199:int main(int, char**) Decoding + 2021-11-10 14:01:03.515 [I] k2/torch/bin/ngram_lm_rescore.cu:205:int main(int, char**) Load n-gram LM: ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lm/G_4_gram.pt + 2021-11-10 14:01:07.432 [W] k2/torch/csrc/deserialization.cu:441:k2::FsaClass k2::LoadFsa(const string&, c10::optional) + Ignore non tensor attribute: 'dummy' of type: Int + 2021-11-10 14:01:07.589 [I] k2/torch/bin/ngram_lm_rescore.cu:214:int main(int, char**) Rescore with an n-gram LM + 2021-11-10 14:01:08.68 [I] k2/torch/bin/ngram_lm_rescore.cu:242:int main(int, char**) + Decoding result: + + ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1089-134686-0001.wav + AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS + + ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0001.wav + GOD AS A DIRECT CONSEQUENCE OF THE SIN WHICH MAN THUS PUNISHED HAD GIVEN HER A LOVELY CHILD WHOSE PLACE WAS ON THAT SAME DISHONORED BOSOM TO CONNECT HER PARENT FOREVER WITH THE RACE AND DESCENT OF MORTALS AND TO BE FINALLY A BLESSED SOUL IN HEAVEN + + ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0002.wav + YET THESE THOUGHTS AFFECTED HESTER PRYNNE LESS WITH HOPE THAN APPREHENSION + + +HLG decoding + n-gram LM rescoring + attention decoder rescoring +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: bash + + ./bin/attention_rescore \ + --use_gpu true \ + --nn_model ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/exp/cpu_jit.pt \ + --hlg ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lang_bpe_500/HLG.pt \ + --g ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lm/G_4_gram.pt \ + --ngram_lm_scale 2.0 \ + --attention_scale 2.0 \ + --num_paths 100 \ + --nbest_scale 0.5 \ + --word_table ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lang_bpe_500/words.txt \ + --sos_id 1 \ + --eos_id 1 \ + ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1089-134686-0001.wav \ + ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0001.wav \ + ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0002.wav + +The output is: + +.. 
code-block::

 2021-11-10 14:02:43.656 [I] k2/torch/bin/attention_rescore.cu:149:int main(int, char**) Use GPU
 2021-11-10 14:02:43.656 [I] k2/torch/bin/attention_rescore.cu:153:int main(int, char**) Device: cuda:0
 2021-11-10 14:02:43.656 [I] k2/torch/bin/attention_rescore.cu:162:int main(int, char**) Load wave files
 2021-11-10 14:02:49.216 [I] k2/torch/bin/attention_rescore.cu:169:int main(int, char**) Build Fbank computer
 2021-11-10 14:02:49.217 [I] k2/torch/bin/attention_rescore.cu:180:int main(int, char**) Compute features
 2021-11-10 14:02:49.222 [I] k2/torch/bin/attention_rescore.cu:188:int main(int, char**) Load neural network model
 2021-11-10 14:02:49.984 [I] k2/torch/bin/attention_rescore.cu:203:int main(int, char**) Compute nnet_output
 2021-11-10 14:02:50.624 [I] k2/torch/bin/attention_rescore.cu:220:int main(int, char**) Load ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lang_bpe_500/HLG.pt
 2021-11-10 14:02:51.519 [I] k2/torch/bin/attention_rescore.cu:228:int main(int, char**) Decoding
 2021-11-10 14:02:51.632 [I] k2/torch/bin/attention_rescore.cu:234:int main(int, char**) Load n-gram LM: ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lm/G_4_gram.pt
 2021-11-10 14:02:55.537 [W] k2/torch/csrc/deserialization.cu:441:k2::FsaClass k2::LoadFsa(const string&, c10::optional) Ignore non tensor attribute: 'dummy' of type: Int
 2021-11-10 14:02:55.645 [I] k2/torch/bin/attention_rescore.cu:243:int main(int, char**) Rescore with an n-gram LM
 2021-11-10 14:02:55.970 [I] k2/torch/bin/attention_rescore.cu:246:int main(int, char**) Sample 100 paths
 2021-11-10 14:02:56.215 [I] k2/torch/bin/attention_rescore.cu:293:int main(int, char**) Run attention decoder
 2021-11-10 14:02:57.35 [I] k2/torch/bin/attention_rescore.cu:303:int main(int, char**) Rescoring
 2021-11-10 14:02:57.179 [I] k2/torch/bin/attention_rescore.cu:369:int main(int, char**)
 Decoding result:

 ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1089-134686-0001.wav
 AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS

 ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0001.wav
 GOD AS A DIRECT CONSEQUENCE OF THE SIN WHICH MAN THUS PUNISHED HAD GIVEN HER A LOVELY CHILD WHOSE PLACE WAS ON THAT SAME DISHONORED BOSOM TO CONNECT HER PARENT FOREVER WITH THE RACE AND DESCENT OF MORTALS AND TO BE FINALLY A BLESSED SOUL IN HEAVEN

 ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0002.wav
 YET THESE THOUGHTS AFFECTED HESTER PRYNNE LESS WITH HOPE THAN APPREHENSION
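As a closing note on the two scales used above (``--ngram_lm_scale 2.0`` and
``--attention_scale 2.0``): during n-best rescoring, each sampled path gets an
acoustic score, an n-gram LM score, and an attention-decoder score, and the
best path maximizes their weighted sum. A hedged sketch follows (illustrative
names, not the exact k2/icefall internals):

.. code-block:: python

   import torch

   def best_path_index(
       am_scores: torch.Tensor,         # acoustic scores of sampled paths
       ngram_lm_scores: torch.Tensor,   # n-gram LM scores of the paths
       attention_scores: torch.Tensor,  # attention-decoder scores
       ngram_lm_scale: float = 2.0,
       attention_scale: float = 2.0,
   ) -> int:
       """Index of the best path under the combined score."""
       tot_scores = (
           am_scores
           + ngram_lm_scale * ngram_lm_scores
           + attention_scale * attention_scores
       )
       return int(tot_scores.argmax())

There is a Colab notebook showing you how to run a torch-scripted model in C++.
Please see |librispeech asr conformer ctc torch script colab notebook|
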
.. |librispeech asr conformer ctc torch script colab notebook| image:: https://colab.research.google.com/assets/colab-badge.svg

diff --git a/egs/librispeech/ASR/RESULTS.md b/egs/librispeech/ASR/RESULTS.md
index eb679b951..8d7c867c0 100644
--- a/egs/librispeech/ASR/RESULTS.md
+++ b/egs/librispeech/ASR/RESULTS.md
@@ -1,6 +1,65 @@
 ## Results

 ### LibriSpeech BPE training results (Conformer-CTC)
+
+#### 2021-11-09
+
+The best WER, as of 2021-11-09, for the LibriSpeech test datasets is below
+(using HLG decoding + n-gram LM rescoring + attention decoder rescoring):
+
+| | test-clean | test-other |
+|-----|------------|------------|
+| WER | 2.42% | 5.73% |
+
+Scale values used in n-gram LM rescoring and attention rescoring for the best WERs are:
+
+| ngram_lm_scale | attention_scale |
+|----------------|-----------------|
+| 2.0 | 2.0 |
+
+
+To reproduce the above result, use the following commands for training:
+
+```
+cd egs/librispeech/ASR
+./prepare.sh
+export CUDA_VISIBLE_DEVICES="0,1,2,3"
+./conformer_ctc/train.py \
+  --exp-dir conformer_ctc/exp_500_att0.8 \
+  --lang-dir data/lang_bpe_500 \
+  --att-rate 0.8 \
+  --full-libri 1 \
+  --max-duration 200 \
+  --concatenate-cuts 0 \
+  --world-size 4 \
+  --bucketing-sampler 1 \
+  --start-epoch 0 \
+  --num-epochs 90
+# Note: It trains for 90 epochs, but the best WER is at epoch-77.pt
+```
+
+and the following command for decoding:
+
+```
+./conformer_ctc/decode.py \
+  --exp-dir conformer_ctc/exp_500_att0.8 \
+  --lang-dir data/lang_bpe_500 \
+  --max-duration 30 \
+  --concatenate-cuts 0 \
+  --bucketing-sampler 1 \
+  --num-paths 1000 \
+  --epoch 77 \
+  --avg 55 \
+  --method attention-decoder \
+  --nbest-scale 0.5
+```
+
+You can find the pre-trained model by visiting
+<https://huggingface.co/csukuangfj/icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09>
+
+The tensorboard log for training is available at
+
+
+
 #### 2021-08-19
 (Wei Kang): Result of https://github.com/k2-fsa/icefall/pull/13

diff --git a/egs/librispeech/ASR/conformer_ctc/ali.py b/egs/librispeech/ASR/conformer_ctc/ali.py
index 3d817a8f6..ad72a88e7 100755
--- a/egs/librispeech/ASR/conformer_ctc/ali.py
+++ b/egs/librispeech/ASR/conformer_ctc/ali.py
@@ -63,7 +63,7 @@ def get_parser():
     parser.add_argument(
         "--lang-dir",
         type=str,
-        default="data/lang_bpe_5000",
+        default="data/lang_bpe_500",
         help="The lang dir",
     )

diff --git a/egs/librispeech/ASR/conformer_ctc/decode.py b/egs/librispeech/ASR/conformer_ctc/decode.py
index c5ae3ad7d..e687c5bc4 100755
--- a/egs/librispeech/ASR/conformer_ctc/decode.py
+++ b/egs/librispeech/ASR/conformer_ctc/decode.py
@@ -143,17 +143,25 @@ def get_parser():
     parser.add_argument(
         "--lang-dir",
         type=str,
-        default="data/lang_bpe_5000",
+        default="data/lang_bpe_500",
         help="The lang dir",
     )

+    parser.add_argument(
+        "--lm-dir",
+        type=str,
+        default="data/lm",
+        help="""The LM dir.
+ It should contain either G_4_gram.pt or G_4_gram.fst.txt + """, + ) + return parser def get_params() -> AttributeDict: params = AttributeDict( { - "lm_dir": Path("data/lm"), # parameters for conformer "subsampling_factor": 4, "vgg_frontend": False, @@ -532,6 +540,7 @@ def main(): args = parser.parse_args() args.exp_dir = Path(args.exp_dir) args.lang_dir = Path(args.lang_dir) + args.lm_dir = Path(args.lm_dir) params = get_params() params.update(vars(args)) @@ -572,9 +581,8 @@ def main(): H = None bpe_model = None HLG = k2.Fsa.from_dict( - torch.load(f"{params.lang_dir}/HLG.pt", map_location="cpu") + torch.load(f"{params.lang_dir}/HLG.pt", map_location=device) ) - HLG = HLG.to(device) assert HLG.requires_grad is False if not hasattr(HLG, "lm_scores"): @@ -601,11 +609,16 @@ def main(): G.labels[G.labels >= first_word_disambig_id] = 0 G = k2.Fsa.from_fsas([G]).to(device) G = k2.arc_sort(G) + # Save a dummy value so that it can be loaded in C++. + # See https://github.com/pytorch/pytorch/issues/67902 + # for why we need to do this. + G["dummy"] = 1 + torch.save(G.as_dict(), params.lm_dir / "G_4_gram.pt") else: logging.info("Loading pre-compiled G_4_gram.pt") - d = torch.load(params.lm_dir / "G_4_gram.pt", map_location="cpu") - G = k2.Fsa.from_dict(d).to(device) + d = torch.load(params.lm_dir / "G_4_gram.pt", map_location=device) + G = k2.Fsa.from_dict(d) if params.method in ["whole-lattice-rescoring", "attention-decoder"]: # Add epsilon self-loops to G as we will compose diff --git a/egs/librispeech/ASR/conformer_ctc/export.py b/egs/librispeech/ASR/conformer_ctc/export.py index 79e026dac..28c28df01 100755 --- a/egs/librispeech/ASR/conformer_ctc/export.py +++ b/egs/librispeech/ASR/conformer_ctc/export.py @@ -65,7 +65,7 @@ def get_parser(): parser.add_argument( "--lang-dir", type=str, - default="data/lang_bpe_5000", + default="data/lang_bpe_500", help="""It contains language related input files such as "lexicon.txt" """, ) diff --git a/egs/librispeech/ASR/conformer_ctc/pretrained.py b/egs/librispeech/ASR/conformer_ctc/pretrained.py index beed6f73b..95589b82b 100755 --- a/egs/librispeech/ASR/conformer_ctc/pretrained.py +++ b/egs/librispeech/ASR/conformer_ctc/pretrained.py @@ -169,7 +169,7 @@ def get_parser(): parser.add_argument( "--num-classes", type=int, - default=5000, + default=500, help=""" Vocab size in the BPE model. """, diff --git a/egs/librispeech/ASR/conformer_ctc/train.py b/egs/librispeech/ASR/conformer_ctc/train.py index 1384204dd..46ea5c60c 100755 --- a/egs/librispeech/ASR/conformer_ctc/train.py +++ b/egs/librispeech/ASR/conformer_ctc/train.py @@ -81,7 +81,7 @@ def get_parser(): parser.add_argument( "--num-epochs", type=int, - default=35, + default=78, help="Number of epochs to train.", ) @@ -108,13 +108,22 @@ def get_parser(): parser.add_argument( "--lang-dir", type=str, - default="data/lang_bpe_5000", + default="data/lang_bpe_500", help="""The lang dir It contains language related input files such as "lexicon.txt" """, ) + parser.add_argument( + "--att-rate", + type=float, + default=0.8, + help="""The attention rate. 
+ The total loss is (1 - att_rate) * ctc_loss + att_rate * att_loss + """, + ) + return parser @@ -198,7 +207,6 @@ def get_params() -> AttributeDict: "beam_size": 10, "reduction": "sum", "use_double_scores": True, - "att_rate": 0.7, # parameters for Noam "weight_decay": 1e-6, "lr_factor": 5.0, diff --git a/egs/librispeech/ASR/conformer_ctc/transformer.py b/egs/librispeech/ASR/conformer_ctc/transformer.py index 3e6abb695..c9666362f 100644 --- a/egs/librispeech/ASR/conformer_ctc/transformer.py +++ b/egs/librispeech/ASR/conformer_ctc/transformer.py @@ -311,7 +311,7 @@ class Transformer(nn.Module): self, memory: torch.Tensor, memory_key_padding_mask: torch.Tensor, - token_ids: List[List[int]], + token_ids: List[torch.Tensor], sos_id: int, eos_id: int, ) -> torch.Tensor: @@ -334,6 +334,11 @@ class Transformer(nn.Module): """ # The common part between this function and decoder_forward could be # extracted as a separate function. + if isinstance(token_ids[0], torch.Tensor): + # This branch is executed by torchscript in C++. + # See https://github.com/k2-fsa/k2/pull/870 + # https://github.com/k2-fsa/k2/blob/3c1c18400060415b141ccea0115fd4bf0ad6234e/k2/torch/bin/attention_rescore.cu#L286 + token_ids = [tolist(t) for t in token_ids] ys_in = add_sos(token_ids, sos_id=sos_id) ys_in = [torch.tensor(y) for y in ys_in] @@ -660,7 +665,7 @@ class PositionalEncoding(nn.Module): self.xscale = math.sqrt(self.d_model) self.dropout = nn.Dropout(p=dropout) # not doing: self.pe = None because of errors thrown by torchscript - self.pe = torch.zeros(0, 0, dtype=torch.float32) + self.pe = torch.zeros(1, 0, self.d_model, dtype=torch.float32) def extend_pe(self, x: torch.Tensor) -> None: """Extend the time t in the positional encoding if required. @@ -1000,3 +1005,8 @@ def add_eos(token_ids: List[List[int]], eos_id: int) -> List[List[int]]: with EOS ID. """ return [utt + [eos_id] for utt in token_ids] + + +def tolist(t: torch.Tensor) -> List[int]: + """Used by jit""" + return torch.jit.annotate(List[int], t.tolist()) diff --git a/icefall/decode.py b/icefall/decode.py index 8b7bdd27f..98f792783 100644 --- a/icefall/decode.py +++ b/icefall/decode.py @@ -364,23 +364,13 @@ class Nbest(object): Return a ragged tensor with 2 axes [utt][path_scores]. Its dtype is torch.float64. """ - # Caution: We need a clone here. `self.fsa.scores` is a - # reference to a tensor representing the last field of an arc - # in the FSA (Remeber that an arc has four fields.) If we later assign - # `self.fsa.scores`, it will also change the scores on every arc, which - # means saved_scores will also be changed if we don't use `clone()` - # here. - saved_scores = self.fsa.scores.clone() + scores_shape = self.fsa.arcs.shape().remove_axis(1) + # scores_shape has axes [path][arc] + am_scores = self.fsa.scores - self.fsa.lm_scores + ragged_am_scores = k2.RaggedTensor(scores_shape, am_scores.contiguous()) + tot_scores = ragged_am_scores.sum() - # The `scores` of every arc consists of `am_scores` and `lm_scores` - self.fsa.scores = self.fsa.scores - self.fsa.lm_scores - - am_scores = self.fsa.get_tot_scores( - use_double_scores=True, log_semiring=False - ) - self.fsa.scores = saved_scores - - return k2.RaggedTensor(self.shape, am_scores) + return k2.RaggedTensor(self.shape, tot_scores) def compute_lm_scores(self) -> k2.RaggedTensor: """Compute LM scores of each linear FSA (i.e., each path within @@ -397,17 +387,16 @@ class Nbest(object): Return a ragged tensor with 2 axes [utt][path_scores]. Its dtype is torch.float64. 
""" - saved_scores = self.fsa.scores.clone() + scores_shape = self.fsa.arcs.shape().remove_axis(1) + # scores_shape has axes [path][arc] - # The `scores` of every arc consists of `am_scores` and `lm_scores` - self.fsa.scores = self.fsa.lm_scores.clone() - - lm_scores = self.fsa.get_tot_scores( - use_double_scores=True, log_semiring=False + ragged_lm_scores = k2.RaggedTensor( + scores_shape, self.fsa.lm_scores.contiguous() ) - self.fsa.scores = saved_scores - return k2.RaggedTensor(self.shape, lm_scores) + tot_scores = ragged_lm_scores.sum() + + return k2.RaggedTensor(self.shape, tot_scores) def tot_scores(self) -> k2.RaggedTensor: """Get total scores of FSAs in this Nbest. @@ -420,10 +409,14 @@ class Nbest(object): Return a ragged tensor with two axes [utt][path_scores]. Its dtype is torch.float64. """ - scores = self.fsa.get_tot_scores( - use_double_scores=True, log_semiring=False - ) - return k2.RaggedTensor(self.shape, scores) + scores_shape = self.fsa.arcs.shape().remove_axis(1) + # scores_shape has axes [path][arc] + + ragged_scores = k2.RaggedTensor(scores_shape, self.scores.contiguous()) + + tot_scores = ragged_scores.sum() + + return k2.RaggedTensor(self.shape, tot_scores) def build_levenshtein_graphs(self) -> k2.Fsa: """Return an FsaVec with axes [utt][state][arc]."""