diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 000000000..3abb38f8b --- /dev/null +++ b/docs/README.md @@ -0,0 +1,24 @@ + +## Usage + +```bash +cd /path/to/icefall/docs +pip install -r requirements.txt +make clean +make html +cd build/html +python3 -m http.server 8000 +``` + +It prints: + +``` +Serving HTTP on 0.0.0.0 port 8000 (http://0.0.0.0:8000/) ... +``` + +Open your browser and go to to view the generated +documentation. + +Done! + +**Hint**: You can change the port number when starting the server. diff --git a/docs/source/conf.py b/docs/source/conf.py index 221d9d734..ef9fe1445 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -78,3 +78,12 @@ html_context = { } todo_include_todos = True + +rst_epilog = """ +.. _sherpa-ncnn: https://github.com/k2-fsa/sherpa-ncnn +.. _icefall: https://github.com/k2-fsa/icefall +.. _git-lfs: https://git-lfs.com/ +.. _ncnn: https://github.com/tencent/ncnn +.. _LibriSpeech: https://www.openslr.org/12 +.. _musan: http://www.openslr.org/17/ +""" diff --git a/docs/source/faqs.rst b/docs/source/faqs.rst new file mode 100644 index 000000000..72b0302d7 --- /dev/null +++ b/docs/source/faqs.rst @@ -0,0 +1,107 @@ +Frequently Asked Questions (FAQs) +================================= + +In this section, we collect issues reported by users and post the corresponding +solutions. + + +OSError: libtorch_hip.so: cannot open shared object file: no such file or directory +----------------------------------------------------------------------------------- + +One user is using the following code to install ``torch`` and ``torchaudio``: + +.. code-block:: bash + + pip install \ + torch==1.10.0+cu111 \ + torchvision==0.11.0+cu111 \ + torchaudio==0.10.0 \ + -f https://download.pytorch.org/whl/torch_stable.html + +and it throws the following error when running ``tdnn/train.py``: + +.. code-block:: + + OSError: libtorch_hip.so: cannot open shared object file: no such file or directory + +The fix is to specify the CUDA version while installing ``torchaudio``. That +is, change ``torchaudio==0.10.0`` to ``torchaudio==0.10.0+cu11```. Therefore, +the correct command is: + +.. code-block:: bash + + pip install \ + torch==1.10.0+cu111 \ + torchvision==0.11.0+cu111 \ + torchaudio==0.10.0+cu111 \ + -f https://download.pytorch.org/whl/torch_stable.html + +AttributeError: module 'distutils' has no attribute 'version' +------------------------------------------------------------- + +The error log is: + +.. code-block:: + + Traceback (most recent call last): + File "./tdnn/train.py", line 14, in + from asr_datamodule import YesNoAsrDataModule + File "/home/xxx/code/next-gen-kaldi/icefall/egs/yesno/ASR/tdnn/asr_datamodule.py", line 34, in + from icefall.dataset.datamodule import DataModule + File "/home/xxx/code/next-gen-kaldi/icefall/icefall/__init__.py", line 3, in + from . import ( + File "/home/xxx/code/next-gen-kaldi/icefall/icefall/decode.py", line 23, in + from icefall.utils import add_eos, add_sos, get_texts + File "/home/xxx/code/next-gen-kaldi/icefall/icefall/utils.py", line 39, in + from torch.utils.tensorboard import SummaryWriter + File "/home/xxx/tool/miniconda3/envs/yyy/lib/python3.8/site-packages/torch/utils/tensorboard/__init__.py", line 4, in + LooseVersion = distutils.version.LooseVersion + AttributeError: module 'distutils' has no attribute 'version' + +The fix is: + +.. code-block:: bash + + pip uninstall setuptools + + pip install setuptools==58.0.4 + +ImportError: libpython3.10.so.1.0: cannot open shared object file: No such file or directory +-------------------------------------------------------------------------------------------- + +If you are using ``conda`` and encounter the following issue: + +.. code-block:: + + Traceback (most recent call last): + File "/k2-dev/yangyifan/anaconda3/envs/icefall/lib/python3.10/site-packages/k2-1.23.3.dev20230112+cuda11.6.torch1.13.1-py3.10-linux-x86_64.egg/k2/__init__.py", line 24, in + from _k2 import DeterminizeWeightPushingType + ImportError: libpython3.10.so.1.0: cannot open shared object file: No such file or directory + + During handling of the above exception, another exception occurred: + + Traceback (most recent call last): + File "/k2-dev/yangyifan/icefall/egs/librispeech/ASR/./pruned_transducer_stateless7_ctc_bs/decode.py", line 104, in + import k2 + File "/k2-dev/yangyifan/anaconda3/envs/icefall/lib/python3.10/site-packages/k2-1.23.3.dev20230112+cuda11.6.torch1.13.1-py3.10-linux-x86_64.egg/k2/__init__.py", line 30, in + raise ImportError( + ImportError: libpython3.10.so.1.0: cannot open shared object file: No such file or directory + Note: If you're using anaconda and importing k2 on MacOS, + you can probably fix this by setting the environment variable: + export DYLD_LIBRARY_PATH=$CONDA_PREFIX/lib/python3.10/site-packages:$DYLD_LIBRARY_PATH + +Please first try to find where ``libpython3.10.so.1.0`` locates. + +For instance, + +.. code-block:: bash + + cd $CONDA_PREFIX/lib + find . -name "libpython*" + +If you are able to find it inside ``$CODNA_PREFIX/lib``, please set the +following environment variable: + +.. code-block:: bash + + export LD_LIBRARY_PATH=$CONDA_PREFIX/lib:$LD_LIBRARY_PATH diff --git a/docs/source/index.rst b/docs/source/index.rst index 4ea446259..8d76eb68b 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -21,6 +21,7 @@ speech recognition recipes using `k2 `_. :caption: Contents: installation/index + faqs model-export/index .. toctree:: diff --git a/docs/source/model-export/code/export-conv-emformer-transducer-for-ncnn-output.txt b/docs/source/model-export/code/export-conv-emformer-transducer-for-ncnn-output.txt new file mode 100644 index 000000000..ecbdd4b31 --- /dev/null +++ b/docs/source/model-export/code/export-conv-emformer-transducer-for-ncnn-output.txt @@ -0,0 +1,21 @@ +2023-01-11 12:15:38,677 INFO [export-for-ncnn.py:220] device: cpu +2023-01-11 12:15:38,681 INFO [export-for-ncnn.py:229] {'best_train_loss': inf, 'best_valid_loss': inf, 'best_train_epoch': -1, 'best_v +alid_epoch': -1, 'batch_idx_train': 0, 'log_interval': 50, 'reset_interval': 200, 'valid_interval': 3000, 'feature_dim': 80, 'subsampl +ing_factor': 4, 'decoder_dim': 512, 'joiner_dim': 512, 'model_warm_step': 3000, 'env_info': {'k2-version': '1.23.2', 'k2-build-type': +'Release', 'k2-with-cuda': True, 'k2-git-sha1': 'a34171ed85605b0926eebbd0463d059431f4f74a', 'k2-git-date': 'Wed Dec 14 00:06:38 2022', + 'lhotse-version': '1.12.0.dev+missing.version.file', 'torch-version': '1.10.0+cu102', 'torch-cuda-available': False, 'torch-cuda-vers +ion': '10.2', 'python-version': '3.8', 'icefall-git-branch': 'fix-stateless3-train-2022-12-27', 'icefall-git-sha1': '530e8a1-dirty', ' +icefall-git-date': 'Tue Dec 27 13:59:18 2022', 'icefall-path': '/star-fj/fangjun/open-source/icefall', 'k2-path': '/star-fj/fangjun/op +en-source/k2/k2/python/k2/__init__.py', 'lhotse-path': '/star-fj/fangjun/open-source/lhotse/lhotse/__init__.py', 'hostname': 'de-74279 +-k2-train-3-1220120619-7695ff496b-s9n4w', 'IP address': '127.0.0.1'}, 'epoch': 30, 'iter': 0, 'avg': 1, 'exp_dir': PosixPath('icefa +ll-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp'), 'bpe_model': './icefall-asr-librispeech-conv-emformer-transdu +cer-stateless2-2022-07-05//data/lang_bpe_500/bpe.model', 'jit': False, 'context_size': 2, 'use_averaged_model': False, 'encoder_dim': +512, 'nhead': 8, 'dim_feedforward': 2048, 'num_encoder_layers': 12, 'cnn_module_kernel': 31, 'left_context_length': 32, 'chunk_length' +: 32, 'right_context_length': 8, 'memory_size': 32, 'blank_id': 0, 'vocab_size': 500} +2023-01-11 12:15:38,681 INFO [export-for-ncnn.py:231] About to create model +2023-01-11 12:15:40,053 INFO [checkpoint.py:112] Loading checkpoint from icefall-asr-librispeech-conv-emformer-transducer-stateless2-2 +022-07-05/exp/epoch-30.pt +2023-01-11 12:15:40,708 INFO [export-for-ncnn.py:315] Number of model parameters: 75490012 +2023-01-11 12:15:41,681 INFO [export-for-ncnn.py:318] Using torch.jit.trace() +2023-01-11 12:15:41,681 INFO [export-for-ncnn.py:320] Exporting encoder +2023-01-11 12:15:41,682 INFO [export-for-ncnn.py:149] chunk_length: 32, right_context_length: 8 diff --git a/docs/source/model-export/code/generate-int-8-scale-table-for-conv-emformer.txt b/docs/source/model-export/code/generate-int-8-scale-table-for-conv-emformer.txt new file mode 100644 index 000000000..347e7e51a --- /dev/null +++ b/docs/source/model-export/code/generate-int-8-scale-table-for-conv-emformer.txt @@ -0,0 +1,104 @@ +Don't Use GPU. has_gpu: 0, config.use_vulkan_compute: 1 +num encoder conv layers: 88 +num joiner conv layers: 3 +num files: 3 +Processing ../test_wavs/1089-134686-0001.wav +Processing ../test_wavs/1221-135766-0001.wav +Processing ../test_wavs/1221-135766-0002.wav +Processing ../test_wavs/1089-134686-0001.wav +Processing ../test_wavs/1221-135766-0001.wav +Processing ../test_wavs/1221-135766-0002.wav +----------encoder---------- +conv_87 : max = 15.942385 threshold = 15.938493 scale = 7.968131 +conv_88 : max = 35.442448 threshold = 15.549335 scale = 8.167552 +conv_89 : max = 23.228289 threshold = 8.001738 scale = 15.871552 +linear_90 : max = 3.976146 threshold = 1.101789 scale = 115.267128 +linear_91 : max = 6.962030 threshold = 5.162033 scale = 24.602713 +linear_92 : max = 12.323041 threshold = 3.853959 scale = 32.953129 +linear_94 : max = 6.905416 threshold = 4.648006 scale = 27.323545 +linear_93 : max = 6.905416 threshold = 5.474093 scale = 23.200188 +linear_95 : max = 1.888012 threshold = 1.403563 scale = 90.483986 +linear_96 : max = 6.856741 threshold = 5.398679 scale = 23.524273 +linear_97 : max = 9.635942 threshold = 2.613655 scale = 48.590950 +linear_98 : max = 6.460340 threshold = 5.670146 scale = 22.398010 +linear_99 : max = 9.532276 threshold = 2.585537 scale = 49.119396 +linear_101 : max = 6.585871 threshold = 5.719224 scale = 22.205809 +linear_100 : max = 6.585871 threshold = 5.751382 scale = 22.081648 +linear_102 : max = 1.593344 threshold = 1.450581 scale = 87.551147 +linear_103 : max = 6.592681 threshold = 5.705824 scale = 22.257959 +linear_104 : max = 8.752957 threshold = 1.980955 scale = 64.110489 +linear_105 : max = 6.696240 threshold = 5.877193 scale = 21.608953 +linear_106 : max = 9.059659 threshold = 2.643138 scale = 48.048950 +linear_108 : max = 6.975461 threshold = 4.589567 scale = 27.671457 +linear_107 : max = 6.975461 threshold = 6.190381 scale = 20.515701 +linear_109 : max = 3.710759 threshold = 2.305635 scale = 55.082436 +linear_110 : max = 7.531228 threshold = 5.731162 scale = 22.159557 +linear_111 : max = 10.528083 threshold = 2.259322 scale = 56.211544 +linear_112 : max = 8.148807 threshold = 5.500842 scale = 23.087374 +linear_113 : max = 8.592566 threshold = 1.948851 scale = 65.166611 +linear_115 : max = 8.437109 threshold = 5.608947 scale = 22.642395 +linear_114 : max = 8.437109 threshold = 6.193942 scale = 20.503904 +linear_116 : max = 3.966980 threshold = 3.200896 scale = 39.676392 +linear_117 : max = 9.451303 threshold = 6.061664 scale = 20.951344 +linear_118 : max = 12.077262 threshold = 3.965800 scale = 32.023804 +linear_119 : max = 9.671615 threshold = 4.847613 scale = 26.198460 +linear_120 : max = 8.625638 threshold = 3.131427 scale = 40.556595 +linear_122 : max = 10.274080 threshold = 4.888716 scale = 25.978189 +linear_121 : max = 10.274080 threshold = 5.420480 scale = 23.429659 +linear_123 : max = 4.826197 threshold = 3.599617 scale = 35.281532 +linear_124 : max = 11.396383 threshold = 7.325849 scale = 17.335875 +linear_125 : max = 9.337198 threshold = 3.941410 scale = 32.221970 +linear_126 : max = 9.699965 threshold = 4.842878 scale = 26.224073 +linear_127 : max = 8.775370 threshold = 3.884215 scale = 32.696438 +linear_129 : max = 9.872276 threshold = 4.837319 scale = 26.254213 +linear_128 : max = 9.872276 threshold = 7.180057 scale = 17.687883 +linear_130 : max = 4.150427 threshold = 3.454298 scale = 36.765789 +linear_131 : max = 11.112692 threshold = 7.924847 scale = 16.025545 +linear_132 : max = 11.852893 threshold = 3.116593 scale = 40.749626 +linear_133 : max = 11.517084 threshold = 5.024665 scale = 25.275314 +linear_134 : max = 10.683807 threshold = 3.878618 scale = 32.743618 +linear_136 : max = 12.421055 threshold = 6.322729 scale = 20.086264 +linear_135 : max = 12.421055 threshold = 5.309880 scale = 23.917679 +linear_137 : max = 4.827781 threshold = 3.744595 scale = 33.915554 +linear_138 : max = 14.422395 threshold = 7.742882 scale = 16.402161 +linear_139 : max = 8.527538 threshold = 3.866123 scale = 32.849449 +linear_140 : max = 12.128619 threshold = 4.657793 scale = 27.266134 +linear_141 : max = 9.839593 threshold = 3.845993 scale = 33.021378 +linear_143 : max = 12.442304 threshold = 7.099039 scale = 17.889746 +linear_142 : max = 12.442304 threshold = 5.325038 scale = 23.849592 +linear_144 : max = 5.929444 threshold = 5.618206 scale = 22.605080 +linear_145 : max = 13.382126 threshold = 9.321095 scale = 13.625010 +linear_146 : max = 9.894987 threshold = 3.867645 scale = 32.836517 +linear_147 : max = 10.915313 threshold = 4.906028 scale = 25.886522 +linear_148 : max = 9.614287 threshold = 3.908151 scale = 32.496181 +linear_150 : max = 11.724932 threshold = 4.485588 scale = 28.312899 +linear_149 : max = 11.724932 threshold = 5.161146 scale = 24.606939 +linear_151 : max = 7.164453 threshold = 5.847355 scale = 21.719223 +linear_152 : max = 13.086471 threshold = 5.984121 scale = 21.222834 +linear_153 : max = 11.099524 threshold = 3.991601 scale = 31.816805 +linear_154 : max = 10.054585 threshold = 4.489706 scale = 28.286930 +linear_155 : max = 12.389185 threshold = 3.100321 scale = 40.963501 +linear_157 : max = 9.982999 threshold = 5.154796 scale = 24.637253 +linear_156 : max = 9.982999 threshold = 8.537706 scale = 14.875190 +linear_158 : max = 8.420287 threshold = 6.502287 scale = 19.531588 +linear_159 : max = 25.014746 threshold = 9.423280 scale = 13.477261 +linear_160 : max = 45.633553 threshold = 5.715335 scale = 22.220921 +linear_161 : max = 20.371849 threshold = 5.117830 scale = 24.815203 +linear_162 : max = 12.492933 threshold = 3.126283 scale = 40.623318 +linear_164 : max = 20.697504 threshold = 4.825712 scale = 26.317358 +linear_163 : max = 20.697504 threshold = 5.078367 scale = 25.008038 +linear_165 : max = 9.023975 threshold = 6.836278 scale = 18.577358 +linear_166 : max = 34.860619 threshold = 7.259792 scale = 17.493614 +linear_167 : max = 30.380934 threshold = 5.496160 scale = 23.107042 +linear_168 : max = 20.691216 threshold = 4.733317 scale = 26.831076 +linear_169 : max = 9.723948 threshold = 3.952728 scale = 32.129707 +linear_171 : max = 21.034811 threshold = 5.366547 scale = 23.665123 +linear_170 : max = 21.034811 threshold = 5.356277 scale = 23.710501 +linear_172 : max = 10.556884 threshold = 5.729481 scale = 22.166058 +linear_173 : max = 20.033039 threshold = 10.207264 scale = 12.442120 +linear_174 : max = 11.597379 threshold = 2.658676 scale = 47.768131 +----------joiner---------- +linear_2 : max = 19.293503 threshold = 14.305265 scale = 8.877850 +linear_1 : max = 10.812222 threshold = 8.766452 scale = 14.487047 +linear_3 : max = 0.999999 threshold = 0.999755 scale = 127.031174 +ncnn int8 calibration table create success, best wish for your int8 inference has a low accuracy loss...\(^0^)/...233... diff --git a/docs/source/model-export/code/test-stremaing-ncnn-decode-conv-emformer-transducer-libri.txt b/docs/source/model-export/code/test-stremaing-ncnn-decode-conv-emformer-transducer-libri.txt new file mode 100644 index 000000000..114fe7342 --- /dev/null +++ b/docs/source/model-export/code/test-stremaing-ncnn-decode-conv-emformer-transducer-libri.txt @@ -0,0 +1,7 @@ +2023-01-11 14:02:12,216 INFO [streaming-ncnn-decode.py:320] {'tokens': './icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/data/lang_bpe_500/tokens.txt', 'encoder_param_filename': './icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.ncnn.param', 'encoder_bin_filename': './icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.ncnn.bin', 'decoder_param_filename': './icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.ncnn.param', 'decoder_bin_filename': './icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.ncnn.bin', 'joiner_param_filename': './icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.ncnn.param', 'joiner_bin_filename': './icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.ncnn.bin', 'sound_filename': './icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/test_wavs/1089-134686-0001.wav'} +T 51 32 +2023-01-11 14:02:13,141 INFO [streaming-ncnn-decode.py:328] Constructing Fbank computer +2023-01-11 14:02:13,151 INFO [streaming-ncnn-decode.py:331] Reading sound files: ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/test_wavs/1089-134686-0001.wav +2023-01-11 14:02:13,176 INFO [streaming-ncnn-decode.py:336] torch.Size([106000]) +2023-01-11 14:02:17,581 INFO [streaming-ncnn-decode.py:380] ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/test_wavs/1089-134686-0001.wav +2023-01-11 14:02:17,581 INFO [streaming-ncnn-decode.py:381] AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS diff --git a/docs/source/model-export/export-ncnn.rst b/docs/source/model-export/export-ncnn.rst index 3dbb8b514..ed0264089 100644 --- a/docs/source/model-export/export-ncnn.rst +++ b/docs/source/model-export/export-ncnn.rst @@ -1,12 +1,771 @@ Export to ncnn ============== -We support exporting LSTM transducer models to `ncnn `_. - -Please refer to :ref:`export-model-for-ncnn` for details. +We support exporting both +`LSTM transducer models `_ +and +`ConvEmformer transducer models `_ +to `ncnn `_. We also provide ``_ performing speech recognition using ``ncnn`` with exported models. -It has been tested on Linux, macOS, Windows, and Raspberry Pi. The project is -self-contained and can be statically linked to produce a binary containing -everything needed. +It has been tested on Linux, macOS, Windows, ``Android``, and ``Raspberry Pi``. + +`sherpa-ncnn`_ is self-contained and can be statically linked to produce +a binary containing everything needed. Please refer +to its documentation for details: + + - ``_ + + +Export LSTM transducer models +----------------------------- + +Please refer to :ref:`export-lstm-transducer-model-for-ncnn` for details. + + + +Export ConvEmformer transducer models +------------------------------------- + +We use the pre-trained model from the following repository as an example: + + - ``_ + +We will show you step by step how to export it to `ncnn`_ and run it with `sherpa-ncnn`_. + +.. hint:: + + We use ``Ubuntu 18.04``, ``torch 1.10``, and ``Python 3.8`` for testing. + +.. caution:: + + Please use a more recent version of PyTorch. For instance, ``torch 1.8`` + may ``not`` work. + +1. Download the pre-trained model +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. hint:: + + You can also refer to ``_ to download the pre-trained model. + + You have to install `git-lfs`_ before you continue. + +.. code-block:: bash + + cd egs/librispeech/ASR + + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/Zengwei/icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05 + cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05 + + git lfs pull --include "exp/pretrained-epoch-30-avg-10-averaged.pt" + git lfs pull --include "data/lang_bpe_500/bpe.model" + + cd .. + +.. note:: + + We download ``exp/pretrained-xxx.pt``, not ``exp/cpu-jit_xxx.pt``. + + +In the above code, we download the pre-trained model into the directory +``egs/librispeech/ASR/icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05``. + +2. Install ncnn and pnnx +^^^^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: bash + + # We put ncnn into $HOME/open-source/ncnn + # You can change it to anywhere you like + + cd $HOME + mkdir -p open-source + cd open-source + + git clone https://github.com/csukuangfj/ncnn + cd ncnn + git submodule update --recursive --init + + # Note: We don't use "python setup.py install" or "pip install ." here + + mkdir -p build-wheel + cd build-wheel + + cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DNCNN_PYTHON=ON \ + -DNCNN_BUILD_BENCHMARK=OFF \ + -DNCNN_BUILD_EXAMPLES=OFF \ + -DNCNN_BUILD_TOOLS=ON \ + .. + + make -j4 + + cd .. + + # Note: $PWD here is $HOME/open-source/ncnn + + export PYTHONPATH=$PWD/python:$PYTHONPATH + export PATH=$PWD/tools/pnnx/build/src:$PATH + export PATH=$PWD/build-wheel/tools/quantize:$PATH + + # Now build pnnx + cd tools/pnnx + mkdir build + cd build + cmake .. + make -j4 + + ./src/pnnx + +Congratulations! You have successfully installed the following components: + + - ``pnxx``, which is an executable located in + ``$HOME/open-source/ncnn/tools/pnnx/build/src``. We will use + it to convert models exported by ``torch.jit.trace()``. + - ``ncnn2int8``, which is an executable located in + ``$HOME/open-source/ncnn/build-wheel/tools/quantize``. We will use + it to quantize our models to ``int8``. + - ``ncnn.cpython-38-x86_64-linux-gnu.so``, which is a Python module located + in ``$HOME/open-source/ncnn/python/ncnn``. + + .. note:: + + I am using ``Python 3.8``, so it + is ``ncnn.cpython-38-x86_64-linux-gnu.so``. If you use a different + version, say, ``Python 3.9``, the name would be + ``ncnn.cpython-39-x86_64-linux-gnu.so``. + + Also, if you are not using Linux, the file name would also be different. + But that does not matter. As long as you can compile it, it should work. + +We have set up ``PYTHONPATH`` so that you can use ``import ncnn`` in your +Python code. We have also set up ``PATH`` so that you can use +``pnnx`` and ``ncnn2int8`` later in your terminal. + +.. caution:: + + Please don't use ``_. + We have made some modifications to the offical `ncnn`_. + + We will synchronize ``_ periodically + with the official one. + +3. Export the model via torch.jit.trace() +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +First, let us rename our pre-trained model: + +.. code-block:: + + cd egs/librispeech/ASR + + cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp + + ln -s pretrained-epoch-30-avg-10-averaged.pt epoch-30.pt + + cd ../.. + +Next, we use the following code to export our model: + +.. code-block:: bash + + dir=./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/ + + ./conv_emformer_transducer_stateless2/export-for-ncnn.py \ + --exp-dir $dir/exp \ + --bpe-model $dir/data/lang_bpe_500/bpe.model \ + --epoch 30 \ + --avg 1 \ + --use-averaged-model 0 \ + \ + --num-encoder-layers 12 \ + --chunk-length 32 \ + --cnn-module-kernel 31 \ + --left-context-length 32 \ + --right-context-length 8 \ + --memory-size 32 \ + --encoder-dim 512 + +.. hint:: + + We have renamed our model to ``epoch-30.pt`` so that we can use ``--epoch 30``. + There is only one pre-trained model, so we use ``--avg 1 --use-averaged-model 0``. + + If you have trained a model by yourself and if you have all checkpoints + available, please first use ``decode.py`` to tune ``--epoch --avg`` + and select the best combination with with ``--use-averaged-model 1``. + +.. note:: + + You will see the following log output: + + .. literalinclude:: ./code/export-conv-emformer-transducer-for-ncnn-output.txt + + The log shows the model has ``75490012`` parameters, i.e., ``~75 M``. + + .. code-block:: + + ls -lh icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/pretrained-epoch-30-avg-10-averaged.pt + + -rw-r--r-- 1 kuangfangjun root 289M Jan 11 12:05 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/pretrained-epoch-30-avg-10-averaged.pt + + You can see that the file size of the pre-trained model is ``289 MB``, which + is roughly ``75490012*4/1024/1024 = 287.97 MB``. + +After running ``conv_emformer_transducer_stateless2/export-for-ncnn.py``, +we will get the following files: + +.. code-block:: bash + + ls -lh icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/*pnnx* + + -rw-r--r-- 1 kuangfangjun root 1010K Jan 11 12:15 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.pt + -rw-r--r-- 1 kuangfangjun root 283M Jan 11 12:15 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.pt + -rw-r--r-- 1 kuangfangjun root 3.0M Jan 11 12:15 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.pt + + +.. _conv-emformer-step-3-export-torchscript-model-via-pnnx: + +3. Export torchscript model via pnnx +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. hint:: + + Make sure you have set up the ``PATH`` environment variable. Otherwise, + it will throw an error saying that ``pnnx`` could not be found. + +Now, it's time to export our models to `ncnn`_ via ``pnnx``. + +.. code-block:: + + cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/ + + pnnx ./encoder_jit_trace-pnnx.pt + pnnx ./decoder_jit_trace-pnnx.pt + pnnx ./joiner_jit_trace-pnnx.pt + +It will generate the following files: + +.. code-block:: bash + + ls -lh icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/*ncnn*{bin,param} + + -rw-r--r-- 1 kuangfangjun root 503K Jan 11 12:38 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.ncnn.bin + -rw-r--r-- 1 kuangfangjun root 437 Jan 11 12:38 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.ncnn.param + -rw-r--r-- 1 kuangfangjun root 142M Jan 11 12:36 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.ncnn.bin + -rw-r--r-- 1 kuangfangjun root 79K Jan 11 12:36 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.ncnn.param + -rw-r--r-- 1 kuangfangjun root 1.5M Jan 11 12:38 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.ncnn.bin + -rw-r--r-- 1 kuangfangjun root 488 Jan 11 12:38 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.ncnn.param + +There are two types of files: + +- ``param``: It is a text file containing the model architectures. You can + use a text editor to view its content. +- ``bin``: It is a binary file containing the model parameters. + +We compare the file sizes of the models below before and after converting via ``pnnx``: + +.. see https://tableconvert.com/restructuredtext-generator + ++----------------------------------+------------+ +| File name | File size | ++==================================+============+ +| encoder_jit_trace-pnnx.pt | 283 MB | ++----------------------------------+------------+ +| decoder_jit_trace-pnnx.pt | 1010 KB | ++----------------------------------+------------+ +| joiner_jit_trace-pnnx.pt | 3.0 MB | ++----------------------------------+------------+ +| encoder_jit_trace-pnnx.ncnn.bin | 142 MB | ++----------------------------------+------------+ +| decoder_jit_trace-pnnx.ncnn.bin | 503 KB | ++----------------------------------+------------+ +| joiner_jit_trace-pnnx.ncnn.bin | 1.5 MB | ++----------------------------------+------------+ + +You can see that the file sizes of the models after conversion are about one half +of the models before conversion: + + - encoder: 283 MB vs 142 MB + - decoder: 1010 KB vs 503 KB + - joiner: 3.0 MB vs 1.5 MB + +The reason is that by default ``pnnx`` converts ``float32`` parameters +to ``float16``. A ``float32`` parameter occupies 4 bytes, while it is 2 bytes +for ``float16``. Thus, it is ``twice smaller`` after conversion. + +.. hint:: + + If you use ``pnnx ./encoder_jit_trace-pnnx.pt fp16=0``, then ``pnnx`` + won't convert ``float32`` to ``float16``. + +4. Test the exported models in icefall +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. note:: + + We assume you have set up the environment variable ``PYTHONPATH`` when + building `ncnn`_. + +Now we have successfully converted our pre-trained model to `ncnn`_ format. +The generated 6 files are what we need. You can use the following code to +test the converted models: + +.. code-block:: bash + + ./conv_emformer_transducer_stateless2/streaming-ncnn-decode.py \ + --tokens ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/data/lang_bpe_500/tokens.txt \ + --encoder-param-filename ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.ncnn.param \ + --encoder-bin-filename ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.ncnn.bin \ + --decoder-param-filename ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.ncnn.param \ + --decoder-bin-filename ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.ncnn.bin \ + --joiner-param-filename ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.ncnn.param \ + --joiner-bin-filename ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.ncnn.bin \ + ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/test_wavs/1089-134686-0001.wav + +.. hint:: + + `ncnn`_ supports only ``batch size == 1``, so ``streaming-ncnn-decode.py`` accepts + only 1 wave file as input. + +The output is given below: + +.. literalinclude:: ./code/test-stremaing-ncnn-decode-conv-emformer-transducer-libri.txt + +Congratulations! You have successfully exported a model from PyTorch to `ncnn`_! + + +.. _conv-emformer-modify-the-exported-encoder-for-sherpa-ncnn: + +5. Modify the exported encoder for sherpa-ncnn +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +In order to use the exported models in `sherpa-ncnn`_, we have to modify +``encoder_jit_trace-pnnx.ncnn.param``. + +Let us have a look at the first few lines of ``encoder_jit_trace-pnnx.ncnn.param``: + +.. code-block:: + + 7767517 + 1060 1342 + Input in0 0 1 in0 + +**Explanation** of the above three lines: + + 1. ``7767517``, it is a magic number and should not be changed. + 2. ``1060 1342``, the first number ``1060`` specifies the number of layers + in this file, while ``1342`` specifies the number of intermediate outputs + of this file + 3. ``Input in0 0 1 in0``, ``Input`` is the layer type of this layer; ``in0`` + is the layer name of this layer; ``0`` means this layer has no input; + ``1`` means this layer has one output; ``in0`` is the output name of + this layer. + +We need to add 1 extra line and also increment the number of layers. +The result looks like below: + +.. code-block:: bash + + 7767517 + 1061 1342 + SherpaMetaData sherpa_meta_data1 0 0 0=1 1=12 2=32 3=31 4=8 5=32 6=8 7=512 + Input in0 0 1 in0 + +**Explanation** + + 1. ``7767517``, it is still the same + 2. ``1061 1342``, we have added an extra layer, so we need to update ``1060`` to ``1061``. + We don't need to change ``1342`` since the newly added layer has no inputs or outputs. + 3. ``SherpaMetaData sherpa_meta_data1 0 0 0=1 1=12 2=32 3=31 4=8 5=32 6=8 7=512`` + This line is newly added. Its explanation is given below: + + - ``SherpaMetaData`` is the type of this layer. Must be ``SherpaMetaData``. + - ``sherpa_meta_data1`` is the name of this layer. Must be ``sherpa_meta_data1``. + - ``0 0`` means this layer has no inputs or output. Must be ``0 0`` + - ``0=1``, 0 is the key and 1 is the value. MUST be ``0=1`` + - ``1=12``, 1 is the key and 12 is the value of the + parameter ``--num-encoder-layers`` that you provided when running + ``conv_emformer_transducer_stateless2/export-for-ncnn.py``. + - ``2=32``, 2 is the key and 32 is the value of the + parameter ``--memory-size`` that you provided when running + ``conv_emformer_transducer_stateless2/export-for-ncnn.py``. + - ``3=31``, 3 is the key and 31 is the value of the + parameter ``--cnn-module-kernel`` that you provided when running + ``conv_emformer_transducer_stateless2/export-for-ncnn.py``. + - ``4=8``, 4 is the key and 8 is the value of the + parameter ``--left-context-length`` that you provided when running + ``conv_emformer_transducer_stateless2/export-for-ncnn.py``. + - ``5=32``, 5 is the key and 32 is the value of the + parameter ``--chunk-length`` that you provided when running + ``conv_emformer_transducer_stateless2/export-for-ncnn.py``. + - ``6=8``, 6 is the key and 8 is the value of the + parameter ``--right-context-length`` that you provided when running + ``conv_emformer_transducer_stateless2/export-for-ncnn.py``. + - ``7=512``, 7 is the key and 512 is the value of the + parameter ``--encoder-dim`` that you provided when running + ``conv_emformer_transducer_stateless2/export-for-ncnn.py``. + + For ease of reference, we list the key-value pairs that you need to add + in the following table. If your model has a different setting, please + change the values for ``SherpaMetaData`` accordingly. Otherwise, you + will be ``SAD``. + + +------+-----------------------------+ + | key | value | + +======+=============================+ + | 0 | 1 (fixed) | + +------+-----------------------------+ + | 1 | ``--num-encoder-layers`` | + +------+-----------------------------+ + | 2 | ``--memory-size`` | + +------+-----------------------------+ + | 3 | ``--cnn-module-kernel`` | + +------+-----------------------------+ + | 4 | ``--left-context-length`` | + +------+-----------------------------+ + | 5 | ``--chunk-length`` | + +------+-----------------------------+ + | 6 | ``--right-context-length`` | + +------+-----------------------------+ + | 7 | ``--encoder-dim`` | + +------+-----------------------------+ + + 4. ``Input in0 0 1 in0``. No need to change it. + +.. caution:: + + When you add a new layer ``SherpaMetaData``, please remember to update the + number of layers. In our case, update ``1060`` to ``1061``. Otherwise, + you will be SAD later. + +.. hint:: + + After adding the new layer ``SherpaMetaData``, you cannot use this model + with ``streaming-ncnn-decode.py`` anymore since ``SherpaMetaData`` is + supported only in `sherpa-ncnn`_. + +.. hint:: + + `ncnn`_ is very flexible. You can add new layers to it just by text-editing + the ``param`` file! You don't need to change the ``bin`` file. + +Now you can use this model in `sherpa-ncnn`_. +Please refer to the following documentation: + + - Linux/macOS/Windows/arm/aarch64: ``_ + - Android: ``_ + - Python: ``_ + +We have a list of pre-trained models that have been exported for `sherpa-ncnn`_: + + - ``_ + + You can find more usages there. + +6. (Optional) int8 quantization with sherpa-ncnn +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +This step is optional. + +In this step, we describe how to quantize our model with ``int8``. + +Change :ref:`conv-emformer-step-3-export-torchscript-model-via-pnnx` to +disable ``fp16`` when using ``pnnx``: + +.. code-block:: + + cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/ + + pnnx ./encoder_jit_trace-pnnx.pt fp16=0 + pnnx ./decoder_jit_trace-pnnx.pt + pnnx ./joiner_jit_trace-pnnx.pt fp16=0 + +.. note:: + + We add ``fp16=0`` when exporting the encoder and joiner. `ncnn`_ does not + support quantizing the decoder model yet. We will update this documentation + once `ncnn`_ supports it. (Maybe in this year, 2023). + +It will generate the following files + +.. code-block:: bash + + ls -lh icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/*_jit_trace-pnnx.ncnn.{param,bin} + + -rw-r--r-- 1 kuangfangjun root 503K Jan 11 15:56 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.ncnn.bin + -rw-r--r-- 1 kuangfangjun root 437 Jan 11 15:56 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.ncnn.param + -rw-r--r-- 1 kuangfangjun root 283M Jan 11 15:56 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.ncnn.bin + -rw-r--r-- 1 kuangfangjun root 79K Jan 11 15:56 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.ncnn.param + -rw-r--r-- 1 kuangfangjun root 3.0M Jan 11 15:56 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.ncnn.bin + -rw-r--r-- 1 kuangfangjun root 488 Jan 11 15:56 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.ncnn.param + +Let us compare again the file sizes: + ++----------------------------------------+------------+ +| File name | File size | ++----------------------------------------+------------+ +| encoder_jit_trace-pnnx.pt | 283 MB | ++----------------------------------------+------------+ +| decoder_jit_trace-pnnx.pt | 1010 KB | ++----------------------------------------+------------+ +| joiner_jit_trace-pnnx.pt | 3.0 MB | ++----------------------------------------+------------+ +| encoder_jit_trace-pnnx.ncnn.bin (fp16) | 142 MB | ++----------------------------------------+------------+ +| decoder_jit_trace-pnnx.ncnn.bin (fp16) | 503 KB | ++----------------------------------------+------------+ +| joiner_jit_trace-pnnx.ncnn.bin (fp16) | 1.5 MB | ++----------------------------------------+------------+ +| encoder_jit_trace-pnnx.ncnn.bin (fp32) | 283 MB | ++----------------------------------------+------------+ +| joiner_jit_trace-pnnx.ncnn.bin (fp32) | 3.0 MB | ++----------------------------------------+------------+ + +You can see that the file sizes are doubled when we disable ``fp16``. + +.. note:: + + You can again use ``streaming-ncnn-decode.py`` to test the exported models. + +Next, follow :ref:`conv-emformer-modify-the-exported-encoder-for-sherpa-ncnn` +to modify ``encoder_jit_trace-pnnx.ncnn.param``. + +Change + +.. code-block:: bash + + 7767517 + 1060 1342 + Input in0 0 1 in0 + +to + +.. code-block:: bash + + 7767517 + 1061 1342 + SherpaMetaData sherpa_meta_data1 0 0 0=1 1=12 2=32 3=31 4=8 5=32 6=8 7=512 + Input in0 0 1 in0 + +.. caution:: + + Please follow :ref:`conv-emformer-modify-the-exported-encoder-for-sherpa-ncnn` + to change the values for ``SherpaMetaData`` if your model uses a different setting. + + +Next, let us compile `sherpa-ncnn`_ since we will quantize our models within +`sherpa-ncnn`_. + +.. code-block:: bash + + # We will download sherpa-ncnn to $HOME/open-source/ + # You can change it to anywhere you like. + cd $HOME + mkdir -p open-source + + cd open-source + git clone https://github.com/k2-fsa/sherpa-ncnn + cd sherpa-ncnn + mkdir build + cd build + cmake .. + make -j 4 + + ./bin/generate-int8-scale-table + + export PATH=$HOME/open-source/sherpa-ncnn/build/bin:$PATH + +The output of the above commands are: + +.. code-block:: bash + + (py38) kuangfangjun:build$ generate-int8-scale-table + Please provide 10 arg. Currently given: 1 + Usage: + generate-int8-scale-table encoder.param encoder.bin decoder.param decoder.bin joiner.param joiner.bin encoder-scale-table.txt joiner-scale-table.txt wave_filenames.txt + + Each line in wave_filenames.txt is a path to some 16k Hz mono wave file. + +We need to create a file ``wave_filenames.txt``, in which we need to put +some calibration wave files. For testing purpose, we put the ``test_wavs`` +from the pre-trained model repository ``_ + +.. code-block:: bash + + cd egs/librispeech/ASR + cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/ + + cat < wave_filenames.txt + ../test_wavs/1089-134686-0001.wav + ../test_wavs/1221-135766-0001.wav + ../test_wavs/1221-135766-0002.wav + EOF + +Now we can calculate the scales needed for quantization with the calibration data: + +.. code-block:: bash + + cd egs/librispeech/ASR + cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/ + + generate-int8-scale-table \ + ./encoder_jit_trace-pnnx.ncnn.param \ + ./encoder_jit_trace-pnnx.ncnn.bin \ + ./decoder_jit_trace-pnnx.ncnn.param \ + ./decoder_jit_trace-pnnx.ncnn.bin \ + ./joiner_jit_trace-pnnx.ncnn.param \ + ./joiner_jit_trace-pnnx.ncnn.bin \ + ./encoder-scale-table.txt \ + ./joiner-scale-table.txt \ + ./wave_filenames.txt + +The output logs are in the following: + +.. literalinclude:: ./code/generate-int-8-scale-table-for-conv-emformer.txt + +It generates the following two files: + +.. code-block:: bash + + $ ls -lh encoder-scale-table.txt joiner-scale-table.txt + -rw-r--r-- 1 kuangfangjun root 955K Jan 11 17:28 encoder-scale-table.txt + -rw-r--r-- 1 kuangfangjun root 18K Jan 11 17:28 joiner-scale-table.txt + +.. caution:: + + Definitely, you need more calibration data to compute the scale table. + +Finally, let us use the scale table to quantize our models into ``int8``. + +.. code-block:: bash + + ncnn2int8 + + usage: ncnn2int8 [inparam] [inbin] [outparam] [outbin] [calibration table] + +First, we quantize the encoder model: + +.. code-block:: bash + + cd egs/librispeech/ASR + cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/ + + ncnn2int8 \ + ./encoder_jit_trace-pnnx.ncnn.param \ + ./encoder_jit_trace-pnnx.ncnn.bin \ + ./encoder_jit_trace-pnnx.ncnn.int8.param \ + ./encoder_jit_trace-pnnx.ncnn.int8.bin \ + ./encoder-scale-table.txt + +Next, we quantize the joiner model: + +.. code-block:: bash + + ncnn2int8 \ + ./joiner_jit_trace-pnnx.ncnn.param \ + ./joiner_jit_trace-pnnx.ncnn.bin \ + ./joiner_jit_trace-pnnx.ncnn.int8.param \ + ./joiner_jit_trace-pnnx.ncnn.int8.bin \ + ./joiner-scale-table.txt + +The above two commands generate the following 4 files: + +.. code-block:: bash + + -rw-r--r-- 1 kuangfangjun root 99M Jan 11 17:34 encoder_jit_trace-pnnx.ncnn.int8.bin + -rw-r--r-- 1 kuangfangjun root 78K Jan 11 17:34 encoder_jit_trace-pnnx.ncnn.int8.param + -rw-r--r-- 1 kuangfangjun root 774K Jan 11 17:35 joiner_jit_trace-pnnx.ncnn.int8.bin + -rw-r--r-- 1 kuangfangjun root 496 Jan 11 17:35 joiner_jit_trace-pnnx.ncnn.int8.param + +Congratulations! You have successfully quantized your model from ``float32`` to ``int8``. + +.. caution:: + + ``ncnn.int8.param`` and ``ncnn.int8.bin`` must be used in pairs. + + You can replace ``ncnn.param`` and ``ncnn.bin`` with ``ncnn.int8.param`` + and ``ncnn.int8.bin`` in `sherpa-ncnn`_ if you like. + + For instance, to use only the ``int8`` encoder in ``sherpa-ncnn``, you can + replace the following invocation: + + .. code-block:: + + cd egs/librispeech/ASR + cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/ + + sherpa-ncnn \ + ../data/lang_bpe_500/tokens.txt \ + ./encoder_jit_trace-pnnx.ncnn.param \ + ./encoder_jit_trace-pnnx.ncnn.bin \ + ./decoder_jit_trace-pnnx.ncnn.param \ + ./decoder_jit_trace-pnnx.ncnn.bin \ + ./joiner_jit_trace-pnnx.ncnn.param \ + ./joiner_jit_trace-pnnx.ncnn.bin \ + ../test_wavs/1089-134686-0001.wav + + with + + .. code-block:: + + cd egs/librispeech/ASR + cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/ + + sherpa-ncnn \ + ../data/lang_bpe_500/tokens.txt \ + ./encoder_jit_trace-pnnx.ncnn.int8.param \ + ./encoder_jit_trace-pnnx.ncnn.int8.bin \ + ./decoder_jit_trace-pnnx.ncnn.param \ + ./decoder_jit_trace-pnnx.ncnn.bin \ + ./joiner_jit_trace-pnnx.ncnn.param \ + ./joiner_jit_trace-pnnx.ncnn.bin \ + ../test_wavs/1089-134686-0001.wav + + +The following table compares again the file sizes: + + ++----------------------------------------+------------+ +| File name | File size | ++----------------------------------------+------------+ +| encoder_jit_trace-pnnx.pt | 283 MB | ++----------------------------------------+------------+ +| decoder_jit_trace-pnnx.pt | 1010 KB | ++----------------------------------------+------------+ +| joiner_jit_trace-pnnx.pt | 3.0 MB | ++----------------------------------------+------------+ +| encoder_jit_trace-pnnx.ncnn.bin (fp16) | 142 MB | ++----------------------------------------+------------+ +| decoder_jit_trace-pnnx.ncnn.bin (fp16) | 503 KB | ++----------------------------------------+------------+ +| joiner_jit_trace-pnnx.ncnn.bin (fp16) | 1.5 MB | ++----------------------------------------+------------+ +| encoder_jit_trace-pnnx.ncnn.bin (fp32) | 283 MB | ++----------------------------------------+------------+ +| joiner_jit_trace-pnnx.ncnn.bin (fp32) | 3.0 MB | ++----------------------------------------+------------+ +| encoder_jit_trace-pnnx.ncnn.int8.bin | 99 MB | ++----------------------------------------+------------+ +| joiner_jit_trace-pnnx.ncnn.int8.bin | 774 KB | ++----------------------------------------+------------+ + +You can see that the file sizes of the model after ``int8`` quantization +are much smaller. + +.. hint:: + + Currently, only linear layers and convolutional layers are quantized + with ``int8``, so you don't see an exact ``4x`` reduction in file sizes. + +.. note:: + + You need to test the recognition accuracy after ``int8`` quantization. + +You can find the speed comparison at ``_. + + +That's it! Have fun with `sherpa-ncnn`_! diff --git a/docs/source/recipes/Non-streaming-ASR/librispeech/distillation.rst b/docs/source/recipes/Non-streaming-ASR/librispeech/distillation.rst new file mode 100644 index 000000000..ea9f350cd --- /dev/null +++ b/docs/source/recipes/Non-streaming-ASR/librispeech/distillation.rst @@ -0,0 +1,223 @@ +Distillation with HuBERT +======================== + +This tutorial shows you how to perform knowledge distillation in `icefall`_ +with the `LibriSpeech`_ dataset. The distillation method +used here is called "Multi Vector Quantization Knowledge Distillation" (MVQ-KD). +Please have a look at our paper `Predicting Multi-Codebook Vector Quantization Indexes for Knowledge Distillation `_ +for more details about MVQ-KD. + +.. note:: + + This tutorial is based on recipe + `pruned_transducer_stateless4 `_. + Currently, we only implement MVQ-KD in this recipe. However, MVQ-KD is theoretically applicable to all recipes + with only minor changes needed. Feel free to try out MVQ-KD in different recipes. If you + encounter any problems, please open an issue here `icefall `_. + +.. note:: + + We assume you have read the page :ref:`install icefall` and have setup + the environment for `icefall`_. + +.. HINT:: + + We recommend you to use a GPU or several GPUs to run this recipe. + +Data preparation +---------------- + +We first prepare necessary training data for `LibriSpeech`_. +This is the same as in :ref:`non_streaming_librispeech_pruned_transducer_stateless`. + +.. hint:: + + The data preparation is the same as other recipes on LibriSpeech dataset, + if you have finished this step, you can skip to :ref:`codebook_index_preparation` directly. + +.. code-block:: bash + + $ cd egs/librispeech/ASR + $ ./prepare.sh + +The script ``./prepare.sh`` handles the data preparation for you, **automagically**. +All you need to do is to run it. + +The data preparation contains several stages, you can use the following two +options: + + - ``--stage`` + - ``--stop-stage`` + +to control which stage(s) should be run. By default, all stages are executed. + +For example, + +.. code-block:: bash + + $ cd egs/librispeech/ASR + $ ./prepare.sh --stage 0 --stop-stage 0 # run only stage 0 + $ ./prepare.sh --stage 2 --stop-stage 5 # run from stage 2 to stage 5 + +.. HINT:: + + If you have pre-downloaded the `LibriSpeech`_ + dataset and the `musan`_ dataset, say, + they are saved in ``/tmp/LibriSpeech`` and ``/tmp/musan``, you can modify + the ``dl_dir`` variable in ``./prepare.sh`` to point to ``/tmp`` so that + ``./prepare.sh`` won't re-download them. + +.. NOTE:: + + All generated files by ``./prepare.sh``, e.g., features, lexicon, etc, + are saved in ``./data`` directory. + +We provide the following YouTube video showing how to run ``./prepare.sh``. + +.. note:: + + To get the latest news of `next-gen Kaldi `_, please subscribe + the following YouTube channel by `Nadira Povey `_: + + ``_ + +.. youtube:: ofEIoJL-mGM + + +.. _codebook_index_preparation: + +Codebook index preparation +-------------------------- + +Here, we prepare necessary data for MVQ-KD. This requires the generation +of codebook indexes (please read our `paper `_. +if you are interested in details). In this tutorial, we use the pre-computed +codebook indexes for convenience. The only thing you need to do is to +run `./distillation_with_hubert.sh `_. + +.. note:: + + There are 5 stages in total, the first and second stage will be automatically skipped + when choosing to downloaded codebook indexes prepared by `icefall`_. + Of course, you can extract and compute the codebook indexes by yourself. This + will require you downloading a HuBERT-XL model and it can take a while for + the extraction of codebook indexes. + + +As usual, you can control the stages you want to run by specifying the following +two options: + + - ``--stage`` + - ``--stop-stage`` + +For example, + +.. code-block:: bash + + $ cd egs/librispeech/ASR + $ ./distillation_with_hubert.sh --stage 0 --stop-stage 0 # run only stage 0 + $ ./distillation_with_hubert.sh --stage 2 --stop-stage 4 # run from stage 2 to stage 5 + +Here are a few options in `./distillation_with_hubert.sh `_ +you need to know before you proceed. + +- ``--full_libri`` If True, use full 960h data. Otherwise only ``train-clean-100`` will be used +- ``--use_extracted_codebook`` If True, the first two stages will be skipped and the codebook + indexes uploaded by us will be downloaded. + +Since we are using the pre-computed codebook indexes, we set +``use_extracted_codebook=True``. If you want to do full `LibriSpeech`_ +experiments, please set ``full_libri=True``. + +The following command downloads the pre-computed codebook indexes +and prepares MVQ-augmented training manifests. + +.. code-block:: bash + + $ ./distillation_with_hubert.sh --stage 2 --stop-stage 2 # run only stage 2 + +Please see the +following screenshot for the output of an example execution. + +.. figure:: ./images/distillation_codebook.png + :width: 800 + :alt: Downloading codebook indexes and preparing training manifest. + :align: center + + Downloading codebook indexes and preparing training manifest. + +.. hint:: + + The codebook indexes we prepared for you in this tutorial + are extracted from the 36-th layer of a fine-tuned HuBERT-XL model + with 8 codebooks. If you want to try other configurations, please + set ``use_extracted_codebook=False`` and set ``embedding_layer`` and + ``num_codebooks`` by yourself. + +Now, you should see the following files under the directory ``./data/vq_fbank_layer36_cb8``. + +.. figure:: ./images/distillation_directory.png + :width: 800 + :alt: MVQ-augmented training manifests + :align: center + + MVQ-augmented training manifests. + +Whola! You are ready to perform knowledge distillation training now! + +Training +-------- + +To perform training, please run stage 3 by executing the following command. + +.. code-block:: bash + + $ ./prepare.sh --stage 3 --stop-stage 3 # run MVQ training + +Here is the code snippet for training: + +.. code-block:: bash + + WORLD_SIZE=$(echo ${CUDA_VISIBLE_DEVICES} | awk '{n=split($1, _, ","); print n}') + + ./pruned_transducer_stateless6/train.py \ + --manifest-dir ./data/vq_fbank_layer36_cb8 \ + --master-port 12359 \ + --full-libri $full_libri \ + --spec-aug-time-warp-factor -1 \ + --max-duration 300 \ + --world-size ${WORLD_SIZE} \ + --num-epochs 30 \ + --exp-dir $exp_dir \ + --enable-distillation True \ + --codebook-loss-scale 0.01 + +There are a few training arguments in the following +training commands that should be paid attention to. + + - ``--enable-distillation`` If True, knowledge distillation training is enabled. + - ``--codebook-loss-scale`` The scale of the knowledge distillation loss. + - ``--manifest-dir`` The path to the MVQ-augmented manifest. + + +Decoding +-------- + +After training finished, you can test the performance on using +the following command. + +.. code-block:: bash + + export CUDA_VISIBLE_DEVICES=0 + ./pruned_transducer_stateless6/train.py \ + --decoding-method "modified_beam_search" \ + --epoch 30 \ + --avg 10 \ + --max-duration 200 \ + --exp-dir $exp_dir \ + --enable-distillation True + +You should get similar results as `here `_. + +That's all! Feel free to experiment with your own setups and report your results. +If you encounter any problems during training, please open up an issue `here `_. diff --git a/docs/source/recipes/Non-streaming-ASR/librispeech/images/distillation_codebook.png b/docs/source/recipes/Non-streaming-ASR/librispeech/images/distillation_codebook.png new file mode 100644 index 000000000..1a40d6c6e Binary files /dev/null and b/docs/source/recipes/Non-streaming-ASR/librispeech/images/distillation_codebook.png differ diff --git a/docs/source/recipes/Non-streaming-ASR/librispeech/images/distillation_directory.png b/docs/source/recipes/Non-streaming-ASR/librispeech/images/distillation_directory.png new file mode 100644 index 000000000..30763046f Binary files /dev/null and b/docs/source/recipes/Non-streaming-ASR/librispeech/images/distillation_directory.png differ diff --git a/docs/source/recipes/Non-streaming-ASR/librispeech/index.rst b/docs/source/recipes/Non-streaming-ASR/librispeech/index.rst index 3ebb36b25..bf439861a 100644 --- a/docs/source/recipes/Non-streaming-ASR/librispeech/index.rst +++ b/docs/source/recipes/Non-streaming-ASR/librispeech/index.rst @@ -9,3 +9,4 @@ LibriSpeech pruned_transducer_stateless zipformer_mmi zipformer_ctc_blankskip + distillation diff --git a/docs/source/recipes/Non-streaming-ASR/librispeech/pruned_transducer_stateless.rst b/docs/source/recipes/Non-streaming-ASR/librispeech/pruned_transducer_stateless.rst index 86d43c8fe..42fd3df77 100644 --- a/docs/source/recipes/Non-streaming-ASR/librispeech/pruned_transducer_stateless.rst +++ b/docs/source/recipes/Non-streaming-ASR/librispeech/pruned_transducer_stateless.rst @@ -1,3 +1,5 @@ +.. _non_streaming_librispeech_pruned_transducer_stateless: + Pruned transducer statelessX ============================ diff --git a/docs/source/recipes/Streaming-ASR/librispeech/lstm_pruned_stateless_transducer.rst b/docs/source/recipes/Streaming-ASR/librispeech/lstm_pruned_stateless_transducer.rst index 643855cc2..ce8ba1453 100644 --- a/docs/source/recipes/Streaming-ASR/librispeech/lstm_pruned_stateless_transducer.rst +++ b/docs/source/recipes/Streaming-ASR/librispeech/lstm_pruned_stateless_transducer.rst @@ -515,10 +515,10 @@ To use the generated files with ``./lstm_transducer_stateless2/jit_pretrained``: Please see ``_ for how to use the exported models in ``sherpa``. -.. _export-model-for-ncnn: +.. _export-lstm-transducer-model-for-ncnn: -Export model for ncnn -~~~~~~~~~~~~~~~~~~~~~ +Export LSTM transducer models for ncnn +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ We support exporting pretrained LSTM transducer models to `ncnn `_ using @@ -531,16 +531,36 @@ First, let us install a modified version of ``ncnn``: git clone https://github.com/csukuangfj/ncnn cd ncnn git submodule update --recursive --init - python3 setup.py bdist_wheel - ls -lh dist/ - pip install ./dist/*.whl + + # Note: We don't use "python setup.py install" or "pip install ." here + + mkdir -p build-wheel + cd build-wheel + + cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DNCNN_PYTHON=ON \ + -DNCNN_BUILD_BENCHMARK=OFF \ + -DNCNN_BUILD_EXAMPLES=OFF \ + -DNCNN_BUILD_TOOLS=ON \ + .. + + make -j4 + + cd .. + + # Note: $PWD here is /path/to/ncnn + + export PYTHONPATH=$PWD/python:$PYTHONPATH + export PATH=$PWD/tools/pnnx/build/src:$PATH + export PATH=$PWD/build-wheel/tools/quantize:$PATH # now build pnnx cd tools/pnnx mkdir build cd build + cmake .. make -j4 - export PATH=$PWD/src:$PATH ./src/pnnx @@ -549,6 +569,9 @@ First, let us install a modified version of ``ncnn``: We assume that you have added the path to the binary ``pnnx`` to the environment variable ``PATH``. + We also assume that you have added ``build/tools/quantize`` to the environment + variable ``PATH`` so that you are able to use ``ncnn2int8`` later. + Second, let us export the model using ``torch.jit.trace()`` that is suitable for ``pnnx``: @@ -634,3 +657,6 @@ by visiting the following links: You can find more usages of the pretrained models in ``_ + +Export ConvEmformer transducer models for ncnn +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/egs/librispeech/ASR/conv_emformer_transducer_stateless2/emformer2.py b/egs/librispeech/ASR/conv_emformer_transducer_stateless2/emformer2.py index 188059044..f0c92a9b4 100644 --- a/egs/librispeech/ASR/conv_emformer_transducer_stateless2/emformer2.py +++ b/egs/librispeech/ASR/conv_emformer_transducer_stateless2/emformer2.py @@ -1512,24 +1512,6 @@ class EmformerEncoder(nn.Module): ) return states - attn_caches = [ - [ - torch.zeros(self.memory_size, self.d_model, device=device), - torch.zeros(self.left_context_length, self.d_model, device=device), - torch.zeros(self.left_context_length, self.d_model, device=device), - ] - for _ in range(self.num_encoder_layers) - ] - conv_caches = [ - torch.zeros(self.d_model, self.cnn_module_kernel - 1, device=device) - for _ in range(self.num_encoder_layers) - ] - states: Tuple[List[List[torch.Tensor]], List[torch.Tensor]] = ( - attn_caches, - conv_caches, - ) - return states - class Emformer(EncoderInterface): def __init__( diff --git a/egs/librispeech/ASR/conv_emformer_transducer_stateless2/streaming-ncnn-decode.py b/egs/librispeech/ASR/conv_emformer_transducer_stateless2/streaming-ncnn-decode.py index b21fe5c7e..e4104a5bb 100755 --- a/egs/librispeech/ASR/conv_emformer_transducer_stateless2/streaming-ncnn-decode.py +++ b/egs/librispeech/ASR/conv_emformer_transducer_stateless2/streaming-ncnn-decode.py @@ -131,6 +131,8 @@ class Model: encoder_net = ncnn.Net() encoder_net.opt.use_packing_layout = False encoder_net.opt.use_fp16_storage = False + encoder_net.opt.num_threads = 4 + encoder_param = args.encoder_param_filename encoder_model = args.encoder_bin_filename @@ -144,6 +146,7 @@ class Model: decoder_model = args.decoder_bin_filename decoder_net = ncnn.Net() + decoder_net.opt.num_threads = 4 decoder_net.load_param(decoder_param) decoder_net.load_model(decoder_model) @@ -154,6 +157,8 @@ class Model: joiner_param = args.joiner_param_filename joiner_model = args.joiner_bin_filename joiner_net = ncnn.Net() + joiner_net.opt.num_threads = 4 + joiner_net.load_param(joiner_param) joiner_net.load_model(joiner_model) @@ -176,7 +181,6 @@ class Model: - next_states, a list of tensors containing the next states """ with self.encoder_net.create_extractor() as ex: - ex.set_num_threads(4) ex.input("in0", ncnn.Mat(x.numpy()).clone()) # layer0 in2-in5 @@ -220,7 +224,6 @@ class Model: assert decoder_input.dtype == torch.int32 with self.decoder_net.create_extractor() as ex: - ex.set_num_threads(4) ex.input("in0", ncnn.Mat(decoder_input.numpy()).clone()) ret, ncnn_out0 = ex.extract("out0") assert ret == 0, ret @@ -229,7 +232,6 @@ class Model: def run_joiner(self, encoder_out, decoder_out): with self.joiner_net.create_extractor() as ex: - ex.set_num_threads(4) ex.input("in0", ncnn.Mat(encoder_out.numpy()).clone()) ex.input("in1", ncnn.Mat(decoder_out.numpy()).clone()) ret, ncnn_out0 = ex.extract("out0") diff --git a/egs/librispeech/ASR/distillation_with_hubert.sh b/egs/librispeech/ASR/distillation_with_hubert.sh index a38cf590c..6aaa0333b 100755 --- a/egs/librispeech/ASR/distillation_with_hubert.sh +++ b/egs/librispeech/ASR/distillation_with_hubert.sh @@ -150,7 +150,7 @@ if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then num_codebooks=8 mkdir -p $exp_dir/vq - codebook_dir=$exp_dir/vq/${teacher_model_id}_layer${embedding_layer}_cb${num_codebooks} + codebook_dir=$exp_dir/vq/${teacher_model_id} mkdir -p codebook_dir codebook_download_dir=$exp_dir/download_codebook if [ -d $codebook_download_dir ]; then @@ -180,9 +180,9 @@ if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then ./pruned_transducer_stateless6/extract_codebook_index.py \ --full-libri $full_libri \ --exp-dir $exp_dir \ - --embedding-layer 36 \ + --embedding-layer $embedding_layer \ --num-utts 1000 \ - --num-codebooks 8 \ + --num-codebooks $num_codebooks \ --max-duration 100 \ --teacher-model-id $teacher_model_id \ --use-extracted-codebook $use_extracted_codebook diff --git a/egs/librispeech/ASR/lstm_transducer_stateless2/ncnn-decode.py b/egs/librispeech/ASR/lstm_transducer_stateless2/ncnn-decode.py index 3b471fa85..3bd1b0a09 100755 --- a/egs/librispeech/ASR/lstm_transducer_stateless2/ncnn-decode.py +++ b/egs/librispeech/ASR/lstm_transducer_stateless2/ncnn-decode.py @@ -104,6 +104,8 @@ class Model: encoder_net = ncnn.Net() encoder_net.opt.use_packing_layout = False encoder_net.opt.use_fp16_storage = False + encoder_net.opt.num_threads = 4 + encoder_param = args.encoder_param_filename encoder_model = args.encoder_bin_filename @@ -118,6 +120,7 @@ class Model: decoder_net = ncnn.Net() decoder_net.opt.use_packing_layout = False + decoder_net.opt.num_threads = 4 decoder_net.load_param(decoder_param) decoder_net.load_model(decoder_model) @@ -129,6 +132,8 @@ class Model: joiner_model = args.joiner_bin_filename joiner_net = ncnn.Net() joiner_net.opt.use_packing_layout = False + joiner_net.opt.num_threads = 4 + joiner_net.load_param(joiner_param) joiner_net.load_model(joiner_model) @@ -136,7 +141,6 @@ class Model: def run_encoder(self, x, states): with self.encoder_net.create_extractor() as ex: - ex.set_num_threads(10) ex.input("in0", ncnn.Mat(x.numpy()).clone()) x_lens = torch.tensor([x.size(0)], dtype=torch.float32) ex.input("in1", ncnn.Mat(x_lens.numpy()).clone()) @@ -165,7 +169,6 @@ class Model: assert decoder_input.dtype == torch.int32 with self.decoder_net.create_extractor() as ex: - ex.set_num_threads(10) ex.input("in0", ncnn.Mat(decoder_input.numpy()).clone()) ret, ncnn_out0 = ex.extract("out0") assert ret == 0, ret @@ -174,7 +177,6 @@ class Model: def run_joiner(self, encoder_out, decoder_out): with self.joiner_net.create_extractor() as ex: - ex.set_num_threads(10) ex.input("in0", ncnn.Mat(encoder_out.numpy()).clone()) ex.input("in1", ncnn.Mat(decoder_out.numpy()).clone()) ret, ncnn_out0 = ex.extract("out0") diff --git a/egs/librispeech/ASR/lstm_transducer_stateless2/streaming-ncnn-decode.py b/egs/librispeech/ASR/lstm_transducer_stateless2/streaming-ncnn-decode.py index baff15ea6..02ed16a8c 100755 --- a/egs/librispeech/ASR/lstm_transducer_stateless2/streaming-ncnn-decode.py +++ b/egs/librispeech/ASR/lstm_transducer_stateless2/streaming-ncnn-decode.py @@ -92,6 +92,8 @@ class Model: encoder_net = ncnn.Net() encoder_net.opt.use_packing_layout = False encoder_net.opt.use_fp16_storage = False + encoder_net.opt.num_threads = 4 + encoder_param = args.encoder_param_filename encoder_model = args.encoder_bin_filename @@ -106,6 +108,7 @@ class Model: decoder_net = ncnn.Net() decoder_net.opt.use_packing_layout = False + decoder_net.opt.num_threads = 4 decoder_net.load_param(decoder_param) decoder_net.load_model(decoder_model) @@ -117,6 +120,8 @@ class Model: joiner_model = args.joiner_bin_filename joiner_net = ncnn.Net() joiner_net.opt.use_packing_layout = False + joiner_net.opt.num_threads = 4 + joiner_net.load_param(joiner_param) joiner_net.load_model(joiner_model) @@ -124,7 +129,6 @@ class Model: def run_encoder(self, x, states): with self.encoder_net.create_extractor() as ex: - # ex.set_num_threads(10) ex.input("in0", ncnn.Mat(x.numpy()).clone()) x_lens = torch.tensor([x.size(0)], dtype=torch.float32) ex.input("in1", ncnn.Mat(x_lens.numpy()).clone()) @@ -153,7 +157,6 @@ class Model: assert decoder_input.dtype == torch.int32 with self.decoder_net.create_extractor() as ex: - # ex.set_num_threads(10) ex.input("in0", ncnn.Mat(decoder_input.numpy()).clone()) ret, ncnn_out0 = ex.extract("out0") assert ret == 0, ret @@ -162,7 +165,6 @@ class Model: def run_joiner(self, encoder_out, decoder_out): with self.joiner_net.create_extractor() as ex: - # ex.set_num_threads(10) ex.input("in0", ncnn.Mat(encoder_out.numpy()).clone()) ex.input("in1", ncnn.Mat(decoder_out.numpy()).clone()) ret, ncnn_out0 = ex.extract("out0") diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7_ctc/ctc_decode.py b/egs/librispeech/ASR/pruned_transducer_stateless7_ctc/ctc_decode.py index 9c23e7d66..4b373e4c7 100755 --- a/egs/librispeech/ASR/pruned_transducer_stateless7_ctc/ctc_decode.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless7_ctc/ctc_decode.py @@ -44,7 +44,7 @@ Usage: --exp-dir ./pruned_transducer_stateless7_ctc/exp \ --max-duration 600 \ --hlg-scale 0.8 \ - --decoding-method 1best + --decoding-method nbest (4) nbest-rescoring ./pruned_transducer_stateless7_ctc/ctc_decode.py \ diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7_ctc_bs/ctc_decode.py b/egs/librispeech/ASR/pruned_transducer_stateless7_ctc_bs/ctc_decode.py index 0ef733226..f137485b2 100755 --- a/egs/librispeech/ASR/pruned_transducer_stateless7_ctc_bs/ctc_decode.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless7_ctc_bs/ctc_decode.py @@ -42,7 +42,7 @@ Usage: --exp-dir ./pruned_transducer_stateless7_ctc_bs/exp \ --max-duration 600 \ --hlg-scale 0.8 \ - --decoding-method 1best + --decoding-method nbest (4) nbest-rescoring ./pruned_transducer_stateless7_ctc_bs/ctc_decode.py \ --epoch 30 \