diff --git a/docs/README.md b/docs/README.md
new file mode 100644
index 000000000..3abb38f8b
--- /dev/null
+++ b/docs/README.md
@@ -0,0 +1,24 @@
+
+## Usage
+
+```bash
+cd /path/to/icefall/docs
+pip install -r requirements.txt
+make clean
+make html
+cd build/html
+python3 -m http.server 8000
+```
+
+It prints:
+
+```
+Serving HTTP on 0.0.0.0 port 8000 (http://0.0.0.0:8000/) ...
+```
+
+Open your browser and go to <http://0.0.0.0:8000/> to view the generated
+documentation.
+
+Done!
+
+**Hint**: You can change the port number when starting the server.
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 221d9d734..ef9fe1445 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -78,3 +78,12 @@ html_context = {
 }
 
 todo_include_todos = True
+
+rst_epilog = """
+.. _sherpa-ncnn: https://github.com/k2-fsa/sherpa-ncnn
+.. _icefall: https://github.com/k2-fsa/icefall
+.. _git-lfs: https://git-lfs.com/
+.. _ncnn: https://github.com/tencent/ncnn
+.. _LibriSpeech: https://www.openslr.org/12
+.. _musan: http://www.openslr.org/17/
+"""
diff --git a/docs/source/faqs.rst b/docs/source/faqs.rst
new file mode 100644
index 000000000..72b0302d7
--- /dev/null
+++ b/docs/source/faqs.rst
@@ -0,0 +1,107 @@
+Frequently Asked Questions (FAQs)
+=================================
+
+In this section, we collect issues reported by users and post the corresponding
+solutions.
+
+
+OSError: libtorch_hip.so: cannot open shared object file: no such file or directory
+-----------------------------------------------------------------------------------
+
+One user is using the following code to install ``torch`` and ``torchaudio``:
+
+.. code-block:: bash
+
+  pip install \
+    torch==1.10.0+cu111 \
+    torchvision==0.11.0+cu111 \
+    torchaudio==0.10.0 \
+    -f https://download.pytorch.org/whl/torch_stable.html
+
+and it throws the following error when running ``tdnn/train.py``:
+
+.. code-block::
+
+  OSError: libtorch_hip.so: cannot open shared object file: no such file or directory
+
+The fix is to specify the CUDA version while installing ``torchaudio``. That
+is, change ``torchaudio==0.10.0`` to ``torchaudio==0.10.0+cu111``. Therefore,
+the correct command is:
+
+.. code-block:: bash
+
+  pip install \
+    torch==1.10.0+cu111 \
+    torchvision==0.11.0+cu111 \
+    torchaudio==0.10.0+cu111 \
+    -f https://download.pytorch.org/whl/torch_stable.html
+
+AttributeError: module 'distutils' has no attribute 'version'
+-------------------------------------------------------------
+
+The error log is:
+
+.. code-block::
+
+  Traceback (most recent call last):
+    File "./tdnn/train.py", line 14, in <module>
+      from asr_datamodule import YesNoAsrDataModule
+    File "/home/xxx/code/next-gen-kaldi/icefall/egs/yesno/ASR/tdnn/asr_datamodule.py", line 34, in <module>
+      from icefall.dataset.datamodule import DataModule
+    File "/home/xxx/code/next-gen-kaldi/icefall/icefall/__init__.py", line 3, in <module>
+      from . import (
+    File "/home/xxx/code/next-gen-kaldi/icefall/icefall/decode.py", line 23, in <module>
+      from icefall.utils import add_eos, add_sos, get_texts
+    File "/home/xxx/code/next-gen-kaldi/icefall/icefall/utils.py", line 39, in <module>
+      from torch.utils.tensorboard import SummaryWriter
+    File "/home/xxx/tool/miniconda3/envs/yyy/lib/python3.8/site-packages/torch/utils/tensorboard/__init__.py", line 4, in <module>
+      LooseVersion = distutils.version.LooseVersion
+  AttributeError: module 'distutils' has no attribute 'version'
+
+The fix is:
+
+.. code-block:: bash
+
+  pip uninstall setuptools
+
+  pip install setuptools==58.0.4
+
+ImportError: libpython3.10.so.1.0: cannot open shared object file: No such file or directory
+--------------------------------------------------------------------------------------------
+
+If you are using ``conda`` and encounter the following issue:
+
+.. code-block::
+
+  Traceback (most recent call last):
+    File "/k2-dev/yangyifan/anaconda3/envs/icefall/lib/python3.10/site-packages/k2-1.23.3.dev20230112+cuda11.6.torch1.13.1-py3.10-linux-x86_64.egg/k2/__init__.py", line 24, in <module>
+      from _k2 import DeterminizeWeightPushingType
+  ImportError: libpython3.10.so.1.0: cannot open shared object file: No such file or directory
+
+  During handling of the above exception, another exception occurred:
+
+  Traceback (most recent call last):
+    File "/k2-dev/yangyifan/icefall/egs/librispeech/ASR/./pruned_transducer_stateless7_ctc_bs/decode.py", line 104, in <module>
+      import k2
+    File "/k2-dev/yangyifan/anaconda3/envs/icefall/lib/python3.10/site-packages/k2-1.23.3.dev20230112+cuda11.6.torch1.13.1-py3.10-linux-x86_64.egg/k2/__init__.py", line 30, in <module>
+      raise ImportError(
+  ImportError: libpython3.10.so.1.0: cannot open shared object file: No such file or directory
+  Note: If you're using anaconda and importing k2 on MacOS,
+        you can probably fix this by setting the environment variable:
+    export DYLD_LIBRARY_PATH=$CONDA_PREFIX/lib/python3.10/site-packages:$DYLD_LIBRARY_PATH
+
+Please first try to find where ``libpython3.10.so.1.0`` is located.
+
+For instance,
+
+.. code-block:: bash
+
+  cd $CONDA_PREFIX/lib
+  find . -name "libpython*"
+
+If you are able to find it inside ``$CONDA_PREFIX/lib``, please set the
+following environment variable:
+
+.. code-block:: bash
+
+  export LD_LIBRARY_PATH=$CONDA_PREFIX/lib:$LD_LIBRARY_PATH
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 4ea446259..8d76eb68b 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -21,6 +21,7 @@ speech recognition recipes using `k2 <https://github.com/k2-fsa/k2>`_.
    :caption: Contents:
 
    installation/index
+   faqs
    model-export/index
 
 .. toctree::
diff --git a/docs/source/model-export/code/export-conv-emformer-transducer-for-ncnn-output.txt b/docs/source/model-export/code/export-conv-emformer-transducer-for-ncnn-output.txt
new file mode 100644
index 000000000..ecbdd4b31
--- /dev/null
+++ b/docs/source/model-export/code/export-conv-emformer-transducer-for-ncnn-output.txt
@@ -0,0 +1,21 @@
+2023-01-11 12:15:38,677 INFO [export-for-ncnn.py:220] device: cpu
+2023-01-11 12:15:38,681 INFO [export-for-ncnn.py:229] {'best_train_loss': inf, 'best_valid_loss': inf, 'best_train_epoch': -1, 'best_v
+alid_epoch': -1, 'batch_idx_train': 0, 'log_interval': 50, 'reset_interval': 200, 'valid_interval': 3000, 'feature_dim': 80, 'subsampl
+ing_factor': 4, 'decoder_dim': 512, 'joiner_dim': 512, 'model_warm_step': 3000, 'env_info': {'k2-version': '1.23.2', 'k2-build-type':
+'Release', 'k2-with-cuda': True, 'k2-git-sha1': 'a34171ed85605b0926eebbd0463d059431f4f74a', 'k2-git-date': 'Wed Dec 14 00:06:38 2022',
+ 'lhotse-version': '1.12.0.dev+missing.version.file', 'torch-version': '1.10.0+cu102', 'torch-cuda-available': False, 'torch-cuda-vers
+ion': '10.2', 'python-version': '3.8', 'icefall-git-branch': 'fix-stateless3-train-2022-12-27', 'icefall-git-sha1': '530e8a1-dirty', '
+icefall-git-date': 'Tue Dec 27 13:59:18 2022', 'icefall-path': '/star-fj/fangjun/open-source/icefall', 'k2-path': '/star-fj/fangjun/op
+en-source/k2/k2/python/k2/__init__.py', 'lhotse-path': '/star-fj/fangjun/open-source/lhotse/lhotse/__init__.py', 'hostname': 'de-74279
+-k2-train-3-1220120619-7695ff496b-s9n4w', 'IP address': '127.0.0.1'}, 'epoch': 30, 'iter': 0, 'avg': 1, 'exp_dir': PosixPath('icefa
+ll-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp'), 'bpe_model': './icefall-asr-librispeech-conv-emformer-transdu
+cer-stateless2-2022-07-05//data/lang_bpe_500/bpe.model', 'jit': False, 'context_size': 2, 'use_averaged_model': False, 'encoder_dim':
+512, 'nhead': 8, 'dim_feedforward': 2048, 'num_encoder_layers': 12, 'cnn_module_kernel': 31, 'left_context_length': 32, 'chunk_length'
+: 32, 'right_context_length': 8, 'memory_size': 32, 'blank_id': 0, 'vocab_size': 500}
+2023-01-11 12:15:38,681 INFO [export-for-ncnn.py:231] About to create model
+2023-01-11 12:15:40,053 INFO [checkpoint.py:112] Loading checkpoint from icefall-asr-librispeech-conv-emformer-transducer-stateless2-2
+022-07-05/exp/epoch-30.pt
+2023-01-11 12:15:40,708 INFO [export-for-ncnn.py:315] Number of model parameters: 75490012
+2023-01-11 12:15:41,681 INFO [export-for-ncnn.py:318] Using torch.jit.trace()
+2023-01-11 12:15:41,681 INFO [export-for-ncnn.py:320] Exporting encoder
+2023-01-11 12:15:41,682 INFO [export-for-ncnn.py:149] chunk_length: 32, right_context_length: 8
diff --git a/docs/source/model-export/code/generate-int-8-scale-table-for-conv-emformer.txt b/docs/source/model-export/code/generate-int-8-scale-table-for-conv-emformer.txt
new file mode 100644
index 000000000..347e7e51a
--- /dev/null
+++ b/docs/source/model-export/code/generate-int-8-scale-table-for-conv-emformer.txt
@@ -0,0 +1,104 @@
+Don't Use GPU. has_gpu: 0, config.use_vulkan_compute: 1
+num encoder conv layers: 88
+num joiner conv layers: 3
+num files: 3
+Processing ../test_wavs/1089-134686-0001.wav
+Processing ../test_wavs/1221-135766-0001.wav
+Processing ../test_wavs/1221-135766-0002.wav
+Processing ../test_wavs/1089-134686-0001.wav
+Processing ../test_wavs/1221-135766-0001.wav
+Processing ../test_wavs/1221-135766-0002.wav
+----------encoder----------
+conv_87                                  : max = 15.942385        threshold = 15.938493        scale = 7.968131
+conv_88                                  : max = 35.442448        threshold = 15.549335        scale = 8.167552
+conv_89                                  : max = 23.228289        threshold = 8.001738         scale = 15.871552
+linear_90                                : max = 3.976146         threshold = 1.101789         scale = 115.267128
+linear_91                                : max = 6.962030         threshold = 5.162033         scale = 24.602713
+linear_92                                : max = 12.323041        threshold = 3.853959         scale = 32.953129
+linear_94                                : max = 6.905416         threshold = 4.648006         scale = 27.323545
+linear_93                                : max = 6.905416         threshold = 5.474093         scale = 23.200188
+linear_95                                : max = 1.888012         threshold = 1.403563         scale = 90.483986
+linear_96                                : max = 6.856741         threshold = 5.398679         scale = 23.524273
+linear_97                                : max = 9.635942         threshold = 2.613655         scale = 48.590950
+linear_98                                : max = 6.460340         threshold = 5.670146         scale = 22.398010
+linear_99                                : max = 9.532276         threshold = 2.585537         scale = 49.119396
+linear_101                               : max = 6.585871         threshold = 5.719224         scale = 22.205809
+linear_100                               : max = 6.585871         threshold = 5.751382         scale = 22.081648
+linear_102                               : max = 1.593344         threshold = 1.450581         scale = 87.551147
+linear_103                               : max = 6.592681         threshold = 5.705824         scale = 22.257959
+linear_104                               : max = 8.752957         threshold = 1.980955         scale = 64.110489
+linear_105                               : max = 6.696240         threshold = 5.877193         scale = 21.608953
+linear_106                               : max = 9.059659         threshold = 2.643138         scale = 48.048950
+linear_108                               : max = 6.975461         threshold = 4.589567         scale = 27.671457
+linear_107                               : max = 6.975461         threshold = 6.190381         scale = 20.515701
+linear_109                               : max = 3.710759         threshold = 2.305635         scale = 55.082436
+linear_110                               : max = 7.531228         threshold = 5.731162         scale = 22.159557
+linear_111                               : max = 10.528083        threshold = 2.259322         scale = 56.211544
+linear_112                               : max = 8.148807         threshold = 5.500842         scale = 23.087374
+linear_113                               : max = 8.592566         threshold = 1.948851         scale = 65.166611
+linear_115                               : max = 8.437109         threshold = 5.608947         scale = 22.642395
+linear_114                               : max = 8.437109         threshold = 6.193942         scale = 20.503904
+linear_116                               : max = 3.966980         threshold = 3.200896         scale = 39.676392
+linear_117                               : max = 9.451303         threshold = 6.061664         scale = 20.951344
+linear_118                               : max = 12.077262        threshold = 3.965800         scale = 32.023804
+linear_119                               : max = 9.671615         threshold = 4.847613         scale = 26.198460
+linear_120                               : max = 8.625638         threshold = 3.131427         scale = 40.556595
+linear_122                               : max = 10.274080        threshold = 4.888716         scale = 25.978189
+linear_121                               : max = 10.274080        threshold = 5.420480         scale = 23.429659
+linear_123                               : max = 4.826197         threshold = 3.599617         scale = 35.281532
+linear_124                               : max = 11.396383        threshold = 7.325849         scale = 17.335875
+linear_125                               : max = 9.337198         threshold = 3.941410         scale = 32.221970
+linear_126                               : max = 9.699965         threshold = 4.842878         scale = 26.224073
+linear_127                               : max = 8.775370         threshold = 3.884215         scale = 32.696438
+linear_129                               : max = 9.872276         threshold = 4.837319         scale = 26.254213
+linear_128                               : max = 9.872276         threshold = 7.180057         scale = 17.687883
+linear_130                               : max = 4.150427         threshold = 3.454298         scale = 36.765789
+linear_131                               : max = 11.112692        threshold = 7.924847         scale = 16.025545
+linear_132                               : max = 11.852893        threshold = 3.116593         scale = 40.749626
+linear_133                               : max = 11.517084        threshold = 5.024665         scale = 25.275314
+linear_134                               : max = 10.683807        threshold = 3.878618         scale = 32.743618
+linear_136                               : max = 12.421055        threshold = 6.322729         scale = 20.086264
+linear_135                               : max = 12.421055        threshold = 5.309880         scale = 23.917679
+linear_137                               : max = 4.827781         threshold = 3.744595         scale = 33.915554
+linear_138                               : max = 14.422395        threshold = 7.742882         scale = 16.402161
+linear_139                               : max = 8.527538         threshold = 3.866123         scale = 32.849449
+linear_140                               : max = 12.128619        threshold = 4.657793         scale = 27.266134
+linear_141                               : max = 9.839593         threshold = 3.845993         scale = 33.021378
+linear_143                               : max = 12.442304        threshold = 7.099039         scale = 17.889746
+linear_142                               : max = 12.442304        threshold = 5.325038         scale = 23.849592
+linear_144                               : max = 5.929444         threshold = 5.618206         scale = 22.605080
+linear_145                               : max = 13.382126        threshold = 9.321095         scale = 13.625010
+linear_146                               : max = 9.894987         threshold = 3.867645         scale = 32.836517
+linear_147                               : max = 10.915313        threshold = 4.906028         scale = 25.886522
+linear_148                               : max = 9.614287         threshold = 3.908151         scale = 32.496181
+linear_150                               : max = 11.724932        threshold = 4.485588         scale = 28.312899
+linear_149                               : max = 11.724932        threshold = 5.161146         scale = 24.606939
+linear_151                               : max = 7.164453         threshold = 5.847355         scale = 21.719223
+linear_152                               : max = 13.086471        threshold = 5.984121         scale = 21.222834
+linear_153                               : max = 11.099524        threshold = 3.991601         scale = 31.816805
+linear_154                               : max = 10.054585        threshold = 4.489706         scale = 28.286930
+linear_155                               : max = 12.389185        threshold = 3.100321         scale = 40.963501
+linear_157                               : max = 9.982999         threshold = 5.154796         scale = 24.637253
+linear_156                               : max = 9.982999         threshold = 8.537706         scale = 14.875190
+linear_158                               : max = 8.420287         threshold = 6.502287         scale = 19.531588
+linear_159                               : max = 25.014746        threshold = 9.423280         scale = 13.477261
+linear_160                               : max = 45.633553        threshold = 5.715335         scale = 22.220921
+linear_161                               : max = 20.371849        threshold = 5.117830         scale = 24.815203
+linear_162                               : max = 12.492933        threshold = 3.126283         scale = 40.623318
+linear_164                               : max = 20.697504        threshold = 4.825712         scale = 26.317358
+linear_163                               : max = 20.697504        threshold = 5.078367         scale = 25.008038
+linear_165                               : max = 9.023975         threshold = 6.836278         scale = 18.577358
+linear_166                               : max = 34.860619        threshold = 7.259792         scale = 17.493614
+linear_167                               : max = 30.380934        threshold = 5.496160         scale = 23.107042
+linear_168                               : max = 20.691216        threshold = 4.733317         scale = 26.831076
+linear_169                               : max = 9.723948         threshold = 3.952728         scale = 32.129707
+linear_171                               : max = 21.034811        threshold = 5.366547         scale = 23.665123
+linear_170                               : max = 21.034811        threshold = 5.356277         scale = 23.710501
+linear_172                               : max = 10.556884        threshold = 5.729481         scale = 22.166058
+linear_173                               : max = 20.033039        threshold = 10.207264        scale = 12.442120
+linear_174                               : max = 11.597379        threshold = 2.658676         scale = 47.768131
+----------joiner----------
+linear_2                                 : max = 19.293503        threshold = 14.305265        scale = 8.877850
+linear_1                                 : max = 10.812222        threshold = 8.766452         scale = 14.487047
+linear_3                                 : max = 0.999999         threshold = 0.999755         scale = 127.031174
+ncnn int8 calibration table create success, best wish for your int8 inference has a low accuracy loss...\(^0^)/...233...
diff --git a/docs/source/model-export/code/test-stremaing-ncnn-decode-conv-emformer-transducer-libri.txt b/docs/source/model-export/code/test-stremaing-ncnn-decode-conv-emformer-transducer-libri.txt
new file mode 100644
index 000000000..114fe7342
--- /dev/null
+++ b/docs/source/model-export/code/test-stremaing-ncnn-decode-conv-emformer-transducer-libri.txt
@@ -0,0 +1,7 @@
+2023-01-11 14:02:12,216 INFO [streaming-ncnn-decode.py:320] {'tokens': './icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/data/lang_bpe_500/tokens.txt', 'encoder_param_filename': './icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.ncnn.param', 'encoder_bin_filename': './icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.ncnn.bin', 'decoder_param_filename': './icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.ncnn.param', 'decoder_bin_filename': './icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.ncnn.bin', 'joiner_param_filename': './icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.ncnn.param', 'joiner_bin_filename': './icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.ncnn.bin', 'sound_filename': './icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/test_wavs/1089-134686-0001.wav'}
+T 51 32
+2023-01-11 14:02:13,141 INFO [streaming-ncnn-decode.py:328] Constructing Fbank computer
+2023-01-11 14:02:13,151 INFO [streaming-ncnn-decode.py:331] Reading sound files: ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/test_wavs/1089-134686-0001.wav
+2023-01-11 14:02:13,176 INFO [streaming-ncnn-decode.py:336] torch.Size([106000])
+2023-01-11 14:02:17,581 INFO [streaming-ncnn-decode.py:380] ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/test_wavs/1089-134686-0001.wav
+2023-01-11 14:02:17,581 INFO [streaming-ncnn-decode.py:381] AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS
diff --git a/docs/source/model-export/export-ncnn.rst b/docs/source/model-export/export-ncnn.rst
index 3dbb8b514..ed0264089 100644
--- a/docs/source/model-export/export-ncnn.rst
+++ b/docs/source/model-export/export-ncnn.rst
@@ -1,12 +1,771 @@
 Export to ncnn
 ==============
 
-We support exporting LSTM transducer models to `ncnn <https://github.com/tencent/ncnn>`_.
-
-Please refer to :ref:`export-model-for-ncnn` for details.
+We support exporting both
+`LSTM transducer models <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/lstm_transducer_stateless2>`_
+and
+`ConvEmformer transducer models <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/conv_emformer_transducer_stateless2>`_
+to `ncnn <https://github.com/tencent/ncnn>`_.
 
 We also provide `<https://github.com/k2-fsa/sherpa-ncnn>`_
 performing speech recognition using ``ncnn`` with exported models.
-It has been tested on Linux, macOS, Windows, and Raspberry Pi. The project is
-self-contained and can be statically linked to produce a binary containing
-everything needed.
+It has been tested on Linux, macOS, Windows, ``Android``, and ``Raspberry Pi``.
+
+`sherpa-ncnn`_ is self-contained and can be statically linked to produce
+a binary containing everything needed. Please refer
+to its documentation for details:
+
+ - `<https://k2-fsa.github.io/sherpa/ncnn/index.html>`_
+
+
+Export LSTM transducer models
+-----------------------------
+
+Please refer to :ref:`export-lstm-transducer-model-for-ncnn` for details.
+
+
+
+Export ConvEmformer transducer models
+-------------------------------------
+
+We use the pre-trained model from the following repository as an example:
+
+  - `<https://huggingface.co/Zengwei/icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05>`_
+
+We will show you step by step how to export it to `ncnn`_ and run it with `sherpa-ncnn`_.
+
+.. hint::
+
+  We use ``Ubuntu 18.04``, ``torch 1.10``, and ``Python 3.8`` for testing.
+
+.. caution::
+
+  Please use a more recent version of PyTorch. For instance, ``torch 1.8``
+  may ``not`` work.
+
+1. Download the pre-trained model
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. hint::
+
+  You can also refer to `<https://k2-fsa.github.io/sherpa/cpp/pretrained_models/online_transducer.html#icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05>`_ to download the pre-trained model.
+
+  You have to install `git-lfs`_ before you continue.
+
+.. code-block:: bash
+
+  cd egs/librispeech/ASR
+
+  GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/Zengwei/icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05
+  cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05
+
+  git lfs pull --include "exp/pretrained-epoch-30-avg-10-averaged.pt"
+  git lfs pull --include "data/lang_bpe_500/bpe.model"
+
+  cd ..
+
+.. note::
+
+  We download ``exp/pretrained-xxx.pt``, not ``exp/cpu-jit_xxx.pt``.
+
+
+In the above code, we download the pre-trained model into the directory
+``egs/librispeech/ASR/icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05``.
+
+2. Install ncnn and pnnx
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: bash
+
+  # We put ncnn into $HOME/open-source/ncnn
+  # You can change it to anywhere you like
+
+  cd $HOME
+  mkdir -p open-source
+  cd open-source
+
+  git clone https://github.com/csukuangfj/ncnn
+  cd ncnn
+  git submodule update --recursive --init
+
+  # Note: We don't use "python setup.py install" or "pip install ." here
+
+  mkdir -p build-wheel
+  cd build-wheel
+
+  cmake \
+    -DCMAKE_BUILD_TYPE=Release \
+    -DNCNN_PYTHON=ON \
+    -DNCNN_BUILD_BENCHMARK=OFF \
+    -DNCNN_BUILD_EXAMPLES=OFF \
+    -DNCNN_BUILD_TOOLS=ON \
+  ..
+
+  make -j4
+
+  cd ..
+
+  # Note: $PWD here is $HOME/open-source/ncnn
+
+  export PYTHONPATH=$PWD/python:$PYTHONPATH
+  export PATH=$PWD/tools/pnnx/build/src:$PATH
+  export PATH=$PWD/build-wheel/tools/quantize:$PATH
+
+  # Now build pnnx
+  cd tools/pnnx
+  mkdir build
+  cd build
+  cmake ..
+  make -j4
+
+  ./src/pnnx
+
+Congratulations! You have successfully installed the following components:
+
+  - ``pnnx``, which is an executable located in
+    ``$HOME/open-source/ncnn/tools/pnnx/build/src``. We will use
+    it to convert models exported by ``torch.jit.trace()``.
+  - ``ncnn2int8``, which is an executable located in
+    ``$HOME/open-source/ncnn/build-wheel/tools/quantize``. We will use
+    it to quantize our models to ``int8``.
+  - ``ncnn.cpython-38-x86_64-linux-gnu.so``, which is a Python module located
+    in ``$HOME/open-source/ncnn/python/ncnn``.
+
+    .. note::
+
+      I am using ``Python 3.8``, so it
+      is ``ncnn.cpython-38-x86_64-linux-gnu.so``. If you use a different
+      version, say, ``Python 3.9``, the name would be
+      ``ncnn.cpython-39-x86_64-linux-gnu.so``.
+
+      Also, if you are not using Linux, the file name would also be different.
+      But that does not matter. As long as you can compile it, it should work.
+
+We have set up ``PYTHONPATH`` so that you can use ``import ncnn`` in your
+Python code. We have also set up ``PATH`` so that you can use
+``pnnx`` and ``ncnn2int8`` later in your terminal.
+
+.. caution::
+
+  Please don't use `<https://github.com/tencent/ncnn>`_.
+  We have made some modifications to the official `ncnn`_.
+
+  We will synchronize `<https://github.com/csukuangfj/ncnn>`_ periodically
+  with the official one.
+
+3. Export the model via torch.jit.trace()
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+First, let us rename our pre-trained model:
+
+.. code-block::
+
+  cd egs/librispeech/ASR
+
+  cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp
+
+  ln -s pretrained-epoch-30-avg-10-averaged.pt epoch-30.pt
+
+  cd ../..
+
+Next, we use the following code to export our model:
+
+.. code-block:: bash
+
+  dir=./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/
+
+  ./conv_emformer_transducer_stateless2/export-for-ncnn.py \
+    --exp-dir $dir/exp \
+    --bpe-model $dir/data/lang_bpe_500/bpe.model \
+    --epoch 30 \
+    --avg 1 \
+    --use-averaged-model 0 \
+    \
+    --num-encoder-layers 12 \
+    --chunk-length 32 \
+    --cnn-module-kernel 31 \
+    --left-context-length 32 \
+    --right-context-length 8 \
+    --memory-size 32 \
+    --encoder-dim 512
+
+.. hint::
+
+  We have renamed our model to ``epoch-30.pt`` so that we can use ``--epoch 30``.
+  There is only one pre-trained model, so we use ``--avg 1 --use-averaged-model 0``.
+
+  If you have trained a model by yourself and if you have all checkpoints
+  available, please first use ``decode.py`` to tune ``--epoch --avg``
+  and select the best combination with ``--use-averaged-model 1``.
+
+.. note::
+
+  You will see the following log output:
+
+  .. literalinclude:: ./code/export-conv-emformer-transducer-for-ncnn-output.txt
+
+  The log shows the model has ``75490012`` parameters, i.e., ``~75 M``.
+
+  .. code-block::
+
+    ls -lh icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/pretrained-epoch-30-avg-10-averaged.pt
+
+    -rw-r--r-- 1 kuangfangjun root 289M Jan 11 12:05 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/pretrained-epoch-30-avg-10-averaged.pt
+
+  You can see that the file size of the pre-trained model is ``289 MB``, which
+  is roughly ``75490012*4/1024/1024 = 287.97 MB``.
+
+After running ``conv_emformer_transducer_stateless2/export-for-ncnn.py``,
+we will get the following files:
+
+.. code-block:: bash
+
+  ls -lh icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/*pnnx*
+
+  -rw-r--r-- 1 kuangfangjun root 1010K Jan 11 12:15 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.pt
+  -rw-r--r-- 1 kuangfangjun root  283M Jan 11 12:15 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.pt
+  -rw-r--r-- 1 kuangfangjun root  3.0M Jan 11 12:15 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.pt
+
+
+.. _conv-emformer-step-3-export-torchscript-model-via-pnnx:
+
+3. Export torchscript model via pnnx
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. hint::
+
+  Make sure you have set up the ``PATH`` environment variable. Otherwise,
+  it will throw an error saying that ``pnnx`` could not be found.
+
+Now, it's time to export our models to `ncnn`_ via ``pnnx``.
+
+.. code-block::
+
+  cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/
+
+  pnnx ./encoder_jit_trace-pnnx.pt
+  pnnx ./decoder_jit_trace-pnnx.pt
+  pnnx ./joiner_jit_trace-pnnx.pt
+
+It will generate the following files:
+
+.. code-block:: bash
+
+  ls -lh  icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/*ncnn*{bin,param}
+
+  -rw-r--r-- 1 kuangfangjun root 503K Jan 11 12:38 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.ncnn.bin
+  -rw-r--r-- 1 kuangfangjun root  437 Jan 11 12:38 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.ncnn.param
+  -rw-r--r-- 1 kuangfangjun root 142M Jan 11 12:36 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.ncnn.bin
+  -rw-r--r-- 1 kuangfangjun root  79K Jan 11 12:36 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.ncnn.param
+  -rw-r--r-- 1 kuangfangjun root 1.5M Jan 11 12:38 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.ncnn.bin
+  -rw-r--r-- 1 kuangfangjun root  488 Jan 11 12:38 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.ncnn.param
+
+There are two types of files:
+
+- ``param``: It is a text file containing the model architectures. You can
+  use a text editor to view its content.
+- ``bin``: It is a binary file containing the model parameters.
+
+We compare the file sizes of the models below before and after converting via ``pnnx``:
+
+.. see https://tableconvert.com/restructuredtext-generator
+
++----------------------------------+------------+
+| File name                        | File size  |
++==================================+============+
+| encoder_jit_trace-pnnx.pt        | 283 MB     |
++----------------------------------+------------+
+| decoder_jit_trace-pnnx.pt        | 1010 KB    |
++----------------------------------+------------+
+| joiner_jit_trace-pnnx.pt         | 3.0 MB     |
++----------------------------------+------------+
+| encoder_jit_trace-pnnx.ncnn.bin  | 142 MB     |
++----------------------------------+------------+
+| decoder_jit_trace-pnnx.ncnn.bin  | 503 KB     |
++----------------------------------+------------+
+| joiner_jit_trace-pnnx.ncnn.bin   | 1.5 MB     |
++----------------------------------+------------+
+
+You can see that the file sizes of the models after conversion are about one half
+of the models before conversion:
+
+  - encoder: 283 MB vs 142 MB
+  - decoder: 1010 KB vs 503 KB
+  - joiner: 3.0 MB vs 1.5 MB
+
+The reason is that by default ``pnnx`` converts ``float32`` parameters
+to ``float16``. A ``float32`` parameter occupies 4 bytes, while it is 2 bytes
+for ``float16``. Thus, the file size is roughly halved after conversion.
+
+.. hint::
+
+  If you use ``pnnx ./encoder_jit_trace-pnnx.pt fp16=0``, then ``pnnx``
+  won't convert ``float32`` to ``float16``.
+
+4. Test the exported models in icefall
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. note::
+
+  We assume you have set up the environment variable ``PYTHONPATH`` when
+  building `ncnn`_.
+
+Now we have successfully converted our pre-trained model to `ncnn`_ format.
+The generated 6 files are what we need. You can use the following code to
+test the converted models:
+
+.. code-block:: bash
+
+  ./conv_emformer_transducer_stateless2/streaming-ncnn-decode.py \
+    --tokens ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/data/lang_bpe_500/tokens.txt \
+    --encoder-param-filename ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.ncnn.param \
+    --encoder-bin-filename ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.ncnn.bin \
+    --decoder-param-filename ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.ncnn.param \
+    --decoder-bin-filename ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.ncnn.bin \
+    --joiner-param-filename ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.ncnn.param \
+    --joiner-bin-filename ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.ncnn.bin \
+    ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/test_wavs/1089-134686-0001.wav
+
+.. hint::
+
+  `ncnn`_ supports only ``batch size == 1``, so ``streaming-ncnn-decode.py`` accepts
+  only 1 wave file as input.
+
+The output is given below:
+
+.. literalinclude:: ./code/test-stremaing-ncnn-decode-conv-emformer-transducer-libri.txt
+
+Congratulations! You have successfully exported a model from PyTorch to `ncnn`_!
+
+
+.. _conv-emformer-modify-the-exported-encoder-for-sherpa-ncnn:
+
+5. Modify the exported encoder for sherpa-ncnn
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In order to use the exported models in `sherpa-ncnn`_, we have to modify
+``encoder_jit_trace-pnnx.ncnn.param``.
+
+Let us have a look at the first few lines of ``encoder_jit_trace-pnnx.ncnn.param``:
+
+.. code-block::
+
+  7767517
+  1060 1342
+  Input                    in0                      0 1 in0
+
+**Explanation** of the above three lines:
+
+  1. ``7767517``, it is a magic number and should not be changed.
+  2. ``1060 1342``, the first number ``1060`` specifies the number of layers
+     in this file, while ``1342`` specifies the number of intermediate outputs
+     of this file
+  3. ``Input in0 0 1 in0``, ``Input`` is the layer type of this layer; ``in0``
+     is the layer name of this layer; ``0`` means this layer has no input;
+     ``1`` means this layer has one output; ``in0`` is the output name of
+     this layer.
+
+We need to add 1 extra line and also increment the number of layers.
+The result looks like below:
+
+.. code-block:: bash
+
+  7767517
+  1061 1342
+  SherpaMetaData           sherpa_meta_data1        0 0 0=1 1=12 2=32 3=31 4=8 5=32 6=8 7=512
+  Input                    in0                      0 1 in0
+
+**Explanation**
+
+  1. ``7767517``, it is still the same
+  2. ``1061 1342``, we have added an extra layer, so we need to update ``1060`` to ``1061``.
+     We don't need to change ``1342`` since the newly added layer has no inputs or outputs.
+  3. ``SherpaMetaData  sherpa_meta_data1  0 0 0=1 1=12 2=32 3=31 4=8 5=32 6=8 7=512``
+     This line is newly added. Its explanation is given below:
+
+      - ``SherpaMetaData`` is the type of this layer. Must be ``SherpaMetaData``.
+      - ``sherpa_meta_data1`` is the name of this layer. Must be ``sherpa_meta_data1``.
+      - ``0 0`` means this layer has no inputs or outputs. Must be ``0 0``.
+      - ``0=1``, 0 is the key and 1 is the value. MUST be ``0=1``
+      - ``1=12``, 1 is the key and 12 is the value of the
+        parameter ``--num-encoder-layers`` that you provided when running
+        ``conv_emformer_transducer_stateless2/export-for-ncnn.py``.
+      - ``2=32``, 2 is the key and 32 is the value of the
+        parameter ``--memory-size`` that you provided when running
+        ``conv_emformer_transducer_stateless2/export-for-ncnn.py``.
+      - ``3=31``, 3 is the key and 31 is the value of the
+        parameter ``--cnn-module-kernel`` that you provided when running
+        ``conv_emformer_transducer_stateless2/export-for-ncnn.py``.
+      - ``4=8``, 4 is the key and 8 is the value of the
+        parameter ``--left-context-length`` that you provided when running
+        ``conv_emformer_transducer_stateless2/export-for-ncnn.py``.
+      - ``5=32``, 5 is the key and 32 is the value of the
+        parameter ``--chunk-length`` that you provided when running
+        ``conv_emformer_transducer_stateless2/export-for-ncnn.py``.
+      - ``6=8``, 6 is the key and 8 is the value of the
+        parameter ``--right-context-length`` that you provided when running
+        ``conv_emformer_transducer_stateless2/export-for-ncnn.py``.
+      - ``7=512``, 7 is the key and 512 is the value of the
+        parameter ``--encoder-dim`` that you provided when running
+        ``conv_emformer_transducer_stateless2/export-for-ncnn.py``.
+
+      For ease of reference, we list the key-value pairs that you need to add
+      in the following table. If your model has a different setting, please
+      change the values for ``SherpaMetaData`` accordingly. Otherwise, you
+      will be ``SAD``.
+
+          +------+-----------------------------+
+          | key  | value                       |
+          +======+=============================+
+          | 0    | 1 (fixed)                   |
+          +------+-----------------------------+
+          | 1    | ``--num-encoder-layers``    |
+          +------+-----------------------------+
+          | 2    | ``--memory-size``           |
+          +------+-----------------------------+
+          | 3    | ``--cnn-module-kernel``     |
+          +------+-----------------------------+
+          | 4    | ``--left-context-length``   |
+          +------+-----------------------------+
+          | 5    | ``--chunk-length``          |
+          +------+-----------------------------+
+          | 6    | ``--right-context-length``  |
+          +------+-----------------------------+
+          | 7    | ``--encoder-dim``           |
+          +------+-----------------------------+
+
+  4. ``Input in0 0 1 in0``. No need to change it.
+
+.. caution::
+
+  When you add a new layer ``SherpaMetaData``, please remember to update the
+  number of layers. In our case, update  ``1060`` to ``1061``. Otherwise,
+  you will be SAD later.
+
+.. hint::
+
+  After adding the new layer ``SherpaMetaData``, you cannot use this model
+  with ``streaming-ncnn-decode.py`` anymore since ``SherpaMetaData`` is
+  supported only in `sherpa-ncnn`_.
+
+.. hint::
+
+  `ncnn`_ is very flexible. You can add new layers to it just by text-editing
+  the ``param`` file! You don't need to change the ``bin`` file.
+
+Now you can use this model in `sherpa-ncnn`_.
+Please refer to the following documentation:
+
+  - Linux/macOS/Windows/arm/aarch64: `<https://k2-fsa.github.io/sherpa/ncnn/install/index.html>`_
+  - Android: `<https://k2-fsa.github.io/sherpa/ncnn/android/index.html>`_
+  - Python: `<https://k2-fsa.github.io/sherpa/ncnn/python/index.html>`_
+
+We have a list of pre-trained models that have been exported for `sherpa-ncnn`_:
+
+  - `<https://k2-fsa.github.io/sherpa/ncnn/pretrained_models/index.html>`_
+
+    You can find more usages there.
+
+6. (Optional) int8 quantization with sherpa-ncnn
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+This step is optional.
+
+In this step, we describe how to quantize our model with ``int8``.
+
+Change :ref:`conv-emformer-step-3-export-torchscript-model-via-pnnx` to
+disable ``fp16`` when using ``pnnx``:
+
+.. code-block::
+
+  cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/
+
+  pnnx ./encoder_jit_trace-pnnx.pt fp16=0
+  pnnx ./decoder_jit_trace-pnnx.pt
+  pnnx ./joiner_jit_trace-pnnx.pt fp16=0
+
+.. note::
+
+  We add ``fp16=0`` when exporting the encoder and joiner. `ncnn`_ does not
+  support quantizing the decoder model yet. We will update this documentation
+  once `ncnn`_ supports it. (Maybe in this year, 2023).
+
+It will generate the following files:
+
+.. code-block:: bash
+
+  ls -lh icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/*_jit_trace-pnnx.ncnn.{param,bin}
+
+  -rw-r--r-- 1 kuangfangjun root 503K Jan 11 15:56 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.ncnn.bin
+  -rw-r--r-- 1 kuangfangjun root  437 Jan 11 15:56 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.ncnn.param
+  -rw-r--r-- 1 kuangfangjun root 283M Jan 11 15:56 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.ncnn.bin
+  -rw-r--r-- 1 kuangfangjun root  79K Jan 11 15:56 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.ncnn.param
+  -rw-r--r-- 1 kuangfangjun root 3.0M Jan 11 15:56 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.ncnn.bin
+  -rw-r--r-- 1 kuangfangjun root  488 Jan 11 15:56 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.ncnn.param
+
+Let us compare again the file sizes:
+
++----------------------------------------+------------+
+| File name                              | File size  |
++========================================+============+
+| encoder_jit_trace-pnnx.pt              | 283 MB     |
++----------------------------------------+------------+
+| decoder_jit_trace-pnnx.pt              | 1010 KB    |
++----------------------------------------+------------+
+| joiner_jit_trace-pnnx.pt               | 3.0 MB     |
++----------------------------------------+------------+
+| encoder_jit_trace-pnnx.ncnn.bin (fp16) | 142 MB     |
++----------------------------------------+------------+
+| decoder_jit_trace-pnnx.ncnn.bin (fp16) | 503 KB     |
++----------------------------------------+------------+
+| joiner_jit_trace-pnnx.ncnn.bin  (fp16) | 1.5 MB     |
++----------------------------------------+------------+
+| encoder_jit_trace-pnnx.ncnn.bin (fp32) | 283 MB     |
++----------------------------------------+------------+
+| joiner_jit_trace-pnnx.ncnn.bin  (fp32) | 3.0 MB     |
++----------------------------------------+------------+
+
+You can see that the file sizes are doubled when we disable ``fp16``.
+
+.. note::
+
+  You can again use ``streaming-ncnn-decode.py`` to test the exported models.
+
+Next, follow :ref:`conv-emformer-modify-the-exported-encoder-for-sherpa-ncnn`
+to modify ``encoder_jit_trace-pnnx.ncnn.param``.
+
+Change
+
+.. code-block:: bash
+
+  7767517
+  1060 1342
+  Input                    in0                      0 1 in0
+
+to
+
+.. code-block:: bash
+
+  7767517
+  1061 1342
+  SherpaMetaData           sherpa_meta_data1        0 0 0=1 1=12 2=32 3=31 4=8 5=32 6=8 7=512
+  Input                    in0                      0 1 in0
+
+.. caution::
+
+  Please follow :ref:`conv-emformer-modify-the-exported-encoder-for-sherpa-ncnn`
+  to change the values for ``SherpaMetaData`` if your model uses a different setting.
+
+
+Next, let us compile `sherpa-ncnn`_ since we will quantize our models within
+`sherpa-ncnn`_.
+
+.. code-block:: bash
+
+  # We will download sherpa-ncnn to $HOME/open-source/
+  # You can change it to anywhere you like.
+  cd $HOME
+  mkdir -p open-source
+
+  cd open-source
+  git clone https://github.com/k2-fsa/sherpa-ncnn
+  cd sherpa-ncnn
+  mkdir build
+  cd build
+  cmake ..
+  make -j 4
+
+  ./bin/generate-int8-scale-table
+
+  export PATH=$HOME/open-source/sherpa-ncnn/build/bin:$PATH
+
+The output of the above commands is:
+
+.. code-block:: bash
+
+  (py38) kuangfangjun:build$ generate-int8-scale-table
+  Please provide 10 arg. Currently given: 1
+  Usage:
+  generate-int8-scale-table encoder.param encoder.bin decoder.param decoder.bin joiner.param joiner.bin encoder-scale-table.txt joiner-scale-table.txt wave_filenames.txt
+
+  Each line in wave_filenames.txt is a path to some 16k Hz mono wave file.
+
+We need to create a file ``wave_filenames.txt``, in which we need to put
+some calibration wave files. For testing purposes, we use the ``test_wavs``
+from the pre-trained model repository `<https://huggingface.co/Zengwei/icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05>`_:
+
+.. code-block:: bash
+
+  cd egs/librispeech/ASR
+  cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/
+
+  cat <<EOF > wave_filenames.txt
+  ../test_wavs/1089-134686-0001.wav
+  ../test_wavs/1221-135766-0001.wav
+  ../test_wavs/1221-135766-0002.wav
+  EOF
+
+Now we can calculate the scales needed for quantization with the calibration data:
+
+.. code-block:: bash
+
+  cd egs/librispeech/ASR
+  cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/
+
+  generate-int8-scale-table \
+    ./encoder_jit_trace-pnnx.ncnn.param \
+    ./encoder_jit_trace-pnnx.ncnn.bin \
+    ./decoder_jit_trace-pnnx.ncnn.param \
+    ./decoder_jit_trace-pnnx.ncnn.bin \
+    ./joiner_jit_trace-pnnx.ncnn.param \
+    ./joiner_jit_trace-pnnx.ncnn.bin \
+    ./encoder-scale-table.txt \
+    ./joiner-scale-table.txt \
+    ./wave_filenames.txt
+
+The output logs are given below:
+
+.. literalinclude:: ./code/generate-int-8-scale-table-for-conv-emformer.txt
+
+It generates the following two files:
+
+.. code-block:: bash
+
+  $ ls -lh encoder-scale-table.txt joiner-scale-table.txt
+  -rw-r--r-- 1 kuangfangjun root 955K Jan 11 17:28 encoder-scale-table.txt
+  -rw-r--r-- 1 kuangfangjun root  18K Jan 11 17:28 joiner-scale-table.txt
+
+.. caution::
+
+  Definitely, you need more calibration data to compute the scale table.
+
+Finally, let us use the scale table to quantize our models into ``int8``.
+
+.. code-block:: bash
+
+  ncnn2int8
+
+  usage: ncnn2int8 [inparam] [inbin] [outparam] [outbin] [calibration table]
+
+First, we quantize the encoder model:
+
+.. code-block:: bash
+
+  cd egs/librispeech/ASR
+  cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/
+
+  ncnn2int8 \
+    ./encoder_jit_trace-pnnx.ncnn.param \
+    ./encoder_jit_trace-pnnx.ncnn.bin \
+    ./encoder_jit_trace-pnnx.ncnn.int8.param \
+    ./encoder_jit_trace-pnnx.ncnn.int8.bin \
+    ./encoder-scale-table.txt
+
+Next, we quantize the joiner model:
+
+.. code-block:: bash
+
+  ncnn2int8 \
+    ./joiner_jit_trace-pnnx.ncnn.param \
+    ./joiner_jit_trace-pnnx.ncnn.bin \
+    ./joiner_jit_trace-pnnx.ncnn.int8.param \
+    ./joiner_jit_trace-pnnx.ncnn.int8.bin \
+    ./joiner-scale-table.txt
+
+The above two commands generate the following 4 files:
+
+.. code-block:: bash
+
+  -rw-r--r-- 1 kuangfangjun root  99M Jan 11 17:34 encoder_jit_trace-pnnx.ncnn.int8.bin
+  -rw-r--r-- 1 kuangfangjun root  78K Jan 11 17:34 encoder_jit_trace-pnnx.ncnn.int8.param
+  -rw-r--r-- 1 kuangfangjun root 774K Jan 11 17:35 joiner_jit_trace-pnnx.ncnn.int8.bin
+  -rw-r--r-- 1 kuangfangjun root  496 Jan 11 17:35 joiner_jit_trace-pnnx.ncnn.int8.param
+
+Congratulations! You have successfully quantized your model from ``float32`` to ``int8``.
+
+.. caution::
+
+  ``ncnn.int8.param`` and ``ncnn.int8.bin`` must be used in pairs.
+
+  You can replace ``ncnn.param`` and ``ncnn.bin`` with ``ncnn.int8.param``
+  and ``ncnn.int8.bin`` in `sherpa-ncnn`_ if you like.
+
+  For instance, to use only the ``int8`` encoder in ``sherpa-ncnn``, you can
+  replace the following invocation:
+
+    .. code-block::
+
+      cd egs/librispeech/ASR
+      cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/
+
+      sherpa-ncnn \
+        ../data/lang_bpe_500/tokens.txt \
+        ./encoder_jit_trace-pnnx.ncnn.param \
+        ./encoder_jit_trace-pnnx.ncnn.bin \
+        ./decoder_jit_trace-pnnx.ncnn.param \
+        ./decoder_jit_trace-pnnx.ncnn.bin \
+        ./joiner_jit_trace-pnnx.ncnn.param \
+        ./joiner_jit_trace-pnnx.ncnn.bin \
+        ../test_wavs/1089-134686-0001.wav
+
+  with
+
+    .. code-block::
+
+      cd egs/librispeech/ASR
+      cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/
+
+      sherpa-ncnn \
+        ../data/lang_bpe_500/tokens.txt \
+        ./encoder_jit_trace-pnnx.ncnn.int8.param \
+        ./encoder_jit_trace-pnnx.ncnn.int8.bin \
+        ./decoder_jit_trace-pnnx.ncnn.param \
+        ./decoder_jit_trace-pnnx.ncnn.bin \
+        ./joiner_jit_trace-pnnx.ncnn.param \
+        ./joiner_jit_trace-pnnx.ncnn.bin \
+        ../test_wavs/1089-134686-0001.wav
+
+
+The following table compares again the file sizes:
+
+
++----------------------------------------+------------+
+| File name                              | File size  |
++========================================+============+
+| encoder_jit_trace-pnnx.pt              | 283 MB     |
++----------------------------------------+------------+
+| decoder_jit_trace-pnnx.pt              | 1010 KB    |
++----------------------------------------+------------+
+| joiner_jit_trace-pnnx.pt               | 3.0 MB     |
++----------------------------------------+------------+
+| encoder_jit_trace-pnnx.ncnn.bin (fp16) | 142 MB     |
++----------------------------------------+------------+
+| decoder_jit_trace-pnnx.ncnn.bin (fp16) | 503 KB     |
++----------------------------------------+------------+
+| joiner_jit_trace-pnnx.ncnn.bin  (fp16) | 1.5 MB     |
++----------------------------------------+------------+
+| encoder_jit_trace-pnnx.ncnn.bin (fp32) | 283 MB     |
++----------------------------------------+------------+
+| joiner_jit_trace-pnnx.ncnn.bin  (fp32) | 3.0 MB     |
++----------------------------------------+------------+
+| encoder_jit_trace-pnnx.ncnn.int8.bin   | 99 MB      |
++----------------------------------------+------------+
+| joiner_jit_trace-pnnx.ncnn.int8.bin    | 774 KB     |
++----------------------------------------+------------+
+
+You can see that the file sizes of the model after ``int8`` quantization
+are much smaller.
+
+.. hint::
+
+    Currently, only linear layers and convolutional layers are quantized
+    with ``int8``, so you don't see an exact ``4x`` reduction in file sizes.
+
+.. note::
+
+  You need to test the recognition accuracy after ``int8`` quantization.
+
+You can find the speed comparison at `<https://github.com/k2-fsa/sherpa-ncnn/issues/44>`_.
+
+
+That's it! Have fun with `sherpa-ncnn`_!
diff --git a/docs/source/recipes/Non-streaming-ASR/librispeech/distillation.rst b/docs/source/recipes/Non-streaming-ASR/librispeech/distillation.rst
new file mode 100644
index 000000000..ea9f350cd
--- /dev/null
+++ b/docs/source/recipes/Non-streaming-ASR/librispeech/distillation.rst
@@ -0,0 +1,223 @@
+Distillation with HuBERT
+========================
+
+This tutorial shows you how to perform knowledge distillation in `icefall`_
+with the `LibriSpeech`_ dataset. The distillation method
+used here is called "Multi Vector Quantization Knowledge Distillation" (MVQ-KD).
+Please have a look at our paper `Predicting Multi-Codebook Vector Quantization Indexes for Knowledge Distillation <https://arxiv.org/abs/2211.00508>`_
+for more details about MVQ-KD.
+
+.. note::
+
+    This tutorial is based on recipe
+    `pruned_transducer_stateless4 <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless4>`_.
+    Currently, we only implement MVQ-KD in this recipe. However, MVQ-KD is theoretically applicable to all recipes
+    with only minor changes needed. Feel free to try out MVQ-KD in different recipes. If you
+    encounter any problems, please open an issue here `icefall <https://github.com/k2-fsa/icefall/issues>`_.
+
+.. note::
+
+  We assume you have read the page :ref:`install icefall` and have set up
+  the environment for `icefall`_.
+
+.. HINT::
+
+  We recommend that you use a GPU or several GPUs to run this recipe.
+
+Data preparation
+----------------
+
+We first prepare necessary training data for `LibriSpeech`_.
+This is the same as in :ref:`non_streaming_librispeech_pruned_transducer_stateless`.
+
+.. hint::
+
+   The data preparation is the same as in other recipes on the LibriSpeech dataset.
+   If you have finished this step, you can skip to :ref:`codebook_index_preparation` directly.
+
+.. code-block:: bash
+
+  $ cd egs/librispeech/ASR
+  $ ./prepare.sh
+
+The script ``./prepare.sh`` handles the data preparation for you, **automagically**.
+All you need to do is to run it.
+
+The data preparation contains several stages, you can use the following two
+options:
+
+  - ``--stage``
+  - ``--stop-stage``
+
+to control which stage(s) should be run. By default, all stages are executed.
+
+For example,
+
+.. code-block:: bash
+
+  $ cd egs/librispeech/ASR
+  $ ./prepare.sh --stage 0 --stop-stage 0 # run only stage 0
+  $ ./prepare.sh --stage 2 --stop-stage 5 # run from stage 2 to stage 5
+
+.. HINT::
+
+  If you have pre-downloaded the `LibriSpeech`_
+  dataset and the `musan`_ dataset, say,
+  they are saved in ``/tmp/LibriSpeech`` and ``/tmp/musan``, you can modify
+  the ``dl_dir`` variable in ``./prepare.sh`` to point to ``/tmp`` so that
+  ``./prepare.sh`` won't re-download them.
+
+.. NOTE::
+
+  All files generated by ``./prepare.sh``, e.g., features, lexicon, etc,
+  are saved in ``./data`` directory.
+
+We provide the following YouTube video showing how to run ``./prepare.sh``.
+
+.. note::
+
+   To get the latest news of `next-gen Kaldi <https://github.com/k2-fsa>`_, please subscribe to
+   the following YouTube channel by `Nadira Povey <https://www.youtube.com/channel/UC_VaumpkmINz1pNkFXAN9mw>`_:
+
+      `<https://www.youtube.com/channel/UC_VaumpkmINz1pNkFXAN9mw>`_
+
+..  youtube:: ofEIoJL-mGM
+
+
+.. _codebook_index_preparation:
+
+Codebook index preparation
+--------------------------
+
+Here, we prepare necessary data for MVQ-KD. This requires the generation
+of codebook indexes (please read our `paper <https://arxiv.org/abs/2211.00508>`_
+if you are interested in details). In this tutorial, we use the pre-computed
+codebook indexes for convenience. The only thing you need to do is to
+run `./distillation_with_hubert.sh <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/distillation_with_hubert.sh>`_.
+
+.. note::
+
+  There are 5 stages in total. The first and second stages will be automatically skipped
+  when choosing to download the codebook indexes prepared by `icefall`_.
+  Of course, you can extract and compute the codebook indexes by yourself. This
+  will require you to download a HuBERT-XL model and it can take a while for
+  the extraction of codebook indexes.
+
+
+As usual, you can control the stages you want to run by specifying the following
+two options:
+
+  - ``--stage``
+  - ``--stop-stage``
+
+For example,
+
+.. code-block:: bash
+
+  $ cd egs/librispeech/ASR
+  $ ./distillation_with_hubert.sh --stage 0 --stop-stage 0 # run only stage 0
+  $ ./distillation_with_hubert.sh --stage 2 --stop-stage 4 # run from stage 2 to stage 4
+
+Here are a few options in `./distillation_with_hubert.sh <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/distillation_with_hubert.sh>`_
+you need to know before you proceed.
+
+- ``--full_libri`` If True, use full 960h data. Otherwise only ``train-clean-100`` will be used
+- ``--use_extracted_codebook`` If True, the first two stages will be skipped and the codebook
+  indexes uploaded by us will be downloaded.
+
+Since we are using the pre-computed codebook indexes, we set
+``use_extracted_codebook=True``. If you want to do full `LibriSpeech`_
+experiments, please set ``full_libri=True``.
+
+The following command downloads the pre-computed codebook indexes
+and prepares MVQ-augmented training manifests.
+
+.. code-block:: bash
+
+  $ ./distillation_with_hubert.sh --stage 2 --stop-stage 2 # run only stage 2
+
+Please see the
+following screenshot for the output of an example execution.
+
+.. figure:: ./images/distillation_codebook.png
+  :width: 800
+  :alt: Downloading codebook indexes and preparing training manifest.
+  :align: center
+
+  Downloading codebook indexes and preparing training manifest.
+
+.. hint::
+
+  The codebook indexes we prepared for you in this tutorial
+  are extracted from the 36-th layer of a fine-tuned HuBERT-XL model
+  with 8 codebooks. If you want to try other configurations, please
+  set ``use_extracted_codebook=False`` and set ``embedding_layer`` and
+  ``num_codebooks`` by yourself.
+
+Now, you should see the following files under the directory ``./data/vq_fbank_layer36_cb8``.
+
+.. figure:: ./images/distillation_directory.png
+  :width: 800
+  :alt: MVQ-augmented training manifests
+  :align: center
+
+  MVQ-augmented training manifests.
+
+Voilà! You are ready to perform knowledge distillation training now!
+
+Training
+--------
+
+To perform training, please run stage 3 by executing the following command.
+
+.. code-block:: bash
+
+  $ ./distillation_with_hubert.sh --stage 3 --stop-stage 3 # run MVQ training
+
+Here is the code snippet for training:
+
+.. code-block:: bash
+
+  WORLD_SIZE=$(echo ${CUDA_VISIBLE_DEVICES} | awk '{n=split($1, _, ","); print n}')
+
+  ./pruned_transducer_stateless6/train.py \
+    --manifest-dir ./data/vq_fbank_layer36_cb8 \
+    --master-port 12359 \
+    --full-libri $full_libri \
+    --spec-aug-time-warp-factor -1 \
+    --max-duration 300 \
+    --world-size ${WORLD_SIZE} \
+    --num-epochs 30 \
+    --exp-dir $exp_dir \
+    --enable-distillation True \
+    --codebook-loss-scale 0.01
+
+There are a few arguments in the above training
+command that deserve attention.
+
+  - ``--enable-distillation`` If True, knowledge distillation training is enabled.
+  - ``--codebook-loss-scale`` The scale of the knowledge distillation loss.
+  - ``--manifest-dir`` The path to the MVQ-augmented manifest.
+
+
+Decoding
+--------
+
+After training has finished, you can test the performance using
+the following command.
+
+.. code-block:: bash
+
+  export CUDA_VISIBLE_DEVICES=0
+  ./pruned_transducer_stateless6/decode.py \
+    --decoding-method "modified_beam_search" \
+    --epoch 30 \
+    --avg 10 \
+    --max-duration 200 \
+    --exp-dir $exp_dir \
+    --enable-distillation True
+
+You should get results similar to those reported `here <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/RESULTS-100hours.md#distillation-with-hubert>`_.
+
+That's all! Feel free to experiment with your own setups and report your results.
+If you encounter any problems during training, please open up an issue `here <https://github.com/k2-fsa/icefall/issues>`_.
diff --git a/docs/source/recipes/Non-streaming-ASR/librispeech/images/distillation_codebook.png b/docs/source/recipes/Non-streaming-ASR/librispeech/images/distillation_codebook.png
new file mode 100644
index 000000000..1a40d6c6e
Binary files /dev/null and b/docs/source/recipes/Non-streaming-ASR/librispeech/images/distillation_codebook.png differ
diff --git a/docs/source/recipes/Non-streaming-ASR/librispeech/images/distillation_directory.png b/docs/source/recipes/Non-streaming-ASR/librispeech/images/distillation_directory.png
new file mode 100644
index 000000000..30763046f
Binary files /dev/null and b/docs/source/recipes/Non-streaming-ASR/librispeech/images/distillation_directory.png differ
diff --git a/docs/source/recipes/Non-streaming-ASR/librispeech/index.rst b/docs/source/recipes/Non-streaming-ASR/librispeech/index.rst
index 3ebb36b25..bf439861a 100644
--- a/docs/source/recipes/Non-streaming-ASR/librispeech/index.rst
+++ b/docs/source/recipes/Non-streaming-ASR/librispeech/index.rst
@@ -9,3 +9,4 @@ LibriSpeech
    pruned_transducer_stateless
    zipformer_mmi
    zipformer_ctc_blankskip
+   distillation
diff --git a/docs/source/recipes/Non-streaming-ASR/librispeech/pruned_transducer_stateless.rst b/docs/source/recipes/Non-streaming-ASR/librispeech/pruned_transducer_stateless.rst
index 86d43c8fe..42fd3df77 100644
--- a/docs/source/recipes/Non-streaming-ASR/librispeech/pruned_transducer_stateless.rst
+++ b/docs/source/recipes/Non-streaming-ASR/librispeech/pruned_transducer_stateless.rst
@@ -1,3 +1,5 @@
+.. _non_streaming_librispeech_pruned_transducer_stateless:
+
 Pruned transducer statelessX
 ============================
 
diff --git a/docs/source/recipes/Streaming-ASR/librispeech/lstm_pruned_stateless_transducer.rst b/docs/source/recipes/Streaming-ASR/librispeech/lstm_pruned_stateless_transducer.rst
index 643855cc2..ce8ba1453 100644
--- a/docs/source/recipes/Streaming-ASR/librispeech/lstm_pruned_stateless_transducer.rst
+++ b/docs/source/recipes/Streaming-ASR/librispeech/lstm_pruned_stateless_transducer.rst
@@ -515,10 +515,10 @@ To use the generated files with ``./lstm_transducer_stateless2/jit_pretrained``:
    Please see `<https://k2-fsa.github.io/sherpa/python/streaming_asr/lstm/english/server.html>`_
    for how to use the exported models in ``sherpa``.
 
-.. _export-model-for-ncnn:
+.. _export-lstm-transducer-model-for-ncnn:
 
-Export model for ncnn
-~~~~~~~~~~~~~~~~~~~~~
+Export LSTM transducer models for ncnn
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 We support exporting pretrained LSTM transducer models to
 `ncnn <https://github.com/tencent/ncnn>`_ using
@@ -531,16 +531,36 @@ First, let us install a modified version of ``ncnn``:
   git clone https://github.com/csukuangfj/ncnn
   cd ncnn
   git submodule update --recursive --init
-  python3 setup.py bdist_wheel
-  ls -lh dist/
-  pip install ./dist/*.whl
+
+  # Note: We don't use "python setup.py install" or "pip install ." here
+
+  mkdir -p build-wheel
+  cd build-wheel
+
+  cmake \
+    -DCMAKE_BUILD_TYPE=Release \
+    -DNCNN_PYTHON=ON \
+    -DNCNN_BUILD_BENCHMARK=OFF \
+    -DNCNN_BUILD_EXAMPLES=OFF \
+    -DNCNN_BUILD_TOOLS=ON \
+    ..
+
+  make -j4
+
+  cd ..
+
+  # Note: $PWD here is /path/to/ncnn
+
+  export PYTHONPATH=$PWD/python:$PYTHONPATH
+  export PATH=$PWD/tools/pnnx/build/src:$PATH
+  export PATH=$PWD/build-wheel/tools/quantize:$PATH
 
   # now build pnnx
   cd tools/pnnx
   mkdir build
   cd build
+  cmake ..
   make -j4
-  export PATH=$PWD/src:$PATH
 
   ./src/pnnx
 
@@ -549,6 +569,9 @@ First, let us install a modified version of ``ncnn``:
    We assume that you have added the path to the binary ``pnnx`` to the
    environment variable ``PATH``.
 
+   We also assume that you have added ``build-wheel/tools/quantize`` to the
+   environment variable ``PATH`` so that you are able to use ``ncnn2int8`` later.
+
 Second, let us export the model using ``torch.jit.trace()`` that is suitable
 for ``pnnx``:
 
@@ -634,3 +657,6 @@ by visiting the following links:
 
 You can find more usages of the pretrained models in
 `<https://k2-fsa.github.io/sherpa/python/streaming_asr/lstm/index.html>`_
+
+Export ConvEmformer transducer models for ncnn
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/egs/librispeech/ASR/conv_emformer_transducer_stateless2/emformer2.py b/egs/librispeech/ASR/conv_emformer_transducer_stateless2/emformer2.py
index 188059044..f0c92a9b4 100644
--- a/egs/librispeech/ASR/conv_emformer_transducer_stateless2/emformer2.py
+++ b/egs/librispeech/ASR/conv_emformer_transducer_stateless2/emformer2.py
@@ -1512,24 +1512,6 @@ class EmformerEncoder(nn.Module):
             )
         return states
 
-        attn_caches = [
-            [
-                torch.zeros(self.memory_size, self.d_model, device=device),
-                torch.zeros(self.left_context_length, self.d_model, device=device),
-                torch.zeros(self.left_context_length, self.d_model, device=device),
-            ]
-            for _ in range(self.num_encoder_layers)
-        ]
-        conv_caches = [
-            torch.zeros(self.d_model, self.cnn_module_kernel - 1, device=device)
-            for _ in range(self.num_encoder_layers)
-        ]
-        states: Tuple[List[List[torch.Tensor]], List[torch.Tensor]] = (
-            attn_caches,
-            conv_caches,
-        )
-        return states
-
 
 class Emformer(EncoderInterface):
     def __init__(
diff --git a/egs/librispeech/ASR/conv_emformer_transducer_stateless2/streaming-ncnn-decode.py b/egs/librispeech/ASR/conv_emformer_transducer_stateless2/streaming-ncnn-decode.py
index b21fe5c7e..e4104a5bb 100755
--- a/egs/librispeech/ASR/conv_emformer_transducer_stateless2/streaming-ncnn-decode.py
+++ b/egs/librispeech/ASR/conv_emformer_transducer_stateless2/streaming-ncnn-decode.py
@@ -131,6 +131,8 @@ class Model:
         encoder_net = ncnn.Net()
         encoder_net.opt.use_packing_layout = False
         encoder_net.opt.use_fp16_storage = False
+        encoder_net.opt.num_threads = 4
+
         encoder_param = args.encoder_param_filename
         encoder_model = args.encoder_bin_filename
 
@@ -144,6 +146,7 @@ class Model:
         decoder_model = args.decoder_bin_filename
 
         decoder_net = ncnn.Net()
+        decoder_net.opt.num_threads = 4
 
         decoder_net.load_param(decoder_param)
         decoder_net.load_model(decoder_model)
@@ -154,6 +157,8 @@ class Model:
         joiner_param = args.joiner_param_filename
         joiner_model = args.joiner_bin_filename
         joiner_net = ncnn.Net()
+        joiner_net.opt.num_threads = 4
+
         joiner_net.load_param(joiner_param)
         joiner_net.load_model(joiner_model)
 
@@ -176,7 +181,6 @@ class Model:
            - next_states, a list of tensors containing the next states
         """
         with self.encoder_net.create_extractor() as ex:
-            ex.set_num_threads(4)
             ex.input("in0", ncnn.Mat(x.numpy()).clone())
 
             # layer0 in2-in5
@@ -220,7 +224,6 @@ class Model:
         assert decoder_input.dtype == torch.int32
 
         with self.decoder_net.create_extractor() as ex:
-            ex.set_num_threads(4)
             ex.input("in0", ncnn.Mat(decoder_input.numpy()).clone())
             ret, ncnn_out0 = ex.extract("out0")
             assert ret == 0, ret
@@ -229,7 +232,6 @@ class Model:
 
     def run_joiner(self, encoder_out, decoder_out):
         with self.joiner_net.create_extractor() as ex:
-            ex.set_num_threads(4)
             ex.input("in0", ncnn.Mat(encoder_out.numpy()).clone())
             ex.input("in1", ncnn.Mat(decoder_out.numpy()).clone())
             ret, ncnn_out0 = ex.extract("out0")
diff --git a/egs/librispeech/ASR/distillation_with_hubert.sh b/egs/librispeech/ASR/distillation_with_hubert.sh
index a38cf590c..6aaa0333b 100755
--- a/egs/librispeech/ASR/distillation_with_hubert.sh
+++ b/egs/librispeech/ASR/distillation_with_hubert.sh
@@ -150,7 +150,7 @@ if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
     num_codebooks=8
 
     mkdir -p $exp_dir/vq
-    codebook_dir=$exp_dir/vq/${teacher_model_id}_layer${embedding_layer}_cb${num_codebooks}
+    codebook_dir=$exp_dir/vq/${teacher_model_id}
     mkdir -p codebook_dir
     codebook_download_dir=$exp_dir/download_codebook
     if [ -d $codebook_download_dir ]; then
@@ -180,9 +180,9 @@ if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
   ./pruned_transducer_stateless6/extract_codebook_index.py \
     --full-libri $full_libri \
     --exp-dir $exp_dir \
-    --embedding-layer 36 \
+    --embedding-layer $embedding_layer \
     --num-utts 1000 \
-    --num-codebooks 8 \
+    --num-codebooks $num_codebooks \
     --max-duration 100 \
     --teacher-model-id $teacher_model_id \
     --use-extracted-codebook $use_extracted_codebook
diff --git a/egs/librispeech/ASR/lstm_transducer_stateless2/ncnn-decode.py b/egs/librispeech/ASR/lstm_transducer_stateless2/ncnn-decode.py
index 3b471fa85..3bd1b0a09 100755
--- a/egs/librispeech/ASR/lstm_transducer_stateless2/ncnn-decode.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless2/ncnn-decode.py
@@ -104,6 +104,8 @@ class Model:
         encoder_net = ncnn.Net()
         encoder_net.opt.use_packing_layout = False
         encoder_net.opt.use_fp16_storage = False
+        encoder_net.opt.num_threads = 4
+
         encoder_param = args.encoder_param_filename
         encoder_model = args.encoder_bin_filename
 
@@ -118,6 +120,7 @@ class Model:
 
         decoder_net = ncnn.Net()
         decoder_net.opt.use_packing_layout = False
+        decoder_net.opt.num_threads = 4
 
         decoder_net.load_param(decoder_param)
         decoder_net.load_model(decoder_model)
@@ -129,6 +132,8 @@ class Model:
         joiner_model = args.joiner_bin_filename
         joiner_net = ncnn.Net()
         joiner_net.opt.use_packing_layout = False
+        joiner_net.opt.num_threads = 4
+
         joiner_net.load_param(joiner_param)
         joiner_net.load_model(joiner_model)
 
@@ -136,7 +141,6 @@ class Model:
 
     def run_encoder(self, x, states):
         with self.encoder_net.create_extractor() as ex:
-            ex.set_num_threads(10)
             ex.input("in0", ncnn.Mat(x.numpy()).clone())
             x_lens = torch.tensor([x.size(0)], dtype=torch.float32)
             ex.input("in1", ncnn.Mat(x_lens.numpy()).clone())
@@ -165,7 +169,6 @@ class Model:
         assert decoder_input.dtype == torch.int32
 
         with self.decoder_net.create_extractor() as ex:
-            ex.set_num_threads(10)
             ex.input("in0", ncnn.Mat(decoder_input.numpy()).clone())
             ret, ncnn_out0 = ex.extract("out0")
             assert ret == 0, ret
@@ -174,7 +177,6 @@ class Model:
 
     def run_joiner(self, encoder_out, decoder_out):
         with self.joiner_net.create_extractor() as ex:
-            ex.set_num_threads(10)
             ex.input("in0", ncnn.Mat(encoder_out.numpy()).clone())
             ex.input("in1", ncnn.Mat(decoder_out.numpy()).clone())
             ret, ncnn_out0 = ex.extract("out0")
diff --git a/egs/librispeech/ASR/lstm_transducer_stateless2/streaming-ncnn-decode.py b/egs/librispeech/ASR/lstm_transducer_stateless2/streaming-ncnn-decode.py
index baff15ea6..02ed16a8c 100755
--- a/egs/librispeech/ASR/lstm_transducer_stateless2/streaming-ncnn-decode.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless2/streaming-ncnn-decode.py
@@ -92,6 +92,8 @@ class Model:
         encoder_net = ncnn.Net()
         encoder_net.opt.use_packing_layout = False
         encoder_net.opt.use_fp16_storage = False
+        encoder_net.opt.num_threads = 4
+
         encoder_param = args.encoder_param_filename
         encoder_model = args.encoder_bin_filename
 
@@ -106,6 +108,7 @@ class Model:
 
         decoder_net = ncnn.Net()
         decoder_net.opt.use_packing_layout = False
+        decoder_net.opt.num_threads = 4
 
         decoder_net.load_param(decoder_param)
         decoder_net.load_model(decoder_model)
@@ -117,6 +120,8 @@ class Model:
         joiner_model = args.joiner_bin_filename
         joiner_net = ncnn.Net()
         joiner_net.opt.use_packing_layout = False
+        joiner_net.opt.num_threads = 4
+
         joiner_net.load_param(joiner_param)
         joiner_net.load_model(joiner_model)
 
@@ -124,7 +129,6 @@ class Model:
 
     def run_encoder(self, x, states):
         with self.encoder_net.create_extractor() as ex:
-            #  ex.set_num_threads(10)
             ex.input("in0", ncnn.Mat(x.numpy()).clone())
             x_lens = torch.tensor([x.size(0)], dtype=torch.float32)
             ex.input("in1", ncnn.Mat(x_lens.numpy()).clone())
@@ -153,7 +157,6 @@ class Model:
         assert decoder_input.dtype == torch.int32
 
         with self.decoder_net.create_extractor() as ex:
-            #  ex.set_num_threads(10)
             ex.input("in0", ncnn.Mat(decoder_input.numpy()).clone())
             ret, ncnn_out0 = ex.extract("out0")
             assert ret == 0, ret
@@ -162,7 +165,6 @@ class Model:
 
     def run_joiner(self, encoder_out, decoder_out):
         with self.joiner_net.create_extractor() as ex:
-            #  ex.set_num_threads(10)
             ex.input("in0", ncnn.Mat(encoder_out.numpy()).clone())
             ex.input("in1", ncnn.Mat(decoder_out.numpy()).clone())
             ret, ncnn_out0 = ex.extract("out0")
diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7_ctc/ctc_decode.py b/egs/librispeech/ASR/pruned_transducer_stateless7_ctc/ctc_decode.py
index 9c23e7d66..4b373e4c7 100755
--- a/egs/librispeech/ASR/pruned_transducer_stateless7_ctc/ctc_decode.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7_ctc/ctc_decode.py
@@ -44,7 +44,7 @@ Usage:
     --exp-dir ./pruned_transducer_stateless7_ctc/exp \
     --max-duration 600 \
     --hlg-scale 0.8 \
-    --decoding-method 1best
+    --decoding-method nbest
 
 (4) nbest-rescoring
 ./pruned_transducer_stateless7_ctc/ctc_decode.py \
diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7_ctc_bs/ctc_decode.py b/egs/librispeech/ASR/pruned_transducer_stateless7_ctc_bs/ctc_decode.py
index 0ef733226..f137485b2 100755
--- a/egs/librispeech/ASR/pruned_transducer_stateless7_ctc_bs/ctc_decode.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7_ctc_bs/ctc_decode.py
@@ -42,7 +42,7 @@ Usage:
     --exp-dir ./pruned_transducer_stateless7_ctc_bs/exp \
     --max-duration 600 \
     --hlg-scale 0.8 \
-    --decoding-method 1best
+    --decoding-method nbest
 (4) nbest-rescoring
 ./pruned_transducer_stateless7_ctc_bs/ctc_decode.py \
     --epoch 30 \