
Zipformer Transducer

This tutorial shows you how to run a streaming zipformer transducer model with the LibriSpeech dataset.

Note

The tutorial is suitable for pruned_transducer_stateless7_streaming.

Hint

We assume you have read the page Installation and have set up the environment for icefall.

Hint

We recommend using a GPU or several GPUs to run this recipe.

Hint

Please scroll down to the bottom of this page to find download links for pretrained models if you don’t want to train a model from scratch.


We use pruned RNN-T to compute the loss.

Note

You can find the paper about pruned RNN-T at the following address:

https://arxiv.org/abs/2206.13236


The transducer model consists of 3 parts:

  • Encoder, a.k.a. the transcription network. We use a Zipformer model (proposed by Daniel Povey).

  • Decoder, a.k.a. the prediction network. We use a stateless model consisting of nn.Embedding and nn.Conv1d.

  • Joiner, a.k.a. the joint network.

Caution

Contrary to conventional RNN-T models, we use a stateless decoder. That is, it has no recurrent connections.
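
To make this concrete, here is a minimal sketch of such a stateless decoder in PyTorch. The class name, vocabulary size, embedding dimension, and context size below are made up for illustration; the actual implementation lives in the recipe’s decoder module.

import torch
import torch.nn as nn


class StatelessDecoder(nn.Module):
    """Illustrative prediction network without recurrent connections: it only
    looks at the previous `context_size` predicted symbols."""

    def __init__(self, vocab_size: int = 500, embedding_dim: int = 512, context_size: int = 2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # A 1-D convolution over the last `context_size` symbols replaces the
        # recurrent state of a conventional prediction network.
        self.conv = nn.Conv1d(embedding_dim, embedding_dim, kernel_size=context_size)

    def forward(self, y: torch.Tensor) -> torch.Tensor:
        # y: (batch, context_size) previous symbol IDs
        emb = self.embedding(y).permute(0, 2, 1)   # (batch, embedding_dim, context_size)
        out = self.conv(emb)                       # (batch, embedding_dim, 1)
        return out.squeeze(-1)                     # (batch, embedding_dim)


decoder = StatelessDecoder()
prev_symbols = torch.tensor([[3, 7]])              # the two previous symbols of one utterance
print(decoder(prev_symbols).shape)                 # torch.Size([1, 512])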


Data preparation

Hint

The data preparation is the same as for other recipes on the LibriSpeech dataset. If you have finished this step, you can skip to Training directly.

$ cd egs/librispeech/ASR
$ ./prepare.sh

The script ./prepare.sh handles the data preparation for you, automagically. All you need to do is run it.

The data preparation contains several stages. You can use the following two options:

  • --stage

  • --stop-stage

to control which stage(s) should be run. By default, all stages are executed.

For example,

$ cd egs/librispeech/ASR
$ ./prepare.sh --stage 0 --stop-stage 0

means to run only stage 0.

To run stage 2 to stage 5, use:

$ ./prepare.sh --stage 2 --stop-stage 5

Hint

If you have pre-downloaded the LibriSpeech dataset and the musan dataset, say, they are saved in /tmp/LibriSpeech and /tmp/musan, you can modify the dl_dir variable in ./prepare.sh to point to /tmp so that ./prepare.sh won’t re-download them.


Note

All files generated by ./prepare.sh, e.g., features, lexicon, etc., are saved in the ./data directory.


We provide the following YouTube video showing how to run ./prepare.sh.

Note

To get the latest news about next-gen Kaldi, please subscribe to the following YouTube channel by Nadira Povey:


Training

Configurable options

$ cd egs/librispeech/ASR
$ ./pruned_transducer_stateless7_streaming/train.py --help

shows you the training options that can be passed from the command line. The following options are used quite often:

  • --exp-dir

    The directory to save checkpoints, training logs, and TensorBoard logs.

  • --full-libri

    If it’s True, the training part uses all the training data, i.e., 960 hours. Otherwise, the training part uses only the subset train-clean-100, which has 100 hours of training data.

    Caution

    The training set is speed-perturbed with two factors: 0.9 and 1.1. If --full-libri is True, each epoch actually processes 3x960 == 2880 hours of data.

  • --num-epochs

    It is the number of epochs to train. For instance, ./pruned_transducer_stateless7_streaming/train.py --num-epochs 30 trains for 30 epochs and generates epoch-1.pt, epoch-2.pt, …, epoch-30.pt in the folder ./pruned_transducer_stateless7_streaming/exp.

  • --start-epoch

    It’s used to resume training. ./pruned_transducer_stateless7_streaming/train.py --start-epoch 10 loads the checkpoint ./pruned_transducer_stateless7_streaming/exp/epoch-9.pt and starts training from epoch 10, based on the state from epoch 9.

  • --world-size


    It is used for multi-GPU single-machine DDP training.

      1. If it is 1, then no DDP training is used.

      2. If it is 2, then GPU 0 and GPU 1 are used for DDP training.

    The following shows some use cases with it.

    Use case 1: You have 4 GPUs, but you only want to use GPU 0 and GPU 2 for training. You can do the following:

    $ cd egs/librispeech/ASR
    $ export CUDA_VISIBLE_DEVICES="0,2"
    $ ./pruned_transducer_stateless7_streaming/train.py --world-size 2

    Use case 2: You have 4 GPUs and you want to use all of them for training. You can do the following:

    $ cd egs/librispeech/ASR
    $ ./pruned_transducer_stateless7_streaming/train.py --world-size 4

    Use case 3: You have 4 GPUs but you only want to use GPU 3 for training. You can do the following:

    $ cd egs/librispeech/ASR
    $ export CUDA_VISIBLE_DEVICES="3"
    $ ./pruned_transducer_stateless7_streaming/train.py --world-size 1

    Caution

    Only multi-GPU single-machine DDP training is implemented at present. Multi-GPU multi-machine DDP training will be added later.

  • --max-duration

    It specifies the number of seconds over all utterances in a batch, before padding. If you encounter CUDA OOM, please reduce it.

    Hint

    Due to padding, the number of seconds of all utterances in a batch will usually be larger than --max-duration.

    A larger value for --max-duration may cause OOM during training, while a smaller value may increase the training time. You have to tune it.

  • --use-fp16

    If it is True, the model is trained with half precision. From our experimental results, using half precision lets you train with a two times larger --max-duration and get an almost 2x speedup.

    We recommend using --use-fp16 True.

  • --short-chunk-size

    When training a streaming attention model with chunk masking, the chunk size is either the maximum sequence length of the current batch or uniformly sampled from (1, short_chunk_size). The default value is 50; you don’t have to change it most of the time.

  • --num-left-chunks

    It indicates how much left context (in chunks) can be seen when calculating attention. The default value is 4; you don’t have to change it most of the time. A sketch of how such a chunk mask can be built is shown after this list.

  • --decode-chunk-len

    The chunk size for decoding (in frames before subsampling). It is used for validation. The default value is 32, i.e., 320 ms (each frame corresponds to 10 ms of audio before subsampling).
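
For readers who want to see what chunk masking with a limited left context looks like, here is a minimal, self-contained sketch. The helper name and the way the mask would be consumed are assumptions for illustration only; the actual masking code in the recipe may differ.

import torch


def make_chunk_mask(seq_len: int, chunk_size: int, num_left_chunks: int) -> torch.Tensor:
    """Return a (seq_len, seq_len) boolean mask where entry (i, j) is True
    if query position i may attend to key position j.

    A position can see every position in its own chunk and in up to
    num_left_chunks chunks to its left (hypothetical helper, for illustration).
    """
    chunk_idx = torch.arange(seq_len) // chunk_size            # chunk index of each frame
    diff = chunk_idx.unsqueeze(1) - chunk_idx.unsqueeze(0)     # chunk(i) - chunk(j)
    return (diff >= 0) & (diff <= num_left_chunks)


# Example: 8 frames, chunks of 2 frames, 1 left chunk of context.
print(make_chunk_mask(seq_len=8, chunk_size=2, num_left_chunks=1).int())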

Pre-configured options

There are some training options, e.g., number of encoder layers, encoder dimension, decoder dimension, number of warmup steps, etc., that are not passed from the command line. They are pre-configured by the function get_params() in pruned_transducer_stateless7_streaming/train.py.

You don’t need to change these pre-configured parameters. If you really need to change them, please modify ./pruned_transducer_stateless7_streaming/train.py directly.


Training logs

Training logs and checkpoints are saved in --exp-dir (e.g., pruned_transducer_stateless7_streaming/exp). You will find the following files in that directory:

  • epoch-1.pt, epoch-2.pt, …

    These are checkpoint files saved at the end of each epoch, containing model state_dict and optimizer state_dict. To resume training from some checkpoint, say epoch-10.pt, you can use:

    $ ./pruned_transducer_stateless7_streaming/train.py --start-epoch 11

  • checkpoint-436000.pt, checkpoint-438000.pt, …

    These are checkpoint files saved every --save-every-n batches, containing model state_dict and optimizer state_dict. To resume training from some checkpoint, say checkpoint-436000, you can use:

    $ ./pruned_transducer_stateless7_streaming/train.py --start-batch 436000

  • tensorboard/

    This folder contains TensorBoard logs. Training loss, validation loss, learning rate, etc., are recorded in these logs. You can visualize them by:

    $ cd pruned_transducer_stateless7_streaming/exp/tensorboard
    $ tensorboard dev upload --logdir . --description "pruned transducer training for LibriSpeech with icefall"

Hint

If you don’t have access to Google, you can use the following command to view the TensorBoard logs locally:

cd pruned_transducer_stateless7_streaming/exp/tensorboard
tensorboard --logdir . --port 6008

It will print the following message:

Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.8.0 at http://localhost:6008/ (Press CTRL+C to quit)

Now start your browser and go to http://localhost:6008 to view the TensorBoard logs.

  • log/log-train-xxxx

    It is the detailed training log in text format, same as the one you saw printed to the console during training.

Usage example

You can use the following command to start the training using 4 GPUs:

export CUDA_VISIBLE_DEVICES="0,1,2,3"
./pruned_transducer_stateless7_streaming/train.py \
  --world-size 4 \
  --num-epochs 30 \
  --start-epoch 1 \
  --use-fp16 1 \
  --exp-dir pruned_transducer_stateless7_streaming/exp \
  --full-libri 1 \
  --max-duration 550

Decoding

The decoding part uses checkpoints saved by the training part, so you have to run the training part first.

Hint

There are two kinds of checkpoints:

  • (1) epoch-1.pt, epoch-2.pt, …, which are saved at the end of each epoch. You can pass --epoch to pruned_transducer_stateless7_streaming/decode.py to use them.

  • (2) checkpoint-436000.pt, checkpoint-438000.pt, …, which are saved every --save-every-n batches. You can pass --iter to pruned_transducer_stateless7_streaming/decode.py to use them.

We suggest that you try both types of checkpoints and choose the one that produces the lowest WERs.


Tip

To decode a streaming model, you can use either simulated streaming decoding in decode.py or real chunk-wise streaming decoding in streaming_decode.py. The difference between them is that decode.py processes all the acoustic frames at once with masking (i.e., the same as training), while streaming_decode.py processes the acoustic frames chunk by chunk.


Note

Simulated streaming decoding in decode.py and real chunk-wise streaming decoding in streaming_decode.py should produce almost the same results given the same --decode-chunk-len.


Simulate streaming decoding

$ cd egs/librispeech/ASR
$ ./pruned_transducer_stateless7_streaming/decode.py --help

shows the options for decoding. The following options are important for streaming models:

--decode-chunk-len

It is the same as in train.py; it specifies the chunk size for decoding (in frames before subsampling). The default value is 32 (i.e., 320 ms).


The following shows two examples (for the two types of checkpoints):

for m in greedy_search fast_beam_search modified_beam_search; do
  for epoch in 30; do
    for avg in 12 11 10 9 8; do
      ./pruned_transducer_stateless7_streaming/decode.py \
        --epoch $epoch \
        --avg $avg \
        --decode-chunk-len 32 \
        --exp-dir pruned_transducer_stateless7_streaming/exp \
        --max-duration 600 \
        --decoding-method $m
    done
  done
done

for m in greedy_search fast_beam_search modified_beam_search; do
  for iter in 474000; do
    for avg in 8 10 12 14 16 18; do
      ./pruned_transducer_stateless7_streaming/decode.py \
        --iter $iter \
        --avg $avg \
        --decode-chunk-len 32 \
        --exp-dir pruned_transducer_stateless7_streaming/exp \
        --max-duration 600 \
        --decoding-method $m
    done
  done
done

Real streaming decoding

$ cd egs/librispeech/ASR
$ ./pruned_transducer_stateless7_streaming/streaming_decode.py --help

shows the options for decoding. The following options are important for streaming models:

--decode-chunk-len

It is the same as in train.py; it specifies the chunk size for decoding (in frames before subsampling). The default value is 32 (i.e., 320 ms). For real streaming decoding, we process decode-chunk-len acoustic frames at a time.


--num-decode-streams

The number of decoding streams that can be run in parallel (very similar to the batch size). For real streaming decoding, the batches are packed dynamically. For example, if num-decode-streams equals 10, sequences 1 to 10 are decoded first; once, say, sequences 1 and 2 are done, sequences 3 to 12 are processed in parallel in a batch.
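
To make the dynamic packing concrete, here is a small, purely illustrative simulation of the scheduling idea. The variable names and the per-stream lengths are invented for the example; they do not reflect the actual implementation in streaming_decode.py.

from collections import deque

num_decode_streams = 3                 # decode at most 3 streams in parallel
waiting = deque(range(1, 8))           # sequence IDs 1..7 waiting to be decoded
lengths = {seq: 2 + seq % 3 for seq in waiting}  # made-up number of chunks per sequence
remaining = {}
active = []

step = 0
while waiting or active:
    # Whenever a slot frees up, pack a waiting sequence into the batch.
    while waiting and len(active) < num_decode_streams:
        seq = waiting.popleft()
        active.append(seq)
        remaining[seq] = lengths[seq]
    print(f"step {step}: decoding batch {active}")
    # Process one chunk for every active sequence; finished ones leave the batch.
    for seq in list(active):
        remaining[seq] -= 1
        if remaining[seq] == 0:
            active.remove(seq)
    step += 1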


The following shows two examples (for the two types of checkpoints):

for m in greedy_search fast_beam_search modified_beam_search; do
  for epoch in 30; do
    for avg in 12 11 10 9 8; do
      ./pruned_transducer_stateless7_streaming/streaming_decode.py \
        --epoch $epoch \
        --avg $avg \
        --decode-chunk-len 32 \
        --num-decode-streams 100 \
        --exp-dir pruned_transducer_stateless7_streaming/exp \
        --decoding-method $m
    done
  done
done

for m in greedy_search fast_beam_search modified_beam_search; do
  for iter in 474000; do
    for avg in 8 10 12 14 16 18; do
      ./pruned_transducer_stateless7_streaming/streaming_decode.py \
        --iter $iter \
        --avg $avg \
        --decode-chunk-len 16 \
        --num-decode-streams 100 \
        --exp-dir pruned_transducer_stateless7_streaming/exp \
        --decoding-method $m
    done
  done
done

Tip

The supported decoding methods are as follows:

  • greedy_search : It takes the symbol with the largest posterior probability at each frame as the decoding result. A minimal sketch of this procedure is shown after this list.

  • beam_search : It implements Algorithm 1 in https://arxiv.org/pdf/1211.3711.pdf, with espnet/nets/beam_search_transducer.py used as a reference. Basically, it keeps the top-k states for each frame and expands the kept states with their own contexts to the next frame.

  • modified_beam_search : It implements the same algorithm as beam_search above, but it runs in batch mode with --max-sym-per-frame=1 hardcoded.

  • fast_beam_search : It implements graph composition between the output log_probs and the given FSAs. It is hard to describe the details in a few lines of text; you can read our paper at https://arxiv.org/pdf/2211.00484.pdf or our RNN-T decoding code in k2. fast_beam_search can decode with FSAs on GPU efficiently.

  • fast_beam_search_LG : The same as fast_beam_search above, except that fast_beam_search uses a trivial graph that has only one state, while fast_beam_search_LG uses an LG graph (with an N-gram LM).

  • fast_beam_search_nbest : It produces the decoding results as follows:

      1. Use fast_beam_search to get a lattice.

      2. Select num_paths paths from the lattice using k2.random_paths().

      3. Unique the selected paths.

      4. Intersect the selected paths with the lattice and compute the shortest path from the intersection result.

      5. The path with the largest score is used as the decoding output.

  • fast_beam_search_nbest_LG : It implements the same logic as fast_beam_search_nbest; the only difference is that it uses fast_beam_search_LG to generate the lattice.

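Because greedy_search is the simplest of these methods, here is a rough, self-contained sketch of the idea for a single utterance. The encoder_out tensor and the decoder/joiner callables are stand-ins, and real greedy search may emit more than one symbol per frame; the recipe’s actual implementation should be taken as authoritative.

import torch


def greedy_search_sketch(decoder, joiner, encoder_out: torch.Tensor,
                         blank_id: int = 0, context_size: int = 2):
    """Illustrative greedy search for a transducer (at most one symbol per frame)."""
    hyp = [blank_id] * context_size                    # decoder context, initialised with blanks
    for t in range(encoder_out.size(0)):
        decoder_input = torch.tensor(hyp[-context_size:]).unsqueeze(0)
        decoder_out = decoder(decoder_input)                   # (1, decoder_dim)
        logits = joiner(encoder_out[t : t + 1], decoder_out)   # (1, vocab_size)
        y = logits.argmax(dim=-1).item()               # symbol with the largest posterior at frame t
        if y != blank_id:
            hyp.append(y)                              # emit only non-blank symbols
    return hyp[context_size:]                          # drop the initial blank context


# Toy usage with random stand-ins for the networks (shapes only, no real model):
T, enc_dim, dec_dim, vocab = 5, 8, 8, 10
W = torch.randn(enc_dim + dec_dim, vocab)
joiner = lambda enc, dec: torch.cat([enc, dec], dim=-1) @ W
decoder = lambda y: torch.randn(1, dec_dim)
print(greedy_search_sketch(decoder, joiner, torch.randn(T, enc_dim)))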

Note

The decoding methods supported in streaming_decode.py might be fewer than those in decode.py. If needed, you can implement them yourself or file an issue in icefall.


Export Model


Currently it supports exporting checkpoints from pruned_transducer_stateless7_streaming/exp in the following ways.


Export model.state_dict()

Checkpoints saved by pruned_transducer_stateless7_streaming/train.py also include optimizer.state_dict(). It is useful for resuming training. But after training, we are interested only in model.state_dict(). You can use the following command to extract model.state_dict().

# Assume that --epoch 30 --avg 9 produces the smallest WER
# (You can get such information after running ./pruned_transducer_stateless7_streaming/decode.py)

epoch=30
avg=9

./pruned_transducer_stateless7_streaming/export.py \
  --exp-dir ./pruned_transducer_stateless7_streaming/exp \
  --bpe-model data/lang_bpe_500/bpe.model \
  --epoch $epoch \
  --avg $avg \
  --use-averaged-model=True \
  --decode-chunk-len 32

It will generate a file ./pruned_transducer_stateless7_streaming/exp/pretrained.pt.


Hint

To use the generated pretrained.pt for pruned_transducer_stateless7_streaming/decode.py, you can run:

cd pruned_transducer_stateless7_streaming/exp
ln -s pretrained.pt epoch-999.pt

And then pass --epoch 999 --avg 1 --use-averaged-model 0 to ./pruned_transducer_stateless7_streaming/decode.py.

To use the exported model with ./pruned_transducer_stateless7_streaming/pretrained.py, you can run:

./pruned_transducer_stateless7_streaming/pretrained.py \
  --checkpoint ./pruned_transducer_stateless7_streaming/exp/pretrained.pt \
  --bpe-model ./data/lang_bpe_500/bpe.model \
  --method greedy_search \
  --decode-chunk-len 32 \
  /path/to/foo.wav \
  /path/to/bar.wav

Export model using torch.jit.script()

./pruned_transducer_stateless7_streaming/export.py \
  --exp-dir ./pruned_transducer_stateless7_streaming/exp \
  --bpe-model data/lang_bpe_500/bpe.model \
  --epoch 30 \
  --avg 9 \
  --decode-chunk-len 32 \
  --jit 1

Caution


--decode-chunk-len is required to export a ScriptModule.

It will generate a file cpu_jit.pt in the given exp_dir. You can later load it by torch.jit.load("cpu_jit.pt").

Note: cpu in the name cpu_jit.pt means that the parameters are on CPU when loaded into Python. You can use to("cuda") to move them to a CUDA device.
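
For example, a minimal snippet to load the exported ScriptModule outside of icefall might look like the following (the file path is the one generated above; moving the model to GPU is optional):

import torch

# Load the TorchScript model exported by export.py with --jit 1.
model = torch.jit.load("pruned_transducer_stateless7_streaming/exp/cpu_jit.pt")
model.eval()

# Optionally move the parameters to a CUDA device if one is available.
if torch.cuda.is_available():
    model = model.to("cuda")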


Export model using torch.jit.trace()

epoch=30
avg=9

./pruned_transducer_stateless7_streaming/jit_trace_export.py \
  --bpe-model data/lang_bpe_500/bpe.model \
  --use-averaged-model=True \
  --decode-chunk-len 32 \
  --exp-dir ./pruned_transducer_stateless7_streaming/exp \
  --epoch $epoch \
  --avg $avg

Caution


--decode-chunk-len is required to export a ScriptModule.


It will generate 3 files:

  • ./pruned_transducer_stateless7_streaming/exp/encoder_jit_trace.pt

  • ./pruned_transducer_stateless7_streaming/exp/decoder_jit_trace.pt

  • ./pruned_transducer_stateless7_streaming/exp/joiner_jit_trace.pt

To use the generated files with ./pruned_transducer_stateless7_streaming/jit_trace_pretrained.py:

./pruned_transducer_stateless7_streaming/jit_trace_pretrained.py \
  --encoder-model-filename ./pruned_transducer_stateless7_streaming/exp/encoder_jit_trace.pt \
  --decoder-model-filename ./pruned_transducer_stateless7_streaming/exp/decoder_jit_trace.pt \
  --joiner-model-filename ./pruned_transducer_stateless7_streaming/exp/joiner_jit_trace.pt \
  --bpe-model ./data/lang_bpe_500/bpe.model \
  --decode-chunk-len 32 \
  /path/to/foo.wav

Download pretrained models

If you don’t want to train from scratch, you can download the pretrained models by visiting the following links:


Deploy with Sherpa

Please see https://k2-fsa.github.io/sherpa/python/streaming_asr/conformer/index.html# for how to deploy the models in sherpa.
