diff --git a/_sources/docker/intro.rst.txt b/_sources/docker/intro.rst.txt
index 2f4bdb3f6..f3d2b0727 100644
--- a/_sources/docker/intro.rst.txt
+++ b/_sources/docker/intro.rst.txt
@@ -34,6 +34,8 @@ which will give you something like below:
.. code-block:: bash
+ "torch2.3.1-cuda12.1"
+ "torch2.3.1-cuda11.8"
"torch2.2.2-cuda12.1"
"torch2.2.2-cuda11.8"
"torch2.2.1-cuda12.1"
diff --git a/_sources/fst-based-forced-alignment/diff.rst.txt b/_sources/fst-based-forced-alignment/diff.rst.txt
new file mode 100644
index 000000000..56b6c430e
--- /dev/null
+++ b/_sources/fst-based-forced-alignment/diff.rst.txt
@@ -0,0 +1,41 @@
+Two approaches
+==============
+
+Two approaches for FST-based forced alignment will be described:
+
+ - `Kaldi`_-based
+ - `k2`_-based
+
+Note that the `Kaldi`_-based approach does not depend on `Kaldi`_ at all;
+you don't need to install `Kaldi`_ in order to use it. Instead, it relies on
+`kaldi-decoder`_, which ports the relevant C++ decoding code from `Kaldi`_
+so that no `Kaldi`_ installation is required.
+
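+The snippet below is a minimal availability check, not part of the original
+docs; it assumes the Python packages used later in this section
+(``kaldi_decoder`` and ``kaldifst``) are already installed:
+
+.. code-block:: python3
+
+    # Both imports are needed by the Kaldi-based approach described in this
+    # section; if either one fails, install the corresponding package first.
+    from kaldi_decoder import DecodableCtc, FasterDecoder, FasterDecoderOptions
+    import kaldifst
+
+    print("kaldi-decoder and kaldifst are available")
+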
+Differences between the two approaches
+--------------------------------------
+
+The following table summarizes the differences between the two approaches.
+
+.. list-table::
+
+ * - Features
+ - `Kaldi`_-based
+ - `k2`_-based
+ * - Support CUDA
+ - No
+ - Yes
+ * - Support CPU
+ - Yes
+ - Yes
+ * - Support batch processing
+ - No
+ - Yes on CUDA; No on CPU
+ * - Support streaming models
+ - Yes
+ - No
+ * - Support C++ APIs
+ - Yes
+ - Yes
+ * - Support Python APIs
+ - Yes
+ - Yes
diff --git a/_sources/fst-based-forced-alignment/index.rst.txt b/_sources/fst-based-forced-alignment/index.rst.txt
new file mode 100644
index 000000000..92a05faaa
--- /dev/null
+++ b/_sources/fst-based-forced-alignment/index.rst.txt
@@ -0,0 +1,18 @@
+FST-based forced alignment
+==========================
+
+This section describes how to perform **FST-based** ``forced alignment`` with models
+trained by `CTC`_ loss.
+
+We use the `CTC FORCED ALIGNMENT API TUTORIAL <https://pytorch.org/audio/main/tutorials/ctc_forced_alignment_api_tutorial.html>`_
+from `torchaudio`_ as a reference in this section.
+
+Unlike `torchaudio`_, we use an ``FST``-based approach.
+
+.. toctree::
+ :maxdepth: 2
+ :caption: Contents:
+
+ diff
+ kaldi-based
+ k2-based
diff --git a/_sources/fst-based-forced-alignment/k2-based.rst.txt b/_sources/fst-based-forced-alignment/k2-based.rst.txt
new file mode 100644
index 000000000..373e49f3e
--- /dev/null
+++ b/_sources/fst-based-forced-alignment/k2-based.rst.txt
@@ -0,0 +1,4 @@
+k2-based forced alignment
+=========================
+
+TODO(fangjun)
diff --git a/_sources/fst-based-forced-alignment/kaldi-based.rst.txt b/_sources/fst-based-forced-alignment/kaldi-based.rst.txt
new file mode 100644
index 000000000..69b6a665b
--- /dev/null
+++ b/_sources/fst-based-forced-alignment/kaldi-based.rst.txt
@@ -0,0 +1,712 @@
+Kaldi-based forced alignment
+============================
+
+This section describes in detail how to use `kaldi-decoder`_
+for **FST-based** ``forced alignment`` with models trained by `CTC`_ loss.
+
+.. hint::
+
+ We have a colab notebook walking you through this section step by step.
+
+ |kaldi-based forced alignment colab notebook|
+
+ .. |kaldi-based forced alignment colab notebook| image:: https://colab.research.google.com/assets/colab-badge.svg
+ :target: https://github.com/k2-fsa/colab/blob/master/icefall/ctc_forced_alignment_fst_based_kaldi.ipynb
+
+Prepare the environment
+-----------------------
+
+Before you continue, make sure you have set up `icefall`_ by following :ref:`install icefall`.
+
+.. hint::
+
+ You don't need to install `Kaldi`_. We will ``NOT`` use `Kaldi`_ below.
+
+Get the test data
+-----------------
+
+We use the test wave
+from the `CTC FORCED ALIGNMENT API TUTORIAL <https://pytorch.org/audio/main/tutorials/ctc_forced_alignment_api_tutorial.html>`_.
+
+.. code-block:: python3
+
+ import torchaudio
+
+ # Download test wave
+ speech_file = torchaudio.utils.download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav")
+ print(speech_file)
+ waveform, sr = torchaudio.load(speech_file)
+ transcript = "i had that curiosity beside me at this moment".split()
+ print(waveform.shape, sr)
+
+ assert waveform.ndim == 2
+ assert waveform.shape[0] == 1
+ assert sr == 16000
+
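+As a quick sanity check (not part of the original tutorial), the duration of
+the test wave can be computed from the values printed above:
+
+.. code-block:: python3
+
+    # Duration in seconds = number of samples / sampling rate.
+    duration = waveform.size(1) / sr
+    print(f"duration: {duration:.2f} s")
+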
+The test wave is downloaded to::
+
+ $HOME/.cache/torch/hub/torchaudio/tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav
+
+.. list-table::
+   :header-rows: 1
+
+   * - Wave filename
+     - Text
+   * - Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav
+     - i had that curiosity beside me at this moment
+
+We use the test model
+from the `CTC FORCED ALIGNMENT API TUTORIAL <https://pytorch.org/audio/main/tutorials/ctc_forced_alignment_api_tutorial.html>`_.
+
+.. code-block:: python3
+
+ import torch
+
+ bundle = torchaudio.pipelines.MMS_FA
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ model = bundle.get_model(with_star=False).to(device)
+
+The model is downloaded to::
+
+ $HOME/.cache/torch/hub/checkpoints/model.pt
+
+Compute log_probs
+-----------------
+
+.. code-block:: python3
+
+ with torch.inference_mode():
+ emission, _ = model(waveform.to(device))
+ print(emission.shape)
+
+It should print::
+
+ torch.Size([1, 169, 28])
+
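+The three dimensions are the batch size, the number of output frames, and the
+vocabulary size; the 28 entries correspond to the 27 characters the model
+predicts plus the CTC blank (compare the ``tokens.txt`` generated below). The
+snippet below is a small sketch, not part of the original tutorial, that
+relates output frames back to audio time:
+
+.. code-block:: python3
+
+    # emission has shape (batch, num_frames, vocab_size).
+    num_frames = emission.size(1)
+    # Audio duration covered by one output frame, derived from the data.
+    frame_sec = waveform.size(1) / num_frames / sr
+    print(f"{num_frames} frames, about {frame_sec:.3f} s of audio per frame")
+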
+Create token2id and id2token
+----------------------------
+
+.. code-block:: python3
+
+ token2id = bundle.get_dict(star=None)
+ id2token = {i:t for t, i in token2id.items()}
+ token2id["<eps>"] = 0
+ del token2id["-"]
+
+Create word2id and id2word
+--------------------------
+
+.. code-block:: python3
+
+ words = list(set(transcript))
+ word2id = dict()
+ word2id['eps'] = 0
+ for i, w in enumerate(words):
+ word2id[w] = i + 1
+
+ id2word = {i:w for w, i in word2id.items()}
+
+Note that we only use words from the transcript of the test wave.
+
+Generate lexicon-related files
+------------------------------
+
+We use the code below to generate the following 4 files:
+
+ - ``lexicon.txt``
+ - ``tokens.txt``
+ - ``words.txt``
+ - ``lexicon_disambig.txt``
+
+.. caution::
+
+ ``words.txt`` contains only words from the transcript of the test wave.
+
+.. code-block:: python3
+
+ from prepare_lang import add_disambig_symbols
+
+ lexicon = [(w, list(w)) for w in word2id if w != "eps"]
+ lexicon_disambig, max_disambig_id = add_disambig_symbols(lexicon)
+
+ with open('lexicon.txt', 'w', encoding='utf-8') as f:
+ for w, tokens in lexicon:
+ f.write(f"{w} {' '.join(tokens)}\n")
+
+ with open('lexicon_disambig.txt', 'w', encoding='utf-8') as f:
+ for w, tokens in lexicon_disambig:
+ f.write(f"{w} {' '.join(tokens)}\n")
+
+ with open('tokens.txt', 'w', encoding='utf-8') as f:
+ for t, i in token2id.items():
+ if t == '-':
+ t = "<eps>"
+ f.write(f"{t} {i}\n")
+
+ for k in range(max_disambig_id + 2):
+ f.write(f"#{k} {len(token2id) + k}\n")
+
+ with open('words.txt', 'w', encoding='utf-8') as f:
+ for w, i in word2id.items():
+ f.write(f"{w} {i}\n")
+ f.write(f'#0 {len(word2id)}\n')
+
+
+To give you an idea of what the generated files look like::
+
+ head -n 50 lexicon.txt lexicon_disambig.txt tokens.txt words.txt
+
+prints::
+
+ ==> lexicon.txt <==
+ moment m o m e n t
+ beside b e s i d e
+ i i
+ this t h i s
+ curiosity c u r i o s i t y
+ had h a d
+ that t h a t
+ at a t
+ me m e
+
+ ==> lexicon_disambig.txt <==
+ moment m o m e n t
+ beside b e s i d e
+ i i
+ this t h i s
+ curiosity c u r i o s i t y
+ had h a d
+ that t h a t
+ at a t
+ me m e
+
+ ==> tokens.txt <==
+ a 1
+ i 2
+ e 3
+ n 4
+ o 5
+ u 6
+ t 7
+ s 8
+ r 9
+ m 10
+ k 11
+ l 12
+ d 13
+ g 14
+ h 15
+ y 16
+ b 17
+ p 18
+ w 19
+ c 20
+ v 21
+ j 22
+ z 23
+ f 24
+ ' 25
+ q 26
+ x 27
+ <eps> 0
+ #0 28
+ #1 29
+
+ ==> words.txt <==
+ eps 0
+ moment 1
+ beside 2
+ i 3
+ this 4
+ curiosity 5
+ had 6
+ that 7
+ at 8
+ me 9
+ #0 10
+
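+Note that ``lexicon.txt`` and ``lexicon_disambig.txt`` are identical here:
+no pronunciation in this tiny lexicon is repeated or is a prefix of another
+one, so ``add_disambig_symbols`` has nothing to disambiguate. The following
+hypothetical example, which is not part of the original docs, shows a case
+where disambiguation symbols do get appended:
+
+.. code-block:: python3
+
+    # Two made-up words that share the same token sequence; each entry is
+    # expected to receive its own disambiguation symbol.
+    demo = [("a", ["a"]), ("eh", ["a"])]
+    demo_disambig, demo_max_id = add_disambig_symbols(demo)
+    print(demo_disambig)  # roughly: [('a', ['a', '#1']), ('eh', ['a', '#2'])]
+    print(demo_max_id)    # roughly: 2
+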
+.. note::
+
+ This test model uses characters as its modeling units. If you use other
+ types of modeling units, the same code can be used without any change.
+
+Convert transcript to an FST graph
+----------------------------------
+
+.. code-block:: bash
+
+ egs/librispeech/ASR/local/prepare_lang_fst.py --lang-dir ./
+
+The above command should generate two files ``H.fst`` and ``HL.fst``. We will
+use ``HL.fst`` below::
+
+ -rw-r--r-- 1 root root 13K Jun 12 08:28 H.fst
+ -rw-r--r-- 1 root root 3.7K Jun 12 08:28 HL.fst
+
+Force aligner
+-------------
+
+Now, everything is ready. We can use the following code to get forced alignments.
+
+.. code-block:: python3
+
+ from kaldi_decoder import DecodableCtc, FasterDecoder, FasterDecoderOptions
+ import kaldifst
+
+ def force_align():
+ HL = kaldifst.StdVectorFst.read("./HL.fst")
+ decodable = DecodableCtc(emission[0].contiguous().cpu().numpy())
+ decoder_opts = FasterDecoderOptions(max_active=3000)
+ decoder = FasterDecoder(HL, decoder_opts)
+ decoder.decode(decodable)
+ if not decoder.reached_final():
+ print(f"failed to decode xxx")
+ return None
+ ok, best_path = decoder.get_best_path()
+
+ (
+ ok,
+ isymbols_out,
+ osymbols_out,
+ total_weight,
+ ) = kaldifst.get_linear_symbol_sequence(best_path)
+ if not ok:
+ print(f"failed to get linear symbol sequence for xxx")
+ return None
+
+ # We need to use i-1 here since we have incremented tokens during
+ # HL construction
+ alignment = [i-1 for i in isymbols_out]
+ return alignment
+
+ alignment = force_align()
+
+ for i, a in enumerate(alignment):
+ print(i, id2token[a])
+
+The output should be identical to the frame-level alignments listed at
+`<https://pytorch.org/audio/main/tutorials/ctc_forced_alignment_api_tutorial.html#frame-level-alignments>`_.
+
+For ease of reference, we list the output below::
+
+ 0 -
+ 1 -
+ 2 -
+ 3 -
+ 4 -
+ 5 -
+ 6 -
+ 7 -
+ 8 -
+ 9 -
+ 10 -
+ 11 -
+ 12 -
+ 13 -
+ 14 -
+ 15 -
+ 16 -
+ 17 -
+ 18 -
+ 19 -
+ 20 -
+ 21 -
+ 22 -
+ 23 -
+ 24 -
+ 25 -
+ 26 -
+ 27 -
+ 28 -
+ 29 -
+ 30 -
+ 31 -
+ 32 i
+ 33 -
+ 34 -
+ 35 h
+ 36 h
+ 37 a
+ 38 -
+ 39 -
+ 40 -
+ 41 d
+ 42 -
+ 43 -
+ 44 t
+ 45 h
+ 46 -
+ 47 a
+ 48 -
+ 49 -
+ 50 t
+ 51 -
+ 52 -
+ 53 -
+ 54 c
+ 55 -
+ 56 -
+ 57 -
+ 58 u
+ 59 u
+ 60 -
+ 61 -
+ 62 -
+ 63 r
+ 64 -
+ 65 i
+ 66 -
+ 67 -
+ 68 -
+ 69 -
+ 70 -
+ 71 -
+ 72 o
+ 73 -
+ 74 -
+ 75 -
+ 76 -
+ 77 -
+ 78 -
+ 79 s
+ 80 -
+ 81 -
+ 82 -
+ 83 i
+ 84 -
+ 85 t
+ 86 -
+ 87 -
+ 88 y
+ 89 -
+ 90 -
+ 91 -
+ 92 -
+ 93 b
+ 94 -
+ 95 e
+ 96 -
+ 97 -
+ 98 -
+ 99 -
+ 100 -
+ 101 s
+ 102 -
+ 103 -
+ 104 -
+ 105 -
+ 106 -
+ 107 -
+ 108 -
+ 109 -
+ 110 i
+ 111 -
+ 112 -
+ 113 d
+ 114 e
+ 115 -
+ 116 m
+ 117 -
+ 118 -
+ 119 e
+ 120 -
+ 121 -
+ 122 -
+ 123 -
+ 124 a
+ 125 -
+ 126 -
+ 127 t
+ 128 -
+ 129 t
+ 130 h
+ 131 -
+ 132 i
+ 133 -
+ 134 -
+ 135 -
+ 136 s
+ 137 -
+ 138 -
+ 139 -
+ 140 -
+ 141 m
+ 142 -
+ 143 -
+ 144 o
+ 145 -
+ 146 -
+ 147 -
+ 148 m
+ 149 -
+ 150 -
+ 151 e
+ 152 -
+ 153 n
+ 154 -
+ 155 t
+ 156 -
+ 157 -
+ 158 -
+ 159 -
+ 160 -
+ 161 -
+ 162 -
+ 163 -
+ 164 -
+ 165 -
+ 166 -
+ 167 -
+ 168 -
+
+To merge tokens, we use::
+
+ from icefall.ctc import merge_tokens
+ token_spans = merge_tokens(alignment)
+ for span in token_spans:
+ print(id2token[span.token], span.start, span.end)
+
+The output is given below::
+
+ i 32 33
+ h 35 37
+ a 37 38
+ d 41 42
+ t 44 45
+ h 45 46
+ a 47 48
+ t 50 51
+ c 54 55
+ u 58 60
+ r 63 64
+ i 65 66
+ o 72 73
+ s 79 80
+ i 83 84
+ t 85 86
+ y 88 89
+ b 93 94
+ e 95 96
+ s 101 102
+ i 110 111
+ d 113 114
+ e 114 115
+ m 116 117
+ e 119 120
+ a 124 125
+ t 127 128
+ t 129 130
+ h 130 131
+ i 132 133
+ s 136 137
+ m 141 142
+ o 144 145
+ m 148 149
+ e 151 152
+ n 153 154
+ t 155 156
+
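+Each span above is measured in output frames. As a small sketch (not part of
+the original code), a span can be converted to seconds with the same ratio
+that ``preview_word`` below uses:
+
+.. code-block:: python3
+
+    # Convert the first token span from frame indices to seconds.
+    ratio = waveform.size(1) / emission.size(1)  # audio samples per output frame
+    span = token_spans[0]
+    start_sec = span.start * ratio / sr
+    end_sec = span.end * ratio / sr
+    print(f"{id2token[span.token]}: {start_sec:.3f} - {end_sec:.3f} sec")
+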
+All of the code below is copied and modified
+from `<https://pytorch.org/audio/main/tutorials/ctc_forced_alignment_api_tutorial.html>`_.
+
+Segment each word using the computed alignments
+-----------------------------------------------
+
+.. code-block:: python3
+
+ def unflatten(list_, lengths):
+ assert len(list_) == sum(lengths)
+ i = 0
+ ret = []
+ for l in lengths:
+ ret.append(list_[i : i + l])
+ i += l
+ return ret
+
+
+ word_spans = unflatten(token_spans, [len(word) for word in transcript])
+ print(word_spans)
+
+The output is::
+
+ [[TokenSpan(token=2, start=32, end=33)],
+ [TokenSpan(token=15, start=35, end=37), TokenSpan(token=1, start=37, end=38), TokenSpan(token=13, start=41, end=42)],
+ [TokenSpan(token=7, start=44, end=45), TokenSpan(token=15, start=45, end=46), TokenSpan(token=1, start=47, end=48), TokenSpan(token=7, start=50, end=51)],
+ [TokenSpan(token=20, start=54, end=55), TokenSpan(token=6, start=58, end=60), TokenSpan(token=9, start=63, end=64), TokenSpan(token=2, start=65, end=66), TokenSpan(token=5, start=72, end=73), TokenSpan(token=8, start=79, end=80), TokenSpan(token=2, start=83, end=84), TokenSpan(token=7, start=85, end=86), TokenSpan(token=16, start=88, end=89)],
+ [TokenSpan(token=17, start=93, end=94), TokenSpan(token=3, start=95, end=96), TokenSpan(token=8, start=101, end=102), TokenSpan(token=2, start=110, end=111), TokenSpan(token=13, start=113, end=114), TokenSpan(token=3, start=114, end=115)],
+ [TokenSpan(token=10, start=116, end=117), TokenSpan(token=3, start=119, end=120)],
+ [TokenSpan(token=1, start=124, end=125), TokenSpan(token=7, start=127, end=128)],
+ [TokenSpan(token=7, start=129, end=130), TokenSpan(token=15, start=130, end=131), TokenSpan(token=2, start=132, end=133), TokenSpan(token=8, start=136, end=137)],
+ [TokenSpan(token=10, start=141, end=142), TokenSpan(token=5, start=144, end=145), TokenSpan(token=10, start=148, end=149), TokenSpan(token=3, start=151, end=152), TokenSpan(token=4, start=153, end=154), TokenSpan(token=7, start=155, end=156)]
+ ]
+
+
+.. code-block:: python3
+
+ import IPython.display
+
+ def preview_word(waveform, spans, num_frames, transcript, sample_rate=bundle.sample_rate):
+ ratio = waveform.size(1) / num_frames
+ x0 = int(ratio * spans[0].start)
+ x1 = int(ratio * spans[-1].end)
+ print(f"{transcript} {x0 / sample_rate:.3f} - {x1 / sample_rate:.3f} sec")
+ segment = waveform[:, x0:x1]
+ return IPython.display.Audio(segment.numpy(), rate=sample_rate)
+ num_frames = emission.size(1)
+
+.. code-block:: python3
+
+ preview_word(waveform, word_spans[0], num_frames, transcript[0])
+ preview_word(waveform, word_spans[1], num_frames, transcript[1])
+ preview_word(waveform, word_spans[2], num_frames, transcript[2])
+ preview_word(waveform, word_spans[3], num_frames, transcript[3])
+ preview_word(waveform, word_spans[4], num_frames, transcript[4])
+ preview_word(waveform, word_spans[5], num_frames, transcript[5])
+ preview_word(waveform, word_spans[6], num_frames, transcript[6])
+ preview_word(waveform, word_spans[7], num_frames, transcript[7])
+ preview_word(waveform, word_spans[8], num_frames, transcript[8])
+
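+Equivalently, the nine calls above can be written as a loop; this is just a
+convenience, not part of the original tutorial:
+
+.. code-block:: python3
+
+    # Preview every word of the transcript in one go.
+    for spans, word in zip(word_spans, transcript):
+        IPython.display.display(preview_word(waveform, spans, num_frames, word))
+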
+The time stamp of each word segment is given below:
+
+.. list-table::
+   :header-rows: 1
+
+   * - Word
+     - Time
+   * - i
+     - 0.644 - 0.664 sec
+   * - had
+     - 0.704 - 0.845 sec
+   * - that
+     - 0.885 - 1.026 sec
+   * - curiosity
+     - 1.086 - 1.790 sec
+   * - beside
+     - 1.871 - 2.314 sec
+   * - me
+     - 2.334 - 2.414 sec
+   * - at
+     - 2.495 - 2.575 sec
+   * - this
+     - 2.595 - 2.756 sec
+   * - moment
+     - 2.837 - 3.138 sec
+
+For ease of reference, the test wave's filename and transcript are repeated below:
+
+.. list-table::
+   :header-rows: 1
+
+   * - Wave filename
+     - Text
+   * - Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav
+     - i had that curiosity beside me at this moment
+
+Summary
+-------
+
+Congratulations! You have succeeded in using the FST-based approach to
+compute the forced alignment of a test wave.
diff --git a/_sources/index.rst.txt b/_sources/index.rst.txt
index fb539d3f2..d46a4038f 100644
--- a/_sources/index.rst.txt
+++ b/_sources/index.rst.txt
@@ -25,7 +25,7 @@ speech recognition recipes using `k2 `_.
docker/index
faqs
model-export/index
-
+ fst-based-forced-alignment/index
.. toctree::
:maxdepth: 3
@@ -40,5 +40,5 @@ speech recognition recipes using `k2 `_.
.. toctree::
:maxdepth: 2
-
+
decoding-with-langugage-models/index
diff --git a/_sources/model-export/export-ncnn-conv-emformer.rst.txt b/_sources/model-export/export-ncnn-conv-emformer.rst.txt
index 93392aee7..4cdc25ee6 100644
--- a/_sources/model-export/export-ncnn-conv-emformer.rst.txt
+++ b/_sources/model-export/export-ncnn-conv-emformer.rst.txt
@@ -15,8 +15,8 @@ We will show you step by step how to export it to `ncnn`_ and run it with `sherp
.. caution::
- Please use a more recent version of PyTorch. For instance, ``torch 1.8``
- may ``not`` work.
+ ``torch > 2.0`` may not work. If you get errors while building pnnx, please switch
+ to ``torch < 2.0``.
1. Download the pre-trained model
---------------------------------
diff --git a/_sources/model-export/export-ncnn-lstm.rst.txt b/_sources/model-export/export-ncnn-lstm.rst.txt
index 310c3d8e4..ccf522dec 100644
--- a/_sources/model-export/export-ncnn-lstm.rst.txt
+++ b/_sources/model-export/export-ncnn-lstm.rst.txt
@@ -15,8 +15,8 @@ We will show you step by step how to export it to `ncnn`_ and run it with `sherp
.. caution::
- Please use a more recent version of PyTorch. For instance, ``torch 1.8``
- may ``not`` work.
+ ``torch > 2.0`` may not work. If you get errors while building pnnx, please switch
+ to ``torch < 2.0``.
1. Download the pre-trained model
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/_sources/model-export/export-ncnn-zipformer.rst.txt b/_sources/model-export/export-ncnn-zipformer.rst.txt
index a5845b0e4..51fc6c8e5 100644
--- a/_sources/model-export/export-ncnn-zipformer.rst.txt
+++ b/_sources/model-export/export-ncnn-zipformer.rst.txt
@@ -15,8 +15,8 @@ We will show you step by step how to export it to `ncnn`_ and run it with `sherp
.. caution::
- Please use a more recent version of PyTorch. For instance, ``torch 1.8``
- may ``not`` work.
+ ``torch > 2.0`` may not work. If you get errors while building pnnx, please switch
+ to ``torch < 2.0``.
1. Download the pre-trained model
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/_static/kaldi-align/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav b/_static/kaldi-align/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav
new file mode 100644
index 000000000..004a33532
Binary files /dev/null and b/_static/kaldi-align/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav differ
diff --git a/_static/kaldi-align/at.wav b/_static/kaldi-align/at.wav
new file mode 100644
index 000000000..caad1178c
Binary files /dev/null and b/_static/kaldi-align/at.wav differ
diff --git a/_static/kaldi-align/beside.wav b/_static/kaldi-align/beside.wav
new file mode 100644
index 000000000..5576d84c2
Binary files /dev/null and b/_static/kaldi-align/beside.wav differ
diff --git a/_static/kaldi-align/curiosity.wav b/_static/kaldi-align/curiosity.wav
new file mode 100644
index 000000000..32d106d7b
Binary files /dev/null and b/_static/kaldi-align/curiosity.wav differ
diff --git a/_static/kaldi-align/had.wav b/_static/kaldi-align/had.wav
new file mode 100644
index 000000000..4d50ec80f
Binary files /dev/null and b/_static/kaldi-align/had.wav differ
diff --git a/_static/kaldi-align/i.wav b/_static/kaldi-align/i.wav
new file mode 100644
index 000000000..9db292a47
Binary files /dev/null and b/_static/kaldi-align/i.wav differ
diff --git a/_static/kaldi-align/me.wav b/_static/kaldi-align/me.wav
new file mode 100644
index 000000000..e4f16f17c
Binary files /dev/null and b/_static/kaldi-align/me.wav differ
diff --git a/_static/kaldi-align/moment.wav b/_static/kaldi-align/moment.wav
new file mode 100644
index 000000000..eb60e44fe
Binary files /dev/null and b/_static/kaldi-align/moment.wav differ
diff --git a/_static/kaldi-align/that.wav b/_static/kaldi-align/that.wav
new file mode 100644
index 000000000..ddf0f666b
Binary files /dev/null and b/_static/kaldi-align/that.wav differ
diff --git a/_static/kaldi-align/this.wav b/_static/kaldi-align/this.wav
new file mode 100644
index 000000000..1c8bed956
Binary files /dev/null and b/_static/kaldi-align/this.wav differ
40, 42, 44, 45, 47, 48, 56, 57, 58], "gram": [4, 6, 21, 34, 36, 37, 42, 43, 45, 47, 48, 57, 58], "approxim": [4, 5], "ilm": 4, "lead": [4, 7, 12], "rnnt": [4, 42, 57, 58], "bi": [4, 6], "addit": [4, 33], "estim": 4, "li": 4, "choic": 4, "accord": [4, 52], "origin": [4, 5, 31, 32, 33], "paper": [4, 5, 31, 40, 42, 56, 57, 58, 60, 61], "achiev": [4, 6, 7, 31, 32, 52, 54], "both": [4, 32, 42, 44, 45, 54, 56, 57, 58], "intra": 4, "cross": 4, "much": [4, 24, 25, 31, 32], "faster": [4, 6, 31, 60], "evalu": 4, "now": [4, 6, 9, 13, 15, 21, 24, 25, 26, 34, 39, 40, 42, 43, 44, 45, 47, 48, 52, 56, 57, 58], "illustr": [4, 6, 7, 31, 32, 52], "purpos": [4, 6, 7, 24, 25, 31, 32, 52], "from": [4, 6, 7, 9, 10, 11, 13, 14, 15, 17, 19, 20, 21, 22, 24, 25, 26, 27, 33, 34, 35, 36, 37, 39, 40, 42, 43, 44, 45, 47, 48, 50, 52, 54, 56, 57, 58, 60, 61, 62], "link": [4, 6, 7, 18, 21, 22, 23, 42, 44, 45, 56, 57, 58, 60, 61], "scratch": [4, 6, 7, 32, 42, 44, 45, 52, 56, 57, 58, 60, 61], "prune": [4, 6, 7, 22, 26, 27, 36, 38, 40, 41, 53, 54, 55, 56, 58], "statelessx": [4, 6, 7, 38, 40, 41, 53, 54, 55], "initi": [4, 6, 7, 9, 31, 32, 34, 37], "step": [4, 6, 7, 11, 14, 21, 22, 24, 25, 26, 32, 34, 36, 37, 39, 40, 42, 44, 45, 50, 52, 56, 57, 58], "download": [4, 6, 7, 8, 10, 13, 15, 19, 20, 23, 30, 31, 32, 35, 40, 52, 59], "git_lfs_skip_smudg": [4, 6, 7, 24, 25, 26, 27, 31, 32, 52], "huggingfac": [4, 6, 7, 13, 18, 20, 21, 22, 24, 25, 26, 27, 31, 32, 34, 36, 37, 39, 43, 44, 45, 47, 48, 50, 52, 56, 60, 61], "co": [4, 6, 7, 18, 19, 21, 22, 24, 25, 26, 27, 31, 32, 34, 35, 36, 37, 39, 43, 44, 45, 47, 48, 50, 52, 56, 60, 61], "zengwei": [4, 6, 7, 24, 26, 27, 31, 32, 45, 52, 56, 60], "stateless7": [4, 6, 7, 26, 27], "2022": [4, 6, 7, 22, 24, 25, 26, 27, 36, 42, 44, 45, 56, 57], "12": [4, 6, 7, 9, 14, 21, 22, 24, 25, 26, 27, 31, 34, 36, 37, 39, 42, 44, 45, 47, 50, 56, 57, 58, 60, 61], "29": [4, 6, 7, 21, 26, 27, 34, 36, 37, 39, 43, 44, 47, 48], "exp": [4, 6, 7, 9, 15, 16, 21, 22, 24, 25, 26, 27, 28, 29, 31, 32, 34, 36, 37, 39, 40, 42, 43, 44, 45, 47, 48, 50, 52, 56, 57, 58, 60, 61], "lf": [4, 6, 7, 22, 24, 25, 26, 27, 31, 32, 34, 36, 37, 39, 43, 45, 47, 48, 50, 52], "includ": [4, 6, 7, 24, 25, 26, 27, 31, 32, 42, 44, 45, 52, 56, 57, 58], "pt": [4, 6, 7, 9, 11, 15, 21, 22, 24, 25, 26, 27, 28, 29, 31, 32, 34, 36, 37, 39, 42, 43, 44, 45, 47, 48, 50, 52, 56, 57, 58], "ln": [4, 6, 7, 9, 15, 22, 24, 25, 26, 27, 31, 32, 34, 39, 42, 44, 45, 52, 56, 57, 58], "epoch": [4, 6, 7, 9, 12, 15, 16, 21, 22, 24, 25, 26, 27, 28, 31, 32, 34, 36, 37, 39, 40, 42, 43, 44, 45, 47, 48, 50, 52, 56, 57, 58, 60, 61], "99": [4, 6, 7, 15, 21, 24, 25, 26, 27, 31, 32], "symbol": [4, 5, 6, 7, 21, 36, 42, 57, 58], "load": [4, 6, 7, 9, 15, 21, 24, 25, 26, 34, 36, 37, 39, 42, 43, 44, 45, 47, 48, 50, 56, 57, 58], "lang_bpe_500": [4, 6, 7, 22, 24, 25, 26, 27, 28, 29, 31, 32, 39, 42, 44, 45, 52, 56, 57, 58], "bpe": [4, 5, 6, 7, 22, 24, 25, 26, 27, 29, 31, 32, 39, 42, 44, 45, 52, 56, 57, 58], "done": [4, 6, 7, 9, 13, 15, 21, 22, 34, 36, 37, 39, 42, 43, 44, 45, 47, 48, 50, 52, 56, 57, 58], "via": [4, 6, 7, 14, 21, 23, 28, 29, 30, 31, 32, 52], "exp_dir": [4, 6, 7, 9, 15, 21, 24, 25, 26, 36, 39, 40, 42, 44, 45, 57, 58], "avg": [4, 6, 7, 9, 12, 15, 21, 22, 24, 25, 26, 27, 28, 29, 31, 32, 36, 39, 40, 42, 43, 44, 45, 47, 48, 50, 56, 57, 58], "averag": [4, 6, 7, 9, 12, 15, 21, 22, 24, 25, 26, 27, 31, 32, 34, 36, 37, 39, 42, 43, 44, 45, 47, 48, 50, 56, 57, 58], "fals": [4, 6, 7, 9, 15, 21, 22, 24, 25, 26, 31, 32, 34, 36, 39, 40], "dir": [4, 6, 7, 22, 24, 25, 26, 27, 28, 29, 31, 
32, 34, 36, 37, 39, 40, 42, 43, 44, 45, 47, 48, 50, 52, 56, 57, 58, 60, 61], "max": [4, 6, 7, 21, 22, 24, 25, 31, 32, 34, 36, 37, 39, 40, 42, 44, 45, 56, 57, 58, 60, 61], "durat": [4, 6, 7, 11, 22, 31, 32, 34, 36, 37, 39, 40, 42, 43, 44, 45, 47, 48, 50, 56, 57, 58, 60, 61], "600": [4, 6, 7, 21, 22, 31, 39, 42, 44, 56, 57, 58], "chunk": [4, 6, 7, 24, 26, 27, 31, 57, 58], "len": [4, 6, 7, 26, 27, 58], "32": [4, 6, 7, 21, 24, 25, 26, 27, 31, 34, 36, 37, 58], "method": [4, 5, 7, 15, 19, 22, 31, 32, 34, 36, 37, 39, 40, 42, 43, 44, 45, 47, 48, 56, 57, 58], "modified_beam_search": [4, 5, 6, 7, 19, 36, 40, 42, 44, 56, 57, 58], "clean": [4, 9, 15, 21, 26, 31, 34, 36, 39, 40, 42, 43, 44, 45, 56, 57, 58], "beam_size_4": [4, 6, 7], "11": [4, 6, 7, 9, 10, 11, 15, 21, 24, 25, 27, 34, 36, 37, 39, 42, 43, 44, 45, 47, 48, 50, 52, 56, 57, 58], "best": [4, 5, 6, 7, 24, 25, 26, 31, 32, 34, 37, 39], "7": [4, 6, 7, 9, 21, 22, 23, 26, 30, 34, 37, 39, 42, 43, 47, 48, 56, 57], "93": [4, 6, 7, 15], "Then": [4, 6], "necessari": [4, 40, 52], "note": [4, 5, 6, 7, 10, 11, 15, 22, 24, 31, 32, 34, 36, 37, 39, 42, 43, 44, 45, 47, 48, 50, 56, 57, 58], "960": [4, 31, 32, 39, 42, 44, 45, 56, 57, 58], "hour": [4, 13, 31, 32, 34, 36, 37, 39, 42, 44, 45, 56, 57, 58], "ezerhouni": [4, 6, 7], "pushd": [4, 6, 7, 27], "popd": [4, 6, 7, 27], "marcoyang": [4, 6], "librispeech_bigram": [4, 6], "2gram": [4, 6], "fst": [4, 11, 21, 36, 50], "modified_beam_search_lm_lodr": 4, "lm_dir": [4, 6, 7, 9, 21, 39], "lm_scale": [4, 6, 7], "42": [4, 9, 15, 21, 25, 31, 34, 39, 50], "lodr_scal": 4, "24": [4, 9, 10, 13, 15, 21, 24, 25, 37, 43, 47, 48, 50], "modified_beam_search_lodr": [4, 5, 6], "scale": [4, 6, 7, 24, 25, 34, 39, 40, 43, 45, 47, 48], "embed": [4, 6, 7, 36, 42, 52, 56, 57, 58], "dim": [4, 6, 7, 24, 25, 26, 31, 36, 42, 52, 57], "2048": [4, 6, 7, 22, 24, 25, 26, 36, 52], "hidden": [4, 6, 7, 25, 52, 56], "num": [4, 6, 7, 24, 25, 26, 31, 32, 34, 36, 37, 39, 40, 42, 44, 45, 52, 56, 57, 58, 60, 61], "layer": [4, 6, 7, 24, 25, 26, 31, 36, 40, 42, 52, 54, 56, 57, 58], "vocab": [4, 6, 7, 39], "500": [4, 6, 7, 22, 24, 25, 26, 36, 39, 45, 56, 60, 61], "token": [4, 11, 22, 24, 25, 26, 27, 28, 29, 31, 34, 36, 37, 39, 43, 47, 48, 50, 52, 60, 61], "ngram": [4, 39, 43, 47, 48], "2": [4, 6, 7, 9, 11, 13, 15, 20, 22, 23, 30, 31, 32, 40, 42, 43, 44, 45, 47, 48, 50, 56, 57, 58, 60, 61], "extra": [4, 24, 25, 26, 36, 54, 57, 59], "argument": [4, 7, 15, 31, 32, 40, 54], "need": [4, 6, 11, 13, 14, 15, 19, 21, 22, 23, 24, 25, 26, 31, 32, 34, 36, 37, 39, 40, 42, 43, 44, 45, 47, 48, 50, 52, 54, 56, 57, 58], "given": [4, 9, 11, 12, 13, 15, 21, 22, 24, 25, 26, 34, 36, 37, 39, 42, 43, 44, 45, 57, 58, 60, 61], "specifi": [4, 7, 10, 12, 15, 16, 24, 25, 26, 34, 36, 37, 39, 40, 42, 43, 44, 45, 47, 48, 50, 56, 57, 58], "neg": [4, 36], "number": [4, 7, 16, 19, 22, 24, 25, 26, 31, 32, 34, 36, 37, 39, 42, 43, 44, 45, 47, 48, 50, 56, 57, 58], "obtain": [4, 7, 34, 36, 37, 39, 43, 47, 48], "shown": [4, 7, 31], "below": [4, 7, 9, 11, 12, 13, 14, 15, 16, 21, 24, 25, 26, 32, 34, 36, 37, 39, 42, 43, 44, 45, 47, 48, 50, 56, 57, 60], "61": [4, 6], "6": [4, 6, 7, 9, 10, 11, 15, 23, 30, 34, 36, 39, 42, 43, 47, 48, 56, 61], "74": [4, 6, 21, 22], "recal": 4, "lowest": [4, 12, 15, 42, 44, 45, 56, 57, 58], "77": [4, 6, 7, 21, 39], "08": [4, 6, 7, 9, 15, 26, 39, 43, 45, 47, 48, 50, 56], "inde": 4, "even": [4, 19, 21, 25], "better": [4, 6], "increas": [4, 6, 34, 36, 37, 39, 42, 44, 45, 56, 57, 58], "8": [4, 6, 7, 9, 10, 15, 21, 22, 24, 25, 26, 31, 34, 36, 39, 40, 42, 43, 44, 45, 50, 56, 
57, 58], "45": [4, 6, 15, 21, 24, 26, 34, 36, 39], "38": [4, 6, 21, 24, 34, 36, 39, 47], "23": [4, 6, 9, 10, 11, 15, 21, 24, 25, 26, 31, 34, 36, 37, 39, 47, 48, 50], "section": [5, 8, 9, 10, 17, 21, 22, 27, 28, 29, 30, 34, 39], "langugag": 5, "transduc": [5, 20, 22, 23, 27, 30, 31, 32, 35, 38, 40, 41, 52, 53, 54, 55], "rnn": [5, 6, 7, 20, 25, 36, 42, 44, 56, 57, 58, 62], "avail": [5, 6, 8, 15, 20, 21, 22, 24, 25, 26, 32, 33, 34, 36, 39, 43, 47, 48, 50, 56], "beam": [5, 22, 56], "search": [5, 6, 7, 18, 19, 59], "realli": [5, 34, 37, 39, 42, 44, 45, 56, 57, 58], "valu": [5, 7, 24, 25, 26, 31, 32, 34, 36, 37, 39, 42, 44, 45, 56, 57, 58], "t": [5, 13, 14, 15, 21, 24, 25, 26, 27, 28, 32, 34, 36, 37, 39, 40, 42, 43, 44, 45, 47, 48, 50, 52, 56, 57, 58, 60, 61], "doe": [5, 15, 24, 25, 26, 34, 36, 39, 50], "modified_beam_search_lm_shallow_fus": [5, 6, 7], "interpol": 5, "also": [5, 6, 7, 11, 13, 14, 15, 17, 18, 21, 22, 23, 24, 25, 26, 27, 29, 31, 34, 36, 37, 39, 42, 44, 45, 50, 52, 54, 56, 57, 58, 60], "known": 5, "bigram": 5, "backoff": 5, "modified_beam_search_lm_rescor": [5, 6], "hypothes": [5, 6], "rnnlm": [5, 6, 52], "re": [5, 6, 10, 34, 37, 39, 40, 42, 44, 45, 54, 56, 57, 58], "rank": [5, 6], "modified_beam_search_lm_rescore_lodr": [5, 6], "lodr": [5, 20, 52], "commonli": [6, 7, 34, 36, 37, 39, 43, 47, 48, 50], "approach": 6, "incorpor": 6, "unlik": 6, "more": [6, 14, 21, 24, 25, 26, 31, 34, 39, 40, 50, 52, 54, 56, 57, 60, 61], "effici": [6, 7, 31, 42, 57, 58], "than": [6, 21, 22, 25, 31, 34, 36, 37, 39, 42, 43, 44, 45, 50, 56, 57, 58], "sinc": [6, 13, 21, 24, 25, 26, 32, 40, 50, 56], "less": [6, 22, 31, 39, 43, 50, 57, 58], "comput": [6, 15, 21, 22, 24, 25, 26, 34, 36, 37, 40, 42, 43, 45, 47, 48, 50, 56, 57, 58], "gpu": [6, 7, 8, 13, 14, 20, 21, 24, 25, 31, 32, 34, 36, 37, 39, 40, 42, 44, 45, 47, 48, 50, 56, 57, 58], "try": [6, 10, 12, 15, 17, 19, 40, 42, 44, 45, 56, 57, 58], "might": [6, 7, 25, 26, 57, 58], "ideal": [6, 7], "mai": [6, 7, 9, 21, 24, 25, 26, 32, 34, 36, 37, 39, 42, 44, 45, 56, 57, 58, 62], "With": [6, 21], "43": [6, 9, 25, 26, 39], "great": 6, "made": [6, 24], "boost": [6, 7], "tabl": [6, 19, 24, 25, 26], "67": [6, 21], "59": [6, 15, 21, 24, 37, 39], "86": 6, "fact": 6, "arpa": [6, 11, 50], "performn": 6, "depend": [6, 14, 15, 21, 34, 39, 59], "kenlm": 6, "kpu": 6, "archiv": [6, 52], "zip": 6, "execut": [6, 7, 13, 24, 31, 34, 37, 39, 40, 42, 43, 44, 45, 47, 48, 50, 52, 56, 57, 58], "9": [6, 9, 21, 24, 25, 26, 34, 36, 37, 39, 42, 43, 44, 45, 47, 50, 56, 57, 58], "57": [6, 21, 25, 39, 43], "slightli": 6, "63": [6, 36], "04": [6, 24, 25, 26, 34, 36, 37, 39, 43, 47, 48], "52": [6, 21, 34, 39], "73": 6, "mention": [6, 54], "earlier": 6, "benchmark": [6, 36], "speed": [6, 24, 34, 36, 37, 39, 42, 44, 45, 56, 57, 58], "132": 6, "95": [6, 35], "177": [6, 21, 22, 25, 26, 36, 37, 39], "96": [6, 21, 31], "210": [6, 47, 48], "262": [6, 7, 15], "62": [6, 7, 21, 39, 43], "65": [6, 7, 21, 24], "352": [6, 7, 39], "58": [6, 7, 10, 21, 39], "488": [6, 7, 24, 25, 26], "400": [6, 9, 35], "610": 6, "870": 6, "156": [6, 15], "203": [6, 15, 22, 39], "255": [6, 25, 26], "160": [6, 15], "263": [6, 9, 15, 21, 25], "singl": [6, 34, 36, 37, 39, 42, 43, 44, 45, 47, 48, 50, 56, 57, 58], "32g": 6, "v100": [6, 34, 36, 37, 39], "vari": 6, "word": [7, 11, 12, 15, 34, 36, 37, 39, 43, 47, 48, 50, 52], "error": [7, 9, 10, 12, 13, 15, 21, 24, 25, 26, 39], "rate": [7, 12, 32, 34, 36, 37, 39, 42, 43, 44, 45, 47, 48, 50, 56, 57, 58], "These": [7, 34, 36, 37, 39, 42, 43, 44, 45, 47, 48, 50, 56, 57, 58], "alreadi": [7, 
11, 13, 21, 22, 33], "But": [7, 24, 42, 44, 45, 56, 57, 58], "long": [7, 24, 52, 60, 61], "true": [7, 9, 15, 21, 22, 24, 25, 26, 31, 32, 34, 36, 39, 40, 42, 43, 44, 45, 47, 48, 50, 56, 57, 58], "either": [7, 15, 19, 34, 36, 37, 39, 57, 58], "choos": [7, 19, 21, 40, 42, 44, 45, 56, 57, 58], "three": [7, 15, 24, 25, 26, 29, 34, 36, 54], "associ": 7, "dimens": [7, 31, 42, 52, 57, 58], "obviou": 7, "rel": [7, 33], "reduct": [7, 15, 21, 24, 25, 44], "around": [7, 32], "A": [7, 14, 22, 24, 25, 26, 31, 32, 34, 36, 37, 39, 42, 43, 44, 45, 56, 57, 58], "few": [7, 11, 24, 25, 26, 40], "paramet": [7, 14, 22, 24, 25, 26, 28, 31, 34, 36, 37, 39, 42, 43, 44, 45, 47, 48, 52, 56, 57, 58, 60, 61], "tune": [7, 20, 24, 25, 26, 34, 36, 37, 39, 40, 42, 44, 45, 56, 57, 58, 62], "control": [7, 34, 36, 37, 39, 40, 42, 43, 44, 45, 47, 48, 50, 52, 56, 57, 58, 60, 61], "too": 7, "small": [7, 31, 32, 33, 36, 47, 48, 50], "fulli": 7, "util": [7, 9, 10, 15, 21, 39], "larg": [7, 13], "domin": 7, "bad": 7, "typic": [7, 31, 34, 36, 37, 39], "activ": [7, 13, 19, 21], "path": [7, 9, 15, 19, 21, 22, 24, 25, 26, 29, 32, 34, 36, 37, 39, 40, 42, 44, 45, 56, 57, 58], "trade": 7, "off": [7, 24], "accuraci": [7, 24, 25, 33, 35], "larger": [7, 25, 34, 36, 37, 39, 42, 44, 45, 56, 57, 58], "slower": 7, "built": [8, 9, 21, 60], "imag": [8, 20], "cpu": [8, 12, 13, 14, 15, 16, 20, 21, 22, 24, 25, 26, 28, 34, 42, 44, 45, 50, 57, 58, 60], "still": [8, 24, 25, 26, 33], "introduct": [8, 20, 53, 62], "tag": [8, 20], "cuda": [8, 10, 15, 20, 22, 24, 25, 26, 34, 36, 37, 39, 42, 43, 44, 45, 47, 48, 56, 57, 58], "enabl": [8, 21, 40], "within": [8, 14, 17, 19, 20, 24, 25], "updat": [8, 24, 25, 26, 31], "host": [9, 22], "hub": 9, "k2fsa": 9, "find": [9, 10, 16, 17, 18, 19, 22, 24, 25, 26, 29, 34, 36, 37, 39, 42, 43, 44, 45, 47, 48, 50, 56, 57, 58], "dockerfil": 9, "tree": [9, 11, 28, 29, 34, 36, 37, 39, 43, 47, 48, 50, 56], "item": [9, 14], "curl": 9, "registri": 9, "v2": [9, 26, 34, 39], "jq": 9, "give": [9, 11, 15, 36], "someth": [9, 34, 36, 37, 39, 42, 44, 45, 50, 56, 57], "torch2": [9, 13, 15], "cuda12": 9, "cuda11": [9, 10, 21], "torch1": [9, 10, 21], "cuda10": 9, "13": [9, 10, 15, 21, 22, 24, 25, 26, 32, 36, 37, 39, 43, 44, 47], "releas": [9, 15, 21, 22, 24, 25, 26, 34, 36, 39, 60], "torch": [9, 10, 13, 14, 20, 22, 23, 30, 34, 36, 39], "select": [9, 12, 13, 14, 19, 21, 24, 25, 26, 42, 43, 47, 48, 50, 56, 57, 58], "appropri": [9, 21], "combin": [9, 12, 24, 25, 26], "visit": [9, 18, 19, 42, 44, 45, 56, 57, 58, 60, 61], "pkg": 9, "py3": [9, 10, 21], "v1": [9, 34, 37, 39, 43, 47, 48], "current": [9, 19, 24, 25, 36, 40, 54, 56, 57, 58, 60, 61, 62], "ghcr": 9, "alwai": [9, 21, 22], "sudo": [9, 34, 37], "rm": 9, "bin": [9, 13, 21, 24, 25, 26, 34, 39], "bash": 9, "start": [9, 11, 12, 14, 15, 16, 19, 21, 22, 26, 31, 32, 34, 36, 37, 39, 42, 43, 44, 45, 47, 48, 50, 52, 56, 57, 58, 60, 61], "interfac": 9, "present": [9, 34, 36, 37, 39, 42, 44, 45, 56, 57, 58], "root": [9, 24, 25, 26, 52], "60c947eac59c": 9, "workspac": 9, "export": [9, 10, 11, 12, 13, 14, 16, 20, 21, 33, 34, 36, 37, 39, 40, 43, 47, 48, 50, 59], "pythonpath": [9, 11, 12, 13, 15, 16, 21, 24, 25, 26], "user": [9, 10, 13, 21], "copi": [9, 21, 54], "switch": [9, 21, 34, 39, 45], "opt": 9, "conda": [9, 10], "lib": [9, 10, 15, 21, 26], "site": [9, 10, 15, 21, 26], "packag": [9, 10, 15, 21, 26, 60, 61], "__init__": [9, 10, 15, 21, 22, 24, 25, 26, 34, 36, 39], "line": [9, 10, 11, 24, 25, 26, 42, 52, 57, 58], "modul": [9, 13, 20, 24, 26, 31, 44, 57], "_k2": [9, 10, 21], 
"determinizeweightpushingtyp": [9, 10], "importerror": [9, 20], "libcuda": 9, "cannot": [9, 20, 24, 25, 26], "share": [9, 20, 21], "object": [9, 20, 21, 34, 36, 37, 42, 50, 56, 57], "No": [9, 13, 20, 24, 25, 26, 50], "stub": 9, "list": [9, 15, 24, 25, 26, 34, 36, 37, 39, 43, 47, 48], "16": [9, 15, 21, 22, 24, 25, 26, 29, 31, 34, 36, 37, 39, 42, 43, 47, 48, 50, 56, 57, 58], "second": [9, 14, 34, 36, 37, 39, 40, 42, 44, 45, 50, 56, 57, 58], "2023": [9, 15, 21, 24, 25, 26, 31, 32, 44, 52, 61], "01": [9, 11, 15, 21, 24, 36, 37, 39, 40, 44], "02": [9, 11, 21, 22, 24, 25, 26, 31, 36, 39, 42, 48, 56, 57, 60], "06": [9, 15, 21, 22, 24, 31, 32, 37, 39, 43, 50], "info": [9, 15, 21, 22, 24, 25, 26, 31, 34, 36, 37, 39, 43, 47, 48, 50], "264": [9, 21, 26], "posixpath": [9, 15, 21, 24, 25, 26, 36, 39], "lang_dir": [9, 15, 21, 36, 39], "lang_phon": [9, 11, 15, 21, 37, 43, 47, 48, 50], "feature_dim": [9, 15, 21, 22, 24, 25, 26, 34, 36, 39, 50], "search_beam": [9, 15, 21, 34, 39, 50], "20": [9, 14, 15, 21, 22, 24, 26, 31, 32, 34, 36, 37, 39, 42, 43, 47, 48, 50, 52, 57], "output_beam": [9, 15, 21, 34, 39, 50], "min_active_st": [9, 15, 21, 34, 39, 50], "30": [9, 10, 15, 21, 24, 25, 26, 34, 36, 37, 39, 40, 42, 44, 45, 50, 56, 57, 58], "max_active_st": [9, 15, 21, 34, 39, 50], "10000": [9, 15, 21, 34, 39, 50], "use_double_scor": [9, 15, 21, 34, 39, 50], "14": [9, 10, 15, 21, 22, 24, 25, 28, 34, 39, 42, 43, 44, 47, 56, 57, 58], "feature_dir": [9, 15, 21, 39], "fbank": [9, 11, 15, 21, 22, 24, 25, 26, 34, 36, 37, 39, 43, 47, 48, 50], "max_dur": [9, 15, 21, 39], "bucketing_sampl": [9, 15, 21, 39], "num_bucket": [9, 15, 21, 39], "concatenate_cut": [9, 15, 21, 39], "duration_factor": [9, 15, 21, 39], "gap": [9, 15, 21, 39], "on_the_fly_feat": [9, 15, 21, 39], "shuffl": [9, 15, 21, 39], "return_cut": [9, 15, 21, 39], "num_work": [9, 15, 21, 39], "env_info": [9, 15, 21, 22, 24, 25, 26, 34, 36, 39], "sha1": [9, 15, 21, 22, 24, 25, 26, 34, 36, 39], "4c05309499a08454997adf500b56dcc629e35ae5": [9, 21], "date": [9, 15, 21, 22, 24, 25, 26, 34, 36, 39], "tue": [9, 21, 24, 39], "jul": [9, 15, 21], "25": [9, 15, 21, 22, 24, 25, 34, 39, 42, 47, 48, 50, 57], "36": [9, 21, 24, 36, 39, 40], "dev": [9, 10, 15, 21, 22, 24, 25, 26, 31, 32, 34, 36, 37, 39, 42, 43, 44, 45, 47, 48, 50, 56, 57, 58], "7640d663": 9, "branch": [9, 15, 21, 22, 24, 25, 26, 34, 36, 39, 44], "375520d": 9, "fri": [9, 22], "28": [9, 21, 24, 25, 36, 39, 43, 60], "07": [9, 21, 24, 25, 26, 34, 36, 37, 39], "hostnam": [9, 15, 21, 22, 24, 25, 26, 36], "ip": [9, 15, 21, 22, 24, 25, 26, 36], "172": 9, "17": [9, 21, 22, 24, 25, 26, 34, 39, 47, 48, 56], "401": 9, "lexicon": [9, 11, 15, 21, 34, 36, 37, 39, 40, 42, 44, 45, 50, 56, 57, 58], "168": [9, 15, 21, 43], "compil": [9, 15, 21, 24, 25, 34, 36, 39], "linv": [9, 11, 15, 21, 36, 39, 50], "403": [9, 43], "273": [9, 15, 21, 22, 36], "devic": [9, 15, 21, 22, 24, 25, 26, 34, 36, 37, 39, 42, 43, 44, 45, 47, 48, 50, 57, 58], "406": [9, 39], "291": [9, 21], "424": 9, "218": [9, 15, 21, 25], "about": [9, 11, 12, 14, 15, 16, 21, 24, 25, 26, 31, 36, 40, 42, 45, 56, 57, 58], "cut": [9, 15, 21, 39], "425": [9, 25, 39], "252": [9, 21], "504": 9, "204": [9, 21, 26, 39], "batch": [9, 15, 21, 24, 25, 26, 34, 36, 37, 39, 42, 44, 45, 52, 56, 57, 58], "process": [9, 15, 21, 22, 24, 25, 34, 36, 37, 39, 42, 44, 45, 56, 57, 58], "until": [9, 15, 21, 39, 44], "w": [9, 21, 39, 47, 48], "nnpack": 9, "cpp": [9, 24, 28], "53": [9, 15, 21, 26, 34, 42, 43, 48, 56, 57], "could": [9, 24, 25, 26, 31, 32, 33, 34, 37, 52], "reason": [9, 14, 22, 24, 25, 
26, 32, 57], "unsupport": 9, "hardwar": 9, "687": 9, "241": [9, 21, 34], "transcript": [9, 15, 21, 34, 35, 36, 37, 39, 42, 43, 47, 48, 56, 57, 58], "store": [9, 11, 15, 21, 39, 52], "recog": [9, 15, 21, 36, 39], "test_set": [9, 15, 21, 50], "688": 9, "564": [9, 15, 21], "240": [9, 15, 21, 34, 50], "ins": [9, 15, 21, 39, 50], "del": [9, 15, 21, 39, 50], "sub": [9, 15, 21, 39, 50], "690": 9, "249": [9, 21, 25], "wrote": [9, 15, 21, 39], "detail": [9, 11, 15, 21, 23, 27, 31, 34, 36, 37, 39, 40, 42, 43, 44, 45, 47, 48, 50, 52, 54, 56, 57, 58, 60, 61], "stat": [9, 15, 21, 39], "err": [9, 15, 21, 36, 39], "316": [9, 21, 39], "congratul": [9, 13, 21, 24, 25, 26, 34, 37, 39, 43, 47, 48, 50, 60], "finish": [9, 14, 34, 36, 37, 39, 40, 42, 43, 47, 48, 50, 57, 58], "successfulli": [9, 13, 21, 24, 25, 26, 60], "collect": [10, 13, 21, 52], "post": 10, "correspond": [10, 18, 19], "solut": 10, "One": 10, "torchaudio": [10, 13, 20, 54], "cu111": 10, "torchvis": 10, "f": [10, 13, 15, 21, 47, 48, 60], "org": [10, 13, 21, 35, 36, 42, 52, 56, 57, 58], "whl": [10, 13, 21], "torch_stabl": [10, 13, 21], "throw": [10, 24, 25, 26], "while": [10, 16, 21, 24, 25, 26, 31, 34, 36, 37, 39, 40, 42, 44, 45, 52, 56, 57, 58], "That": [10, 11, 14, 15, 16, 24, 25, 40, 42, 56, 57, 58], "cu11": 10, "correct": 10, "traceback": 10, "most": [10, 57, 58], "recent": [10, 24, 25, 26], "last": 10, "yesnoasrdatamodul": 10, "home": [10, 24, 25, 34, 39], "xxx": [10, 22, 24, 25, 26], "next": [10, 13, 14, 19, 21, 24, 25, 26, 39, 40, 42, 43, 44, 45, 52, 56, 57, 58], "gen": [10, 13, 14, 19, 21, 39, 40, 42, 43, 44, 45, 56, 57, 58], "kaldi": [10, 11, 13, 14, 19, 21, 39, 40, 42, 43, 44, 45, 56, 57, 58], "34": [10, 24, 25], "datamodul": 10, "add_eo": 10, "add_so": 10, "get_text": 10, "39": [10, 21, 24, 26, 36, 39, 43, 47], "tensorboard": [10, 16, 21, 34, 36, 37, 39, 42, 43, 44, 45, 47, 48, 50, 56, 57, 58, 60, 61], "summarywrit": 10, "miniconda3": 10, "env": 10, "yyi": 10, "loosevers": 10, "uninstal": 10, "setuptool": [10, 13, 21], "yangyifan": 10, "anaconda3": 10, "dev20230112": 10, "linux": [10, 13, 14, 19, 21, 23, 24, 25, 26, 27], "x86_64": [10, 21, 24], "egg": 10, "handl": [10, 34, 37, 39, 40, 42, 43, 44, 45, 47, 48, 50, 56, 57, 58], "except": [10, 22], "anoth": 10, "occur": 10, "pruned_transducer_stateless7_ctc_b": [10, 44], "104": [10, 15, 21], "rais": 10, "anaconda": 10, "maco": [10, 13, 14, 19, 23, 24, 25, 26, 27], "probabl": [10, 36, 42, 44, 56, 57, 58], "variabl": [10, 12, 13, 16, 21, 24, 25, 26, 34, 37, 39, 40, 42, 44, 45, 56, 57, 58], "dyld_library_path": 10, "conda_prefix": 10, "locat": [10, 16, 24], "libpython": 10, "abl": 10, "insid": [10, 29], "codna_prefix": 10, "ld_library_path": 10, "setup": [11, 14, 20, 21, 24, 31, 32, 34, 36, 37, 39, 40, 42, 43, 47, 48, 50, 57, 58, 60, 61], "everyth": [11, 23], "tmp": [11, 12, 13, 15, 16, 21, 34, 36, 37, 39, 40, 42, 43, 44, 45, 47, 48, 50, 56, 57, 58, 60], "each": [11, 15, 22, 24, 25, 27, 31, 34, 36, 37, 39, 42, 44, 45, 52, 54, 56, 57, 58], "exist": 11, "anyth": [11, 17, 19], "els": 11, "wonder": [11, 15], "url": [11, 34, 36, 37, 39, 42, 44, 45, 50, 56, 57], "varieti": 11, "folder": [11, 21, 22, 34, 36, 37, 39, 42, 43, 44, 45, 47, 48, 50, 56, 57, 58], "wav": [11, 15, 22, 24, 25, 26, 27, 29, 34, 36, 37, 39, 42, 44, 45, 47, 48, 50, 56, 57, 58, 60, 61], "scp": 11, "feat": 11, "put": [11, 13, 21, 24, 25, 44, 57], "l": [11, 21, 24, 25, 26, 36, 47, 48, 50], "waves_yesno": [11, 15, 21], "tar": [11, 21, 60], "gz": [11, 21, 52], "l41": 11, "extract": [11, 21, 34, 36, 37, 39, 40, 42, 43, 44, 45, 47, 
48, 50, 56, 57, 58], "yesno_cuts_test": 11, "jsonl": [11, 22], "yesno_cuts_train": 11, "yesno_feats_test": 11, "lca": 11, "yesno_feats_train": 11, "hlg": [11, 15, 21, 43, 47, 48, 50], "l_disambig": [11, 50], "lexicon_disambig": [11, 50], "manifest": [11, 21, 31, 32, 40], "yesno_recordings_test": 11, "yesno_recordings_train": 11, "yesno_supervisions_test": 11, "yesno_supervisions_train": 11, "18": [11, 21, 24, 25, 26, 34, 36, 37, 39, 42, 43, 47, 48, 56, 57, 58], "thei": [11, 34, 36, 37, 39, 40, 42, 44, 45, 56, 57, 58], "idea": [11, 15, 54], "examin": 11, "relat": [11, 22, 31, 32, 34, 36, 39, 43, 47, 48, 50, 60, 61], "gunzip": 11, "c": [11, 21, 36, 37, 42, 44, 45, 50, 56, 57, 58, 60], "head": [11, 21, 31, 36, 54], "output": [11, 12, 13, 15, 22, 24, 25, 26, 32, 34, 36, 37, 39, 40, 42, 43, 44, 45, 47, 48, 50, 54, 56, 57, 58, 60], "id": [11, 34, 37, 39, 43, 47, 48], "0_0_0_0_1_1_1_1": 11, "channel": [11, 19, 21, 34, 36, 37, 39, 40, 42, 43, 44, 45, 47, 48, 50, 56, 57, 58], "sampling_r": 11, "num_sampl": 11, "50800": 11, "35": [11, 21, 22, 24, 25, 26, 36, 39, 56], "channel_id": 11, "0_0_0_1_0_1_1_0": 11, "48880": 11, "0_0_1_0_0_1_1_0": 11, "48160": 11, "audio": [11, 21, 47, 48, 60], "l300": 11, "mean": [11, 14, 15, 24, 25, 26, 34, 36, 37, 39, 42, 43, 44, 45, 47, 48, 50, 54, 56, 57, 58], "field": [11, 35], "per": [11, 36, 42, 57, 58], "recording_id": 11, "NO": [11, 15, 50], "ye": [11, 15, 50], "hebrew": [11, 50], "supervis": [11, 20, 33, 62], "l510": 11, "furthermor": [11, 36], "featur": [11, 21, 34, 36, 37, 39, 40, 42, 43, 44, 45, 47, 48, 50, 54, 56, 57, 58], "compress": [11, 21], "lilcom": [11, 21], "cutset": [11, 32], "recordingset": 11, "supervisionset": 11, "featureset": 11, "num_fram": 11, "635": 11, "num_featur": 11, "frame_shift": 11, "storage_typ": 11, "lilcom_chunki": 11, "storage_path": 11, "storage_kei": 11, "13000": 11, "3570": 11, "record": [11, 19, 25, 26, 34, 35, 36, 37, 39, 42, 43, 44, 45, 47, 48, 50, 56, 57, 58], "monocut": 11, "611": 11, "16570": 11, "12964": 11, "2929": 11, "602": 11, "32463": 11, "12936": 11, "2696": 11, "actual": [11, 34, 36, 37, 39, 42, 44, 45, 56, 57, 58], "separ": [11, 27, 52], "lang": [11, 21, 22, 36, 39, 45], "quit": [12, 14, 16, 33, 34, 36, 37, 39, 42, 44, 45, 52, 56, 57, 58], "cuda_visible_devic": [12, 16, 21, 34, 36, 37, 39, 40, 42, 43, 44, 45, 47, 48, 50, 56, 57, 58, 60, 61], "usag": [12, 15, 16, 22, 24, 25, 26, 28, 29, 43, 47, 48, 50, 59], "one": [12, 19, 22, 24, 25, 26, 34, 36, 37, 39, 42, 43, 44, 45, 47, 48, 50, 54, 56, 57, 58, 60], "tini": [13, 14], "well": [13, 22, 31, 50, 62], "hundr": 13, "thousand": 13, "virtualenv": [13, 21], "icefall_env": [13, 15], "interpret": 13, "usr": 13, "prefix": [13, 22], "pkg_resourc": 13, "wheel": [13, 21, 24], "remeb": 13, "continu": [13, 15, 24, 25, 26, 27, 34, 36, 37, 39, 42, 44, 45, 50, 56, 57], "caution": [13, 34, 39], "matter": [13, 21, 24], "torchaduio": 13, "from_wheel": [13, 15, 21], "dev20231220": 13, "china": [13, 21, 35], "\u4e2d\u56fd\u56fd\u5185\u7528\u6237": [13, 21], "\u5982\u679c\u8bbf\u95ee\u4e0d\u4e86": [13, 21], "\u8bf7\u4f7f\u7528": [13, 21], "cn": [13, 21], "anytim": 13, "modulenotfounderror": 13, "don": [13, 14, 15, 21, 24, 25, 26, 28, 32, 34, 37, 39, 42, 43, 44, 45, 47, 48, 50, 52, 56, 57, 58, 60, 61], "walk": 14, "recognit": [14, 19, 20, 23, 24, 25, 33, 35, 36, 50, 62], "system": [14, 52], "out": [14, 40, 52], "minut": [14, 52], "sequenti": 14, "part": [14, 15, 19, 21, 34, 36, 37, 39, 42, 43, 44, 45, 47, 48, 50, 54, 56, 57, 58, 60, 61], "window": [14, 19, 23, 24, 25, 26, 27], "commun": 14, 
"appreci": 14, "virtual": 14, "curiou": 14, "quick": 14, "state_dict": [14, 20, 30, 34, 36, 37, 39, 43, 47, 48, 50], "jit": [14, 20, 23, 30, 39], "onnx": [14, 20, 22, 30, 31, 59, 61], "torchscript": [15, 23, 28, 29, 30], "trace": [15, 20, 23, 28, 30], "explain": 15, "kind": [15, 39, 42, 44, 45, 56, 57, 58], "produc": [15, 23, 42, 44, 45, 56, 57, 58], "03": [15, 21, 22, 25, 31, 36, 39, 47, 48, 56, 60], "912": [15, 22], "76": [15, 21, 50], "lr": [15, 21, 31, 32, 36, 56], "weight_decai": [15, 21], "1e": [15, 21], "start_epoch": [15, 21], "best_train_loss": [15, 21, 22, 24, 25, 26], "inf": [15, 21, 22, 24, 25, 26], "best_valid_loss": [15, 21, 22, 24, 25, 26], "best_train_epoch": [15, 21, 22, 24, 25, 26], "best_valid_epoch": [15, 21, 22, 25, 26], "batch_idx_train": [15, 21, 22, 24, 25, 26], "log_interv": [15, 21, 22, 24, 25, 26], "reset_interv": [15, 21, 22, 24, 25, 26], "valid_interv": [15, 21, 22, 24, 25, 26], "beam_siz": [15, 21, 22, 36], "sum": [15, 21], "913": 15, "950": 15, "971": [15, 48], "106": [15, 21, 25, 39], "Not": 15, "974": 15, "111": [15, 21, 39], "kei": [15, 24, 25, 26, 39], "bia": 15, "running_mean": 15, "running_var": 15, "num_batches_track": 15, "output_linear": 15, "48": [15, 21, 24, 25, 31, 34, 36], "089": 15, "090": 15, "ad79f1c699c684de9785ed6ca5edb805a41f78c3": 15, "wed": [15, 21, 24, 34, 36, 39], "26": [15, 21, 24, 25, 26, 36, 39, 48], "09": [15, 22, 25, 34, 36, 37, 39, 56], "aa073f6": 15, "none": [15, 21, 34, 39], "9a47c08": 15, "mon": [15, 25, 26], "aug": [15, 40], "50": [15, 21, 22, 24, 25, 26, 39, 42, 47, 56, 57, 58], "privat": 15, "fangjun": [15, 21, 22, 24, 25, 26, 36, 39], "macbook": 15, "pro": [15, 34, 39], "127": [15, 21, 24, 25, 50], "092": 15, "103": 15, "272": 15, "109": [15, 21, 34, 39], "112": [15, 24, 25, 26], "115": [15, 24, 25, 34, 39], "253": 15, "386": 15, "556": 15, "557": 15, "558": 15, "248": [15, 36], "559": 15, "315": [15, 24, 34, 36, 37, 39, 43], "ident": 15, "kaldifeat": 15, "csukuangfj": [15, 21, 22, 24, 25, 27, 34, 36, 37, 39, 43, 47, 48, 50, 56, 60], "dev20231221": 15, "0_0_0_1_0_0_0_1": [15, 50], "0_0_1_0_0_0_1_0": [15, 50], "19": [15, 22, 24, 25, 26, 31, 32, 34, 39, 43, 47, 48], "208": [15, 39], "136": [15, 39], "num_class": [15, 34, 39, 50], "sample_r": [15, 22, 34, 36, 39, 50], "words_fil": [15, 34, 39, 50], "sound_fil": [15, 22, 34, 36, 39, 50], "142": [15, 24, 34, 37, 39], "144": [15, 39], "212": 15, "213": [15, 50], "construct": [15, 22, 24, 25, 26, 34, 36, 37, 39, 43, 47, 48, 50], "170": [15, 43], "sound": [15, 22, 24, 25, 26, 29, 30, 34, 36, 37, 39, 43, 47, 48, 50], "224": 15, "176": [15, 24, 36, 39], "304": [15, 25], "214": [15, 36, 39], "47": [15, 21, 24, 25, 26, 32, 34, 39], "44": [15, 21, 24, 25, 31, 39, 47, 48], "666": 15, "667": 15, "670": 15, "677": [15, 24], "100": [15, 21, 34, 36, 37, 39, 40, 42, 44, 45, 56, 57, 58], "843": 15, "cpu_jit": [15, 28, 34, 39, 42, 44, 45, 57, 58], "confus": [15, 28], "move": [15, 28, 42, 44, 45, 57, 58], "map_loc": 15, "resid": 15, "default": [15, 24, 25, 26, 34, 36, 37, 39, 40, 42, 43, 44, 45, 47, 48, 50, 56, 57, 58], "jit_pretrain": [15, 29, 44, 45, 56], "nn": [15, 36, 42, 44, 45, 56, 57, 58], "56": [15, 21, 24, 25, 39, 47], "00": [15, 21, 24, 34, 36, 37, 39, 43, 47, 48, 50], "603": 15, "121": [15, 43], "nn_model": [15, 34, 39], "129": [15, 37], "640": [15, 21, 26], "134": [15, 34], "641": 15, "138": [15, 34, 36], "148": [15, 31], "642": 15, "154": [15, 37], "727": 15, "190": [15, 43], "192": [15, 26, 31, 39], "export_onnx": 15, "onnxruntim": [15, 27], "888": [15, 34], "83": [15, 39, 43], 
"892": 15, "diagnost": 15, "verbos": 15, "warn": 15, "21": [15, 21, 22, 24, 31, 34, 36, 39, 47, 48], "047": [15, 36], "meta_data": 15, "model_typ": 15, "model_author": 15, "comment": 15, "non": [15, 20, 39, 54, 57, 62], "vocab_s": [15, 22, 24, 25, 26, 36], "049": 15, "140": [15, 21, 37], "int8": [15, 23, 30, 61], "quantiz": [15, 23, 30, 40], "075": 15, "onnx_quant": 15, "538": [15, 39], "tensor": [15, 21, 25, 26, 34, 36, 37, 39, 42, 50, 56, 57], "transpose_1_output_0": 15, "081": 15, "151": [15, 24], "float32": [15, 24, 25, 26], "onnx_pretrain": [15, 27], "260": [15, 26, 39], "166": 15, "171": [15, 21, 37, 39, 47, 48], "173": 15, "267": [15, 25, 36, 47, 48], "270": 15, "180": [15, 25, 34, 39], "279": [15, 39], "196": 15, "318": [15, 24, 25], "232": 15, "234": [15, 39], "deploi": [15, 27, 34, 39], "sherpa": [15, 19, 23, 28, 29, 30, 56, 59], "framework": [15, 19, 42, 57], "_": [15, 40], "ncnn": [15, 20, 30], "youtub": [17, 20, 39, 40, 42, 43, 44, 45, 56, 57, 58], "video": [17, 20, 39, 40, 42, 43, 44, 45, 56, 57, 58], "upload": [18, 19, 34, 36, 37, 39, 40, 42, 43, 44, 45, 47, 48, 50, 56, 57, 58], "specif": [18, 27, 36], "aishel": [18, 20, 34, 36, 37, 38, 62], "wenetspeech": [18, 28], "ipad": 19, "phone": 19, "screenshot": [19, 34, 36, 37, 39, 40, 42, 50, 56, 57], "chines": [19, 35, 36], "english": [19, 32, 50, 56], "greedi": 19, "click": [19, 21, 34, 36, 37, 39, 42, 44, 45, 50, 56, 57], "button": 19, "submit": 19, "wait": 19, "moment": 19, "bottom": [19, 42, 44, 45, 56, 57, 58], "subscrib": [19, 21, 39, 40, 42, 43, 44, 45, 56, 57, 58], "nadira": [19, 21, 39, 40, 42, 43, 44, 45, 56, 57, 58], "povei": [19, 21, 39, 40, 42, 43, 44, 45, 56, 57, 58], "www": [19, 21, 35, 39, 40, 42, 43, 44, 45, 52, 56, 57, 58], "uc_vaumpkminz1pnkfxan9mw": [19, 21, 39, 40, 42, 43, 44, 45, 56, 57, 58], "dummi": [20, 39], "toolkit": 20, "cudnn": 20, "docker": [20, 21], "frequent": 20, "ask": [20, 60], "question": 20, "faq": 20, "oserror": 20, "libtorch_hip": 20, "attributeerror": 20, "distutil": 20, "attribut": [20, 26, 39], "libpython3": 20, "timit": [20, 38, 47, 48, 62], "tt": [20, 60, 61, 62], "vit": [20, 59, 62], "ljspeech": [20, 59, 62], "vctk": [20, 59, 62], "fine": [20, 40, 62], "finetun": [20, 33, 62], "zipform": [20, 23, 27, 30, 33, 38, 41, 52, 53, 55, 62], "adapt": [20, 33, 62], "contribut": 20, "support": [21, 23, 24, 25, 26, 34, 36, 39, 42, 44, 45, 54, 56, 57, 58, 60, 61], "guid": 21, "suggest": [21, 32, 42, 44, 45, 56, 57, 58], "strongli": 21, "point": [21, 22, 34, 37, 39, 40, 42, 44, 45, 56, 57, 58], "sever": [21, 22, 31, 32, 34, 36, 37, 39, 40, 42, 43, 44, 45, 47, 48, 50, 54, 56, 57, 58], "just": [21, 24, 25, 26, 52, 54], "kuangfangjun": [21, 24, 25, 26], "cpython3": 21, "final": [21, 22, 24, 25, 39, 43], "64": [21, 22, 24, 31, 36, 57], "9422m": 21, "creator": 21, "cpython3posix": 21, "dest": 21, "star": [21, 24, 25, 26], "fj": [21, 22, 24, 25, 26, 36, 39], "clear": 21, "no_vcs_ignor": 21, "global": 21, "seeder": 21, "fromappdata": 21, "bundl": 21, "app_data_dir": 21, "ad": [21, 24, 25, 26, 34, 36, 37, 39, 42, 44, 45, 50, 54, 56, 57, 58], "seed": 21, "bashactiv": 21, "cshellactiv": 21, "fishactiv": 21, "nushellactiv": 21, "powershellactiv": 21, "pythonactiv": 21, "determin": 21, "nvidia": [21, 34, 36, 37, 39], "smi": 21, "49": [21, 24, 25, 39, 48, 50], "510": 21, "driver": 21, "greater": 21, "our": [21, 24, 25, 26, 28, 29, 39, 40, 42, 54, 57, 58], "case": [21, 22, 24, 25, 26, 33, 42, 44, 45, 56, 57, 58], "verifi": 21, "nvcc": 21, "copyright": 21, "2005": 21, "2019": 21, "corpor": 21, "wed_oct_23_19": 21, 
"38_pdt_2019": 21, "v10": 21, "89": [21, 34], "cu116": 21, "compat": 21, "stabl": 21, "matrix": 21, "2bcu116": 21, "cp38": 21, "linux_x86_64": 21, "1983": 21, "mb": [21, 24, 25, 26], "________________________________________": 21, "gb": [21, 36], "764": 21, "kb": [21, 24, 25, 26, 47, 48], "eta": 21, "satisfi": 21, "extens": 21, "__version__": 21, "dev20230725": 21, "pypi": 21, "tuna": 21, "tsinghua": 21, "edu": 21, "resolv": 21, "main": [21, 34, 39, 54], "ubuntu": [21, 24, 25, 26], "2bcuda11": 21, "manylinux_2_17_x86_64": 21, "manylinux2014_x86_64": 21, "graphviz": 21, "cach": [21, 26], "de": [21, 22, 24, 25, 26, 36], "5e": 21, "fcbb22c68208d39edff467809d06c9d81d7d27426460ebc598e55130c1aa": 21, "o": 21, "cento": 21, "2009": 21, "core": 21, "cmake": [21, 24, 25, 34, 39], "27": [21, 24, 25, 26, 31, 32, 34, 36, 43, 48], "gcc": 21, "cmake_cuda_flag": 21, "wno": 21, "deprec": [21, 36], "lineinfo": 21, "expt": 21, "extend": 21, "lambda": 21, "use_fast_math": 21, "xptxa": 21, "gencod": 21, "arch": 21, "compute_35": 21, "sm_35": 21, "compute_50": 21, "sm_50": 21, "compute_60": 21, "sm_60": 21, "compute_61": 21, "sm_61": 21, "compute_70": 21, "sm_70": 21, "compute_75": 21, "sm_75": 21, "compute_80": 21, "sm_80": 21, "compute_86": 21, "sm_86": 21, "donnx_namespac": 21, "onnx_c2": 21, "compute_52": 21, "sm_52": 21, "xcudaf": 21, "diag_suppress": 21, "cc_clobber_ignor": 21, "integer_sign_chang": 21, "useless_using_declar": 21, "set_but_not_us": 21, "field_without_dll_interfac": 21, "base_class_has_different_dll_interfac": 21, "dll_interface_conflict_none_assum": 21, "dll_interface_conflict_dllexport_assum": 21, "implicit_return_from_non_void_funct": 21, "unsigned_compare_with_zero": 21, "declared_but_not_referenc": 21, "bad_friend_decl": 21, "relax": 21, "constexpr": 21, "d_glibcxx_use_cxx11_abi": 21, "option": [21, 23, 27, 30, 36, 40, 43, 47, 48, 50], "wall": 21, "strict": [21, 26, 35], "overflow": 21, "unknown": 21, "pragma": 21, "cmake_cxx_flag": 21, "unus": 21, "nvtx": 21, "disabl": [21, 22, 24, 25], "debug": 21, "sync": 21, "kernel": [21, 24, 26, 31, 36], "memori": [21, 24, 31, 34, 36, 39, 54], "alloc": 21, "214748364800": 21, "byte": [21, 24, 25, 26], "200": [21, 22, 24, 25, 26, 34, 39, 40, 47, 48, 50], "abort": 21, "__file__": 21, "cpython": [21, 24], "gnu": [21, 24], "req": 21, "vq12fd5i": 21, "filter": 21, "quiet": [21, 35], "7640d663469b22cd0b36f3246ee9b849cd25e3b7": 21, "metadata": [21, 47, 48], "pyproject": 21, "toml": 21, "cytoolz": 21, "3b": 21, "a7828d575aa17fb7acaf1ced49a3655aa36dad7e16eb7e6a2e4df0dda76f": 21, "33": [21, 24, 25, 34, 35, 36, 39, 47], "pyyaml": 21, "c8": 21, "6b": 21, "6600ac24725c7388255b2f5add93f91e58a5d7efaf4af244fdbcc11a541b": 21, "ma": 21, "nylinux_2_17_x86_64": 21, "736": 21, "dataclass": 21, "2f": 21, "1095cdc2868052dd1e64520f7c0d5c8c550ad297e944e641dbf1ffbb9a5d": 21, "dev0": 21, "7640d66": 21, "a8": 21, "df0a69c52bd085ca1ad4e5c4c1a5c680e25f9477d8e49316c4ff1e5084a4": 21, "linux_2_17_x86_64": 21, "87": [21, 24], "tqdm": 21, "e6": 21, "a2cff6306177ae6bc73bc0665065de51dfb3b9db7373e122e2735faf0d97": 21, "numpi": 21, "audioread": 21, "5d": 21, "cb": 21, "82a002441902dccbe427406785db07af10182245ee639ea9f4d92907c923": 21, "377": 21, "tabul": 21, "40": [21, 24, 25, 26, 37, 39, 43, 47, 48], "4a5f08c96eb108af5cb50b41f76142f0afa346dfa99d5296fe7202a11854": 21, "1a": 21, "70": 21, "e63223f8116931d365993d4a6b7ef653a4d920b41d03de7c59499962821f": 21, "97": [21, 24, 34], "ab": [21, 42, 56, 57, 58], "c3": 21, "57f0601a2d4fe15de7a553c00adbc901425661bf048f2a22dfc500caf121": 21, 
"intervaltre": 21, "fb": 21, "396d568039d21344639db96d940d40eb62befe704ef849b27949ded5c3bb": 21, "soundfil": 21, "bd": 21, "0602167a213d9184fc688b1086dc6d374b7ae8c33eccf169f9b50ce6568c": 21, "py2": 21, "46": [21, 25, 34, 39], "toolz": 21, "7f": 21, "5c": 21, "922a3508f5bda2892be3df86c74f9cf1e01217c2b1f8a0ac4841d903e3e9": 21, "55": [21, 24, 37, 39, 47], "sortedcontain": 21, "9cb0e58b2deb7f82b84065f37f3bffeb12413f947f9388e4cac22c4621c": 21, "cffi": 21, "b7": 21, "8b": 21, "06f30caa03b5b3ac006de4f93478dbd0239e2a16566d81a106c322dc4f79": 21, "15": [21, 22, 24, 25, 26, 31, 32, 36, 37, 39, 47, 50, 52], "442": 21, "pycpars": 21, "d5": 21, "5f610ebe421e85889f2e55e33b7f9a6795bd982198517d912eb1c76e1a53": 21, "118": [21, 39], "filenam": [21, 24, 25, 26, 27, 28, 29, 44, 45, 56, 58, 60, 61], "size": [21, 22, 24, 25, 26, 31, 32, 34, 36, 37, 39, 40, 42, 43, 44, 45, 47, 48, 50, 52, 56, 57, 58, 60, 61], "687627": 21, "sha256": 21, "cbf0a4d2d0b639b33b91637a4175bc251d6a021a069644ecb1a9f2b3a83d072a": 21, "ephem": 21, "wwtk90_m": 21, "7a": 21, "8e": 21, "a0bf241336e2e3cb573e1e21e5600952d49f5162454f2e612f": 21, "23704": 21, "5e2d3537c96ce9cf0f645a654c671163707bf8cb8d9e358d0e2b0939a85ff4c2": 21, "9c": 21, "f19ae5a03f8862d9f0776b0c0570f1fdd60a119d90954e3f39": 21, "26098": 21, "2604170976cfffe0d2f678cb1a6e5b525f561cd50babe53d631a186734fec9f9": 21, "f3": 21, "ed": 21, "2b": 21, "c179ebfad4e15452d6baef59737f27beb9bfb442e0620f7271": 21, "remot": 21, "enumer": 21, "12942": 21, "count": 21, "total": [21, 25, 26, 31, 34, 36, 37, 39, 40, 42, 43, 50, 56, 57], "delta": 21, "reus": 21, "pack": [21, 52, 57, 58], "12875": 21, "receiv": 21, "mib": 21, "8835": 21, "41": [21, 24, 26, 34, 36, 47, 50], "dl_dir": [21, 34, 37, 39, 40, 42, 44, 45, 56, 57, 58], "___________________________________________________": 21, "70m": 21, "1mb": 21, "718": 21, "compute_fbank_yesno": 21, "_______________________________________________________________________________": 21, "90": [21, 24], "82it": 21, "778": 21, "______________________________________________________________________________": 21, "256": [21, 26, 31, 47, 48], "92it": 21, "51": [21, 24, 34, 39, 50], "66": [21, 25, 32], "project": 21, "kaldilm": 21, "csrc": [21, 39], "arpa_file_pars": 21, "cc": 21, "void": 21, "arpafilepars": 21, "std": 21, "istream": 21, "79": 21, "92": [21, 39], "275": [21, 34], "compile_hlg": 21, "124": [21, 34, 39], "276": 21, "convert": [21, 24, 25, 26, 39], "309": 21, "ctc_topo": 21, "max_token_id": 21, "310": 21, "314": 21, "intersect": [21, 42, 57, 58], "323": 21, "lg": [21, 42, 45, 57, 58], "shape": [21, 26], "connect": [21, 22, 31, 39, 42, 43, 56, 57, 58], "68": [21, 39], "class": [21, 39], "71": [21, 39, 43], "341": 21, "rag": 21, "raggedtensor": 21, "remov": [21, 34, 36, 37, 39, 43, 47, 48], "disambigu": 21, "354": 21, "91": 21, "remove_epsilon": 21, "445": 21, "arc": 21, "compos": 21, "h": 21, "446": 21, "447": 21, "segment": 21, "fault": 21, "dump": 21, "protocol_buffers_python_implement": 21, "674": 21, "interest": [21, 40, 42, 44, 45, 56, 57, 58], "936": 21, "481": 21, "482": 21, "world_siz": [21, 40], "master_port": 21, "12354": 21, "num_epoch": 21, "3fb0a43": 21, "thu": [21, 22, 24, 25, 26, 36, 39, 43], "05": [21, 22, 24, 25, 31, 32, 34, 36, 37, 39, 48, 52, 61], "74279": [21, 22, 24, 25, 26, 36], "1220091118": 21, "57c4d55446": 21, "sph26": 21, "941": 21, "949": 21, "495": 21, "965": [21, 34], "146": 21, "244": 21, "967": 21, "149": [21, 24, 39], "199": [21, 39, 43], "singlecutsampl": 21, "205": [21, 39], "968": 21, "565": [21, 39], "422": 21, 
"loss": [21, 24, 25, 34, 37, 39, 40, 42, 43, 44, 45, 47, 48, 50, 56, 57, 58], "065": 21, "over": [21, 34, 36, 37, 39, 42, 44, 45, 56, 57, 58], "2436": 21, "frame": [21, 31, 36, 42, 44, 57, 58], "tot_loss": 21, "681": [21, 24], "4561": 21, "2828": 21, "7076": 21, "22192": 21, "54": [21, 25, 26, 39, 43, 47, 48], "167": 21, "444": 21, "9002": 21, "18067": 21, "011": 21, "2555": 21, "2695": 21, "484": 21, "34971": 21, "331": [21, 24, 25, 39, 43], "4688": 21, "368": 21, "75": [21, 24], "633": 21, "2532": 21, "242": [21, 34, 39], "1139": 21, "1592": 21, "522": [21, 39], "1627": 21, "209": [21, 43], "07055": 21, "1175": 21, "07091": 21, "847": 21, "07731": 21, "427": [21, 25, 39], "04391": 21, "05341": 21, "884": 21, "04384": 21, "387": [21, 48], "03458": 21, "04616": 21, "707": [21, 34, 39], "03379": 21, "758": [21, 39], "433": [21, 39], "01054": 21, "980": [21, 39], "009014": 21, "009974": 21, "489": [21, 34], "01085": 21, "258": [21, 47, 48], "01172": 21, "01055": 21, "621": [21, 50], "01074": 21, "699": 21, "866": 21, "01044": 21, "844": 21, "008942": 21, "221": [21, 39], "01082": 21, "970": [21, 39], "01169": 21, "247": 21, "01073": 21, "326": [21, 25], "555": 21, "840": 21, "841": 21, "855": 21, "868": 21, "882": 21, "883": 21, "157": 21, "701": 21, "702": [21, 39], "704": [21, 34, 47], "fun": [21, 24, 25], "variou": [21, 27, 30, 62], "period": [22, 24], "disk": 22, "optim": [22, 34, 36, 37, 39, 42, 43, 44, 45, 47, 48, 50, 56, 57, 58], "resum": [22, 31, 32, 34, 36, 37, 39, 42, 43, 44, 45, 47, 48, 50, 56, 57, 58], "strip": 22, "reduc": [22, 34, 36, 37, 39, 42, 43, 44, 45, 47, 48, 50, 56, 57, 58], "pruned_transducer_stateless3": [22, 28, 54], "almost": [22, 42, 54, 57, 58], "dict": [22, 26], "stateless3": [22, 24], "repo": [22, 27], "those": 22, "wave": [22, 24, 25, 26, 34, 39], "iter": [22, 24, 25, 26, 29, 42, 44, 45, 56, 57, 58], "1224000": 22, "greedy_search": [22, 31, 32, 36, 42, 44, 56, 57, 58], "test_wav": [22, 24, 25, 26, 27, 34, 36, 37, 39, 43, 47, 48, 50], "1089": [22, 24, 25, 26, 27, 39, 43], "134686": [22, 24, 25, 26, 27, 39, 43], "0001": [22, 24, 25, 26, 27, 39, 43], "1221": [22, 24, 25, 39, 43], "135766": [22, 24, 25, 39, 43], "0002": [22, 24, 25, 39, 43], "multipl": [22, 34, 36, 37, 39, 43, 47, 48, 50], "Its": [22, 24, 25, 26, 39], "233": [22, 24, 25], "265": 22, "3000": [22, 24, 25, 26], "80": [22, 24, 25, 26, 34, 36, 39], "subsampling_factor": [22, 25, 26, 34, 36, 39], "encoder_dim": [22, 24, 25, 26], "512": [22, 24, 25, 26, 31, 34, 36, 39], "nhead": [22, 24, 26, 34, 36, 39, 42, 57], "dim_feedforward": [22, 24, 25, 36], "num_encoder_lay": [22, 24, 25, 26, 36], "decoder_dim": [22, 24, 25, 26], "joiner_dim": [22, 24, 25, 26], "model_warm_step": [22, 24, 25], "4810e00d8738f1a21278b0156a42ff396a2d40ac": 22, "oct": [22, 39], "miss": [22, 24, 25, 26, 36, 39], "cu102": [22, 24, 25, 26], "1013": 22, "c39cba5": 22, "dirti": [22, 24, 25, 34, 39], "ceph": [22, 34, 36, 39], "0324160024": 22, "65bfd8b584": 22, "jjlbn": 22, "bpe_model": [22, 24, 25, 26, 39], "16000": [22, 34, 36, 37, 39, 43, 44, 47, 48], "max_context": 22, "max_stat": 22, "context_s": [22, 24, 25, 26, 36], "max_sym_per_fram": [22, 36], "simulate_stream": 22, "decode_chunk_s": 22, "left_context": 22, "dynamic_chunk_train": 22, "causal_convolut": 22, "short_chunk_s": [22, 26, 57, 58], "num_left_chunk": [22, 26], "blank_id": [22, 24, 25, 26, 36], "unk_id": 22, "271": [22, 25], "612": 22, "458": 22, "giga": [22, 25, 56], "623": 22, "277": 22, "78648040": 22, "951": [22, 39], "285": [22, 36, 39], "952": 22, "295": [22, 34, 36, 
37, 39], "957": 22, "301": [22, 39], "700": 22, "329": [22, 25, 39], "388": 22, "earli": [22, 24, 25, 26, 39, 43], "nightfal": [22, 24, 25, 26, 39, 43], "THE": [22, 24, 25, 26, 39, 43], "yellow": [22, 24, 25, 26, 39, 43], "lamp": [22, 24, 25, 26, 39, 43], "light": [22, 24, 25, 26, 39, 43], "AND": [22, 24, 25, 26, 39, 43], "THERE": [22, 24, 25, 26, 39, 43], "squalid": [22, 24, 25, 26, 39, 43], "quarter": [22, 24, 25, 26, 39, 43], "OF": [22, 24, 25, 26, 39, 43], "brothel": [22, 24, 25, 26, 39, 43], "god": [22, 39, 43], "AS": [22, 39, 43], "direct": [22, 39, 43], "consequ": [22, 39, 43], "sin": [22, 39, 43], "man": [22, 39, 43], "punish": [22, 39, 43], "had": [22, 39, 43], "her": [22, 39, 43], "love": [22, 39, 43], "child": [22, 39, 43], "whose": [22, 36, 39, 43], "ON": [22, 24, 39, 43], "THAT": [22, 39, 43], "dishonor": [22, 39, 43], "bosom": [22, 39, 43], "TO": [22, 39, 43], "parent": [22, 39, 43], "forev": [22, 39, 43], "WITH": [22, 39, 43], "race": [22, 39, 43], "descent": [22, 39, 43], "mortal": [22, 39, 43], "BE": [22, 39, 43], "bless": [22, 39, 43], "soul": [22, 39, 43], "IN": [22, 39, 43], "heaven": [22, 39, 43], "yet": [22, 24, 25, 39, 43], "THESE": [22, 39, 43], "thought": [22, 39, 43], "affect": [22, 39, 43], "hester": [22, 39, 43], "prynn": [22, 39, 43], "hope": [22, 35, 39, 43], "apprehens": [22, 39, 43], "390": 22, "down": [22, 34, 39, 42, 44, 45, 56, 57, 58], "reproduc": [22, 39], "9999": [22, 44, 45, 56], "symlink": 22, "pass": [22, 26, 34, 36, 37, 39, 42, 44, 45, 54, 56, 57, 58], "convemform": [23, 30, 54], "platform": [23, 27], "android": [23, 24, 25, 26, 27, 60], "raspberri": [23, 27], "pi": [23, 27], "\u7231\u82af\u6d3e": 23, "maix": 23, "iii": 23, "axera": 23, "rv1126": 23, "static": 23, "binari": [23, 24, 25, 26, 34, 36, 37, 39, 42, 50, 56, 57, 60], "pnnx": [23, 30], "encod": [23, 27, 29, 30, 31, 34, 36, 37, 39, 42, 43, 44, 50, 54, 56, 57, 58], "conv": [24, 25], "emform": [24, 25, 28], "stateless2": [24, 25, 56], "pretrained_model": [24, 25, 26], "online_transduc": 24, "jit_xxx": [24, 25, 26], "anywher": [24, 25], "submodul": 24, "recurs": 24, "init": 24, "dcmake_build_typ": [24, 34, 39], "dncnn_python": 24, "dncnn_build_benchmark": 24, "dncnn_build_exampl": 24, "dncnn_build_tool": 24, "j4": 24, "pwd": 24, "src": [24, 26], "compon": [24, 54], "ncnn2int8": [24, 25], "am": 24, "sai": [24, 25, 26, 34, 36, 37, 39, 40, 42, 43, 44, 45, 47, 48, 50, 52, 56, 57, 58], "later": [24, 25, 26, 34, 37, 39, 42, 43, 44, 45, 47, 48, 56, 57, 58], "termin": 24, "tencent": [24, 25], "modif": [24, 36], "offici": 24, "synchron": 24, "renam": [24, 25, 26], "conv_emformer_transducer_stateless2": [24, 54], "length": [24, 26, 36, 52, 57, 58], "cnn": [24, 26, 31], "31": [24, 25, 26, 31, 39], "context": [24, 31, 36, 42, 54, 56, 57, 58], "configur": [24, 26, 36, 40, 43, 47, 48, 50, 60, 61], "accordingli": [24, 25, 26], "yourself": [24, 25, 26, 40, 57, 58], "220": [24, 36, 37, 39], "229": [24, 34], "best_v": 24, "alid_epoch": 24, "subsampl": [24, 57, 58], "ing_factor": 24, "a34171ed85605b0926eebbd0463d059431f4f74a": 24, "dec": 24, "ver": 24, "ion": 24, "530e8a1": 24, "op": 24, "1220120619": [24, 25, 26], "7695ff496b": [24, 25, 26], "s9n4w": [24, 25, 26], "icefa": 24, "ll": 24, "transdu": 24, "cer": 24, "use_averaged_model": [24, 25, 26], "cnn_module_kernel": [24, 26], "left_context_length": 24, "chunk_length": 24, "right_context_length": 24, "memory_s": 24, "231": [24, 25, 26], "053": 24, "022": 24, "708": [24, 34, 36, 39, 50], "75490012": 24, "320": [24, 36], "682": 24, "lh": [24, 25, 26], "rw": [24, 
25, 26], "289m": 24, "jan": [24, 25, 26], "289": 24, "roughli": [24, 25, 26], "equal": [24, 25, 26, 57, 58], "1024": [24, 25, 26, 31, 56], "287": [24, 50], "1010k": [24, 25], "decoder_jit_trac": [24, 25, 26, 29, 56, 58], "283m": 24, "encoder_jit_trac": [24, 25, 26, 29, 56, 58], "0m": [24, 25], "joiner_jit_trac": [24, 25, 26, 29, 56, 58], "sure": [24, 25, 26], "found": [24, 25, 26, 34, 36, 37, 39, 42, 44, 45, 50, 56, 57], "param": [24, 25, 26], "503k": [24, 25], "437": [24, 25, 26], "142m": 24, "79k": 24, "5m": [24, 25], "architectur": [24, 25, 26, 56], "editor": [24, 25, 26], "content": [24, 25, 26], "283": [24, 26], "1010": [24, 25], "503": [24, 25], "convers": [24, 25, 26], "half": [24, 25, 26, 42, 57, 58], "v": [24, 25, 26, 39, 47, 48], "float16": [24, 25, 26], "occupi": [24, 25, 26], "twice": [24, 25, 26], "smaller": [24, 25, 26, 34, 36, 37, 39, 42, 44, 45, 56, 57, 58], "fp16": [24, 25, 26, 31, 32, 42, 44, 45, 52, 56, 57, 58, 60, 61], "won": [24, 25, 26, 27, 34, 37, 39, 40, 42, 44, 45, 56, 57, 58], "accept": [24, 25, 26], "216": [24, 34, 39, 47, 48], "encoder_param_filenam": [24, 25, 26], "encoder_bin_filenam": [24, 25, 26], "decoder_param_filenam": [24, 25, 26], "decoder_bin_filenam": [24, 25, 26], "joiner_param_filenam": [24, 25, 26], "joiner_bin_filenam": [24, 25, 26], "sound_filenam": [24, 25, 26], "141": 24, "328": 24, "336": 24, "106000": [24, 25, 26, 39, 43], "581": [24, 43], "381": 24, "7767517": [24, 25, 26], "1060": 24, "1342": 24, "in0": [24, 25, 26], "explan": [24, 25, 26], "magic": [24, 25, 26], "intermedi": [24, 25, 26], "increment": [24, 25, 26], "1061": 24, "sherpametadata": [24, 25, 26], "sherpa_meta_data1": [24, 25, 26], "newli": [24, 25, 26], "must": [24, 25, 26, 57], "eas": [24, 25, 26], "pair": [24, 25, 26], "sad": [24, 25, 26], "rememb": [24, 25, 26], "anymor": [24, 25, 26], "flexibl": [24, 25, 26, 31], "edit": [24, 25, 26], "arm": [24, 25, 26], "aarch64": [24, 25, 26], "onc": [24, 25], "mayb": [24, 25], "year": [24, 25], "_jit_trac": [24, 25], "fp32": [24, 25], "doubl": [24, 25], "j": [24, 25, 34, 39], "py38": [24, 25, 26], "arg": [24, 25], "wave_filenam": [24, 25], "16k": [24, 25], "hz": [24, 25, 47, 48], "mono": [24, 25], "calibr": [24, 25], "cat": [24, 25], "eof": [24, 25], "calcul": [24, 25, 44, 57, 58], "has_gpu": [24, 25], "config": [24, 25], "use_vulkan_comput": [24, 25], "88": [24, 36], "conv_87": 24, "942385": [24, 25], "threshold": [24, 25, 44], "938493": 24, "968131": 24, "conv_88": 24, "442448": 24, "549335": 24, "167552": 24, "conv_89": 24, "228289": 24, "001738": 24, "871552": 24, "linear_90": 24, "976146": 24, "101789": 24, "267128": 24, "linear_91": 24, "962030": 24, "162033": 24, "602713": 24, "linear_92": 24, "323041": 24, "853959": 24, "953129": 24, "linear_94": 24, "905416": 24, "648006": 24, "323545": 24, "linear_93": 24, "474093": 24, "200188": 24, "linear_95": 24, "888012": 24, "403563": 24, "483986": 24, "linear_96": 24, "856741": 24, "398679": 24, "524273": 24, "linear_97": 24, "635942": 24, "613655": 24, "590950": 24, "linear_98": 24, "460340": 24, "670146": 24, "398010": 24, "linear_99": 24, "532276": 24, "585537": 24, "119396": 24, "linear_101": 24, "585871": 24, "719224": 24, "205809": 24, "linear_100": 24, "751382": 24, "081648": 24, "linear_102": 24, "593344": 24, "450581": 24, "551147": 24, "linear_103": 24, "592681": 24, "705824": 24, "257959": 24, "linear_104": 24, "752957": 24, "980955": 24, "110489": 24, "linear_105": 24, "696240": 24, "877193": 24, "608953": 24, "linear_106": 24, "059659": 24, "643138": 24, "048950": 24, 
"linear_108": 24, "975461": 24, "589567": 24, "671457": 24, "linear_107": 24, "190381": 24, "515701": 24, "linear_109": 24, "710759": 24, "305635": 24, "082436": 24, "linear_110": 24, "531228": 24, "731162": 24, "159557": 24, "linear_111": 24, "528083": 24, "259322": 24, "211544": 24, "linear_112": 24, "148807": 24, "500842": 24, "087374": 24, "linear_113": 24, "592566": 24, "948851": 24, "166611": 24, "linear_115": 24, "437109": 24, "608947": 24, "642395": 24, "linear_114": 24, "193942": 24, "503904": 24, "linear_116": 24, "966980": 24, "200896": 24, "676392": 24, "linear_117": 24, "451303": 24, "061664": 24, "951344": 24, "linear_118": 24, "077262": 24, "965800": 24, "023804": 24, "linear_119": 24, "671615": 24, "847613": 24, "198460": 24, "linear_120": 24, "625638": 24, "131427": 24, "556595": 24, "linear_122": 24, "274080": 24, "888716": 24, "978189": 24, "linear_121": 24, "420480": 24, "429659": 24, "linear_123": 24, "826197": 24, "599617": 24, "281532": 24, "linear_124": 24, "396383": 24, "325849": 24, "335875": 24, "linear_125": 24, "337198": 24, "941410": 24, "221970": 24, "linear_126": 24, "699965": 24, "842878": 24, "224073": 24, "linear_127": 24, "775370": 24, "884215": 24, "696438": 24, "linear_129": 24, "872276": 24, "837319": 24, "254213": 24, "linear_128": 24, "180057": 24, "687883": 24, "linear_130": 24, "150427": 24, "454298": 24, "765789": 24, "linear_131": 24, "112692": 24, "924847": 24, "025545": 24, "linear_132": 24, "852893": 24, "116593": 24, "749626": 24, "linear_133": 24, "517084": 24, "024665": 24, "275314": 24, "linear_134": 24, "683807": 24, "878618": 24, "743618": 24, "linear_136": 24, "421055": 24, "322729": 24, "086264": 24, "linear_135": 24, "309880": 24, "917679": 24, "linear_137": 24, "827781": 24, "744595": 24, "915554": 24, "linear_138": 24, "422395": 24, "742882": 24, "402161": 24, "linear_139": 24, "527538": 24, "866123": 24, "849449": 24, "linear_140": 24, "128619": 24, "657793": 24, "266134": 24, "linear_141": 24, "839593": 24, "845993": 24, "021378": 24, "linear_143": 24, "442304": 24, "099039": 24, "889746": 24, "linear_142": 24, "325038": 24, "849592": 24, "linear_144": 24, "929444": 24, "618206": 24, "605080": 24, "linear_145": 24, "382126": 24, "321095": 24, "625010": 24, "linear_146": 24, "894987": 24, "867645": 24, "836517": 24, "linear_147": 24, "915313": 24, "906028": 24, "886522": 24, "linear_148": 24, "614287": 24, "908151": 24, "496181": 24, "linear_150": 24, "724932": 24, "485588": 24, "312899": 24, "linear_149": 24, "161146": 24, "606939": 24, "linear_151": 24, "164453": 24, "847355": 24, "719223": 24, "linear_152": 24, "086471": 24, "984121": 24, "222834": 24, "linear_153": 24, "099524": 24, "991601": 24, "816805": 24, "linear_154": 24, "054585": 24, "489706": 24, "286930": 24, "linear_155": 24, "389185": 24, "100321": 24, "963501": 24, "linear_157": 24, "982999": 24, "154796": 24, "637253": 24, "linear_156": 24, "537706": 24, "875190": 24, "linear_158": 24, "420287": 24, "502287": 24, "531588": 24, "linear_159": 24, "014746": 24, "423280": 24, "477261": 24, "linear_160": 24, "633553": 24, "715335": 24, "220921": 24, "linear_161": 24, "371849": 24, "117830": 24, "815203": 24, "linear_162": 24, "492933": 24, "126283": 24, "623318": 24, "linear_164": 24, "697504": 24, "825712": 24, "317358": 24, "linear_163": 24, "078367": 24, "008038": 24, "linear_165": 24, "023975": 24, "836278": 24, "577358": 24, "linear_166": 24, "860619": 24, "259792": 24, "493614": 24, "linear_167": 24, "380934": 24, "496160": 24, "107042": 24, "linear_168": 24, 
"691216": 24, "733317": 24, "831076": 24, "linear_169": 24, "723948": 24, "952728": 24, "129707": 24, "linear_171": 24, "034811": 24, "366547": 24, "665123": 24, "linear_170": 24, "356277": 24, "710501": 24, "linear_172": 24, "556884": 24, "729481": 24, "166058": 24, "linear_173": 24, "033039": 24, "207264": 24, "442120": 24, "linear_174": 24, "597379": 24, "658676": 24, "768131": 24, "linear_2": [24, 25], "293503": 24, "305265": 24, "877850": 24, "linear_1": [24, 25], "812222": 24, "766452": 24, "487047": 24, "linear_3": [24, 25], "999999": 24, "999755": 24, "031174": 24, "wish": [24, 25], "955k": 24, "18k": 24, "inparam": [24, 25], "inbin": [24, 25], "outparam": [24, 25], "outbin": [24, 25], "99m": 24, "78k": 24, "774k": [24, 25], "496": [24, 25, 39, 43], "replac": [24, 25], "774": [24, 25], "linear": [24, 25, 36], "convolut": [24, 25, 44, 54, 57], "exact": [24, 25], "4x": [24, 25], "comparison": 24, "468000": [25, 29, 56], "lstm_transducer_stateless2": [25, 29, 56], "862": 25, "222": [25, 37, 39], "865": 25, "is_pnnx": 25, "62e404dd3f3a811d73e424199b3408e309c06e1a": [25, 26], "6d7a559": [25, 26], "feb": [25, 26, 36], "147": [25, 26], "rnn_hidden_s": 25, "aux_layer_period": 25, "235": 25, "239": [25, 36], "472": 25, "595": 25, "324": 25, "83137520": 25, "596": 25, "325": 25, "257024": 25, "781812": 25, "327": 25, "84176356": 25, "182": [25, 26, 34, 43], "158": 25, "183": [25, 47, 48], "335": 25, "101": 25, "tracerwarn": [25, 26], "boolean": [25, 26], "caus": [25, 26, 34, 36, 37, 39, 42, 44, 45, 56, 57, 58], "incorrect": [25, 26, 36], "flow": [25, 26], "constant": [25, 26], "futur": [25, 26, 36, 62], "need_pad": 25, "bool": 25, "259": [25, 34], "339": 25, "207": [25, 37, 39], "84": [25, 34], "324m": 25, "321": [25, 34], "107": [25, 43], "318m": 25, "159m": 25, "21k": 25, "159": [25, 39, 50], "37": [25, 34, 36, 39, 47], "861": 25, "266": [25, 26, 39, 43], "431": 25, "342": 25, "343": 25, "379": 25, "268": [25, 39, 43], "317m": 25, "317": 25, "conv_15": 25, "930708": 25, "972025": 25, "conv_16": 25, "978855": 25, "031788": 25, "456645": 25, "conv_17": 25, "868437": 25, "830528": 25, "218575": 25, "linear_18": 25, "107259": 25, "194808": 25, "293236": 25, "linear_19": 25, "193777": 25, "634748": 25, "401705": 25, "linear_20": 25, "259933": 25, "606617": 25, "722160": 25, "linear_21": 25, "186600": 25, "790260": 25, "512129": 25, "linear_22": 25, "759041": 25, "265832": 25, "050053": 25, "linear_23": 25, "931209": 25, "099090": 25, "979767": 25, "linear_24": 25, "324160": 25, "215561": 25, "321835": 25, "linear_25": 25, "800708": 25, "599352": 25, "284134": 25, "linear_26": 25, "492444": 25, "153369": 25, "274391": 25, "linear_27": 25, "660161": 25, "720994": 25, "674126": 25, "linear_28": 25, "415265": 25, "174434": 25, "007133": 25, "linear_29": 25, "038418": 25, "118534": 25, "724262": 25, "linear_30": 25, "072084": 25, "936867": 25, "259155": 25, "linear_31": 25, "342712": 25, "599489": 25, "282787": 25, "linear_32": 25, "340535": 25, "120308": 25, "701103": 25, "linear_33": 25, "846987": 25, "630030": 25, "985939": 25, "linear_34": 25, "686298": 25, "204571": 25, "607586": 25, "linear_35": 25, "904821": 25, "575518": 25, "756420": 25, "linear_36": 25, "806659": 25, "585589": 25, "118401": 25, "linear_37": 25, "402340": 25, "047157": 25, "162680": 25, "linear_38": 25, "174589": 25, "923361": 25, "030258": 25, "linear_39": 25, "178576": 25, "556058": 25, "807705": 25, "linear_40": 25, "901954": 25, "301267": 25, "956539": 25, "linear_41": 25, "839805": 25, "597429": 25, "716181": 25, 
"linear_42": 25, "178945": 25, "651595": 25, "895699": 25, "829245": 25, "627592": 25, "637907": 25, "746186": 25, "255032": 25, "167313": 25, "000000": 25, "999756": 25, "031013": 25, "345k": 25, "17k": 25, "218m": 25, "counterpart": 25, "bit": [25, 34, 36, 37, 39, 43, 50], "4532": 25, "feedforward": [26, 31, 36, 42, 57], "384": [26, 31, 39], "unmask": [26, 31], "downsampl": [26, 31, 35], "factor": [26, 31, 34, 36, 37, 39, 40, 42, 44, 45, 56, 57, 58], "473": [26, 39], "246": [26, 36, 39, 47, 48], "477": 26, "warm_step": 26, "2000": [26, 37], "feedforward_dim": 26, "attention_dim": [26, 34, 36, 39], "encoder_unmasked_dim": 26, "zipformer_downsampling_factor": 26, "decode_chunk_len": 26, "257": [26, 36, 47, 48], "023": 26, "zipformer2": 26, "419": 26, "At": [26, 34, 39], "stack": 26, "downsampling_factor": 26, "037": 26, "655": 26, "346": 26, "68944004": 26, "347": 26, "260096": 26, "348": [26, 47], "716276": 26, "656": [26, 39], "349": 26, "69920376": 26, "351": 26, "353": 26, "174": [26, 39], "175": 26, "1344": 26, "assert": 26, "cached_len": 26, "num_lay": 26, "1348": 26, "cached_avg": 26, "1352": 26, "cached_kei": 26, "1356": 26, "cached_v": 26, "1360": 26, "cached_val2": 26, "1364": 26, "cached_conv1": 26, "1368": 26, "cached_conv2": 26, "1373": 26, "left_context_len": 26, "1884": 26, "x_size": 26, "2442": 26, "2449": 26, "2469": 26, "2473": 26, "2483": 26, "kv_len": 26, "k": [26, 42, 47, 48, 56, 57, 58], "2570": 26, "attn_output": 26, "bsz": 26, "num_head": 26, "seq_len": 26, "head_dim": 26, "2926": 26, "lorder": 26, "2652": 26, "2653": 26, "embed_dim": 26, "2666": 26, "1543": 26, "in_x_siz": 26, "1637": 26, "1643": 26, "in_channel": 26, "1571": 26, "1763": 26, "src1": 26, "src2": 26, "1779": 26, "dim1": 26, "1780": 26, "dim2": 26, "_trace": 26, "958": 26, "tracer": 26, "instead": [26, 36, 57], "tupl": 26, "namedtupl": 26, "absolut": 26, "know": [26, 40], "side": 26, "allow": [26, 42, 57], "behavior": [26, 36], "_c": 26, "_create_method_from_trac": 26, "646": 26, "357": 26, "102": [26, 34], "embedding_out": 26, "686": 26, "361": [26, 39, 43], "735": 26, "69": 26, "269m": 26, "269": [26, 34, 47, 48], "725": [26, 43], "1022k": 26, "266m": 26, "8m": 26, "509k": 26, "133m": 26, "152k": 26, "4m": 26, "1022": 26, "133": 26, "509": 26, "360": 26, "365": 26, "280": [26, 39], "372": [26, 34], "state": [26, 34, 36, 37, 39, 42, 44, 45, 52, 56, 57, 58], "026": 26, "410": 26, "411": [26, 39], "2028": 26, "2547": 26, "2029": 26, "23316": 26, "23317": 26, "23318": 26, "23319": 26, "23320": 26, "amount": [26, 33, 35], "pad": [26, 34, 36, 37, 39, 42, 44, 45, 56, 57, 58], "conv2dsubsampl": 26, "arrai": 26, "23300": 26, "element": 26, "repo_url": 27, "basenam": 27, "why": 28, "streaming_asr": [28, 29, 56, 57, 58], "conv_emform": 28, "offline_asr": [28, 42], "baz": 29, "compact": 31, "inject": 31, "competit": 31, "full": [31, 32, 39, 40, 42, 44, 45, 56, 57, 58], "subset": [31, 32, 39, 42, 44, 45, 56, 57, 58], "instruct": [31, 32], "intial": [31, 32], "decode_gigaspeech": [31, 32], "1000": [31, 32, 39, 60, 61], "insert": 31, "residu": 31, "zipformer2encoderlay": 31, "remain": 31, "untouch": 31, "experi": [31, 32, 34, 36, 37, 39, 40, 42, 44, 45, 50, 56, 57, 58], "do_finetun": [31, 32], "use_adapt": 31, "adapter_dim": 31, "zipformer_adapt": 31, "world": [31, 32, 34, 36, 37, 39, 40, 42, 43, 44, 45, 52, 56, 57, 58, 60, 61], "exp_giga_finetune_adapt": 31, "_adapter_dim": 31, "045": 31, "13022": 31, "ckpt": [31, 32], "certain": [31, 32, 33], "bottleneck": 31, "notic": 31, "trainal": 31, "2024": [31, 60], "808": 
[31, 39, 47], "1277": 31, "761344": 31, "trainabl": 31, "whole": [31, 32, 39, 43, 47, 48, 57, 58], "entir": 31, "deactiv": 31, "keep": [31, 36, 42, 57, 58], "768": 31, "1536": 31, "queri": 31, "po": 31, "causal": [31, 57], "128": [31, 39], "previou": [32, 52], "stateless": [32, 35, 38, 42, 56, 57, 58], "due": [32, 34, 36, 37, 39, 42, 44, 45, 56, 57, 58], "vocabulari": [32, 36], "use_mux": 32, "exp_giga_finetun": 32, "_mux": 32, "0045": 32, "mux": 32, "13024": 32, "forget": 32, "quickli": 32, "mix": 32, "maintain": 32, "ones": 32, "lower": [32, 56], "public": 33, "capabl": 33, "high": [33, 35, 60], "label": 33, "1best": [34, 37, 39, 43, 44, 45, 47, 48], "automag": [34, 37, 39, 40, 42, 43, 44, 45, 47, 48, 50, 56, 57, 58], "stop": [34, 36, 37, 39, 42, 43, 44, 45, 47, 48, 50, 56, 57, 58], "By": [34, 37, 39, 40, 42, 43, 44, 45, 47, 48, 50, 56, 57, 58], "musan": [34, 37, 39, 40, 42, 44, 45, 56, 57, 58], "apt": [34, 37], "permiss": [34, 37], "commandlin": [34, 36, 37, 39, 42, 44, 45, 56, 57, 58], "multi": [34, 36, 37, 39, 40, 42, 44, 45, 54, 56, 57, 58], "machin": [34, 36, 37, 39, 42, 44, 45, 56, 57, 58], "ddp": [34, 36, 37, 39, 42, 44, 45, 56, 57, 58], "implement": [34, 36, 37, 39, 40, 42, 44, 45, 54, 56, 57, 58], "utter": [34, 36, 37, 39, 42, 44, 45, 56, 57, 58], "oom": [34, 36, 37, 39, 42, 44, 45, 56, 57, 58], "decai": [34, 37, 39, 44, 45, 56], "warmup": [34, 36, 37, 39, 42, 44, 45, 56, 57, 58], "function": [34, 36, 37, 39, 42, 43, 44, 45, 47, 48, 50, 56, 57, 58], "get_param": [34, 36, 37, 39, 42, 43, 44, 45, 47, 48, 50, 56, 57, 58], "directli": [34, 36, 37, 39, 40, 42, 44, 45, 56, 57, 58], "perturb": [34, 36, 37, 39, 42, 44, 45, 56, 57, 58], "3x150": [34, 36, 37], "450": [34, 36, 37], "visual": [34, 36, 37, 39, 42, 43, 44, 45, 47, 48, 50, 56, 57, 58], "logdir": [34, 36, 37, 39, 42, 43, 44, 45, 47, 48, 50, 56, 57, 58], "labelsmooth": 34, "tensorflow": [34, 36, 37, 39, 42, 44, 45, 50, 56, 57], "press": [34, 36, 37, 39, 42, 44, 45, 50, 56, 57, 58], "ctrl": [34, 36, 37, 39, 42, 44, 45, 50, 56, 57, 58], "engw8ksktzqs24zbv5dgcg": 34, "2021": [34, 37, 39, 43, 47, 48, 50], "22t11": 34, "scan": [34, 36, 37, 39, 42, 50, 56, 57], "116068": 34, "scalar": [34, 36, 37, 39, 42, 50, 56, 57], "listen": [34, 36, 37, 42, 50, 56, 57], "xxxx": [34, 36, 37, 39, 42, 43, 44, 45, 47, 48, 50, 56, 57, 58], "saw": [34, 36, 37, 39, 42, 43, 44, 45, 47, 48, 50, 56, 57, 58], "consol": [34, 36, 37, 39, 42, 43, 44, 45, 47, 48, 50, 56, 57, 58], "avoid": [34, 36, 39], "nbest": [34, 39, 45], "lattic": [34, 37, 39, 42, 43, 47, 48, 57, 58], "uniqu": [34, 39, 42, 57, 58], "pkufool": [34, 37, 43], "icefall_asr_aishell_conformer_ctc": 34, "transcrib": [34, 36, 37, 39], "lang_char": [34, 36], "bac009s0764w0121": [34, 36, 37], "bac009s0764w0122": [34, 36, 37], "bac009s0764w0123": [34, 36, 37], "tran": [34, 37, 39, 43, 47, 48], "graph": [34, 37, 39, 42, 43, 47, 48, 57, 58], "conveni": [34, 37, 39, 40], "eo": [34, 37, 39], "soxi": [34, 36, 37, 39, 43, 50], "sampl": [34, 36, 37, 39, 43, 44, 50, 57, 58], "precis": [34, 36, 37, 39, 42, 43, 50, 57, 58], "67263": [34, 36, 37], "cdda": [34, 36, 37, 39, 43, 50], "sector": [34, 36, 37, 39, 43, 50], "135k": [34, 36, 37], "256k": [34, 36, 37, 39], "sign": [34, 36, 37, 39, 50], "integ": [34, 36, 37, 39, 50], "pcm": [34, 36, 37, 39, 50], "65840": [34, 36, 37], "308": [34, 36, 37], "625": [34, 36, 37], "132k": [34, 36, 37], "64000": [34, 36, 37], "300": [34, 36, 37, 39, 40, 42, 52, 57], "128k": [34, 36, 37, 50], "displai": [34, 36, 37, 39], "topologi": [34, 39], "num_decoder_lay": [34, 39], 
"vgg_frontend": [34, 36, 39], "use_feat_batchnorm": [34, 39], "f2fd997f752ed11bbef4c306652c433e83f9cf12": 34, "sun": 34, "sep": 34, "33cfe45": 34, "d57a873": 34, "nov": [34, 39], "hw": 34, "kangwei": 34, "icefall_aishell3": 34, "k2_releas": 34, "tokens_fil": 34, "num_path": [34, 39, 42, 57, 58], "ngram_lm_scal": [34, 39], "attention_decoder_scal": [34, 39], "nbest_scal": [34, 39], "sos_id": [34, 39], "eos_id": [34, 39], "4336": [34, 36], "131": [34, 39], "293": [34, 39], "369": [34, 39], "\u751a": [34, 36], "\u81f3": [34, 36], "\u51fa": [34, 36], "\u73b0": [34, 36], "\u4ea4": [34, 36], "\u6613": [34, 36], "\u51e0": [34, 36], "\u4e4e": [34, 36], "\u505c": [34, 36], "\u6b62": 34, "\u7684": [34, 36, 37], "\u60c5": [34, 36], "\u51b5": [34, 36], "\u4e00": [34, 36], "\u4e8c": [34, 36], "\u7ebf": [34, 36, 37], "\u57ce": [34, 36], "\u5e02": [34, 36], "\u867d": [34, 36], "\u7136": [34, 36], "\u4e5f": [34, 36, 37], "\u5904": [34, 36], "\u4e8e": [34, 36], "\u8c03": [34, 36], "\u6574": [34, 36], "\u4e2d": [34, 36, 37], "\u4f46": [34, 36, 37], "\u56e0": [34, 36], "\u4e3a": [34, 36], "\u805a": [34, 36], "\u96c6": [34, 36], "\u4e86": [34, 36, 37], "\u8fc7": [34, 36], "\u591a": [34, 36], "\u516c": [34, 36], "\u5171": [34, 36], "\u8d44": [34, 36], "\u6e90": [34, 36], "371": 34, "683": 34, "684": [34, 50], "651": [34, 50], "654": 34, "659": 34, "752": 34, "887": 34, "340": 34, "370": 34, "\u751a\u81f3": [34, 37], "\u51fa\u73b0": [34, 37], "\u4ea4\u6613": [34, 37], "\u51e0\u4e4e": [34, 37], "\u505c\u6b62": 34, "\u60c5\u51b5": [34, 37], "\u4e00\u4e8c": [34, 37], "\u57ce\u5e02": [34, 37], "\u867d\u7136": [34, 37], "\u5904\u4e8e": [34, 37], "\u8c03\u6574": [34, 37], "\u56e0\u4e3a": [34, 37], "\u805a\u96c6": [34, 37], "\u8fc7\u591a": [34, 37], "\u516c\u5171": [34, 37], "\u8d44\u6e90": [34, 37], "recor": [34, 39], "highest": [34, 39], "966": 34, "821": 34, "822": 34, "826": 34, "916": 34, "345": 34, "889": 34, "limit": [34, 36, 39, 54, 57], "upgrad": [34, 39], "NOT": [34, 36, 39, 50], "checkout": [34, 39], "hlg_decod": [34, 39], "four": [34, 39], "messag": [34, 39, 42, 44, 45, 56, 57, 58], "use_gpu": [34, 39], "word_tabl": [34, 39], "forward": [34, 39, 44], "cu": [34, 39], "int": [34, 39], "char": [34, 39], "98": 34, "150": [34, 39], "693": [34, 47], "165": [34, 39], "nnet_output": [34, 39], "185": [34, 39, 50], "217": [34, 39], "mandarin": 35, "beij": 35, "shell": 35, "technologi": 35, "ltd": 35, "peopl": 35, "accent": 35, "area": 35, "invit": 35, "particip": 35, "conduct": 35, "indoor": 35, "fidel": 35, "microphon": 35, "16khz": 35, "manual": 35, "through": 35, "profession": 35, "annot": 35, "inspect": 35, "free": [35, 40, 52, 56], "academ": 35, "moder": 35, "research": 35, "openslr": [35, 52], "ctc": [35, 38, 41, 45, 46, 49], "conv1d": [36, 42, 56, 57, 58], "tanh": 36, "borrow": 36, "ieeexplor": 36, "ieee": 36, "stamp": 36, "jsp": 36, "arnumb": 36, "9054419": 36, "predict": [36, 40, 42, 56, 57, 58], "charact": 36, "unit": 36, "87939824": 36, "optimized_transduc": 36, "technqiu": 36, "end": [36, 42, 44, 45, 50, 56, 57, 58, 60, 61], "maximum": 36, "emit": 36, "simplifi": [36, 54], "significantli": 36, "degrad": 36, "exactli": 36, "unprun": 36, "advantag": 36, "minim": 36, "pruned_transducer_stateless": [36, 42, 54, 57], "altern": 36, "though": 36, "transducer_stateless_modifi": 36, "pr": 36, "ram": 36, "tri": 36, "prob": [36, 56], "219": [36, 39], "lagz6hrcqxoigbfd5e0y3q": 36, "03t14": 36, "8477": 36, "250": [36, 43], "sym": [36, 42, 57, 58], "beam_search": [36, 42, 57, 58], "decoding_method": 36, "beam_4": 36, 
"ensur": 36, "poor": 36, "531": [36, 37], "994": [36, 39], "027": 36, "encoder_out_dim": 36, "f4fefe4882bc0ae59af951da3f47335d5495ef71": 36, "50d2281": 36, "mar": 36, "0815224919": 36, "75d558775b": 36, "mmnv8": 36, "72": [36, 39], "878": [36, 48], "880": 36, "891": 36, "113": [36, 39], "userwarn": 36, "__floordiv__": 36, "round": 36, "toward": 36, "trunc": 36, "floor": 36, "div": 36, "b": [36, 39, 47, 48], "rounding_mod": 36, "divis": 36, "x_len": 36, "163": [36, 39], "\u6ede": 36, "322": 36, "759": 36, "760": 36, "919": 36, "922": 36, "929": 36, "046": 36, "319": [36, 39], "798": 36, "831": [36, 48], "215": [36, 39, 43], "402": 36, "topk_hyp_index": 36, "topk_index": 36, "logit": 36, "583": [36, 48], "lji9mwuorlow3jkdhxwk8a": 37, "13t11": 37, "4454": 37, "icefall_asr_aishell_tdnn_lstm_ctc": 37, "858": [37, 39], "389": [37, 39], "161": [37, 39], "536": 37, "539": 37, "917": 37, "\u505c\u6ede": 37, "mmi": [38, 41], "blank": [38, 41], "skip": [38, 40, 41, 42, 56, 57, 58], "distil": [38, 41], "hubert": [38, 41], "ligru": [38, 46], "libri": [39, 40, 42, 44, 45, 56, 57, 58], "3x960": [39, 42, 44, 45, 56, 57, 58], "2880": [39, 42, 44, 45, 56, 57, 58], "lzgnetjwrxc3yghnmd4kpw": 39, "24t16": 39, "4540": 39, "sentenc": [39, 52], "piec": 39, "And": [39, 42, 44, 45, 56, 57, 58], "neither": 39, "nor": 39, "5000": 39, "033": 39, "537": 39, "full_libri": [39, 40], "464": 39, "548": 39, "776": 39, "652": [39, 50], "109226120": 39, "714": [39, 47], "206": 39, "944": 39, "1328": 39, "443": [39, 43], "2563": 39, "494": 39, "592": 39, "1715": 39, "52576": 39, "1424": 39, "807": 39, "506": 39, "362": 39, "1477": 39, "2922": 39, "4295": 39, "52343": 39, "396": 39, "3584": 39, "432": 39, "680": [39, 47], "_pickl": 39, "unpicklingerror": 39, "invalid": 39, "hlg_modifi": 39, "g_4_gram": [39, 43, 47, 48], "sentencepiec": 39, "875": [39, 43], "212k": 39, "267440": [39, 43], "1253": [39, 43], "535k": 39, "77200": [39, 43], "154k": 39, "554": 39, "7178d67e594bc7fa89c2b331ad7bd1c62a6a9eb4": 39, "8d93169": 39, "601": 39, "025": 39, "broffel": 39, "osom": 39, "723": 39, "775": 39, "881": 39, "571": 39, "857": 39, "979": 39, "055": 39, "117": 39, "051": 39, "363": 39, "959": [39, 48], "546": 39, "598": 39, "599": [39, 43], "833": 39, "834": 39, "915": 39, "076": 39, "110": 39, "397": 39, "999": [39, 42, 57, 58], "concaten": 39, "bucket": 39, "sampler": 39, "ctc_decod": 39, "ngram_lm_rescor": 39, "attention_rescor": 39, "105": 39, "125": [39, 50], "228": 39, "543": 39, "topo": 39, "547": 39, "729": 39, "703": 39, "545": 39, "122": 39, "126": 39, "135": [39, 50], "153": [39, 50], "945": 39, "475": 39, "191": [39, 47, 48], "398": 39, "515": 39, "deseri": 39, "441": 39, "fsaclass": 39, "loadfsa": 39, "const": 39, "string": 39, "c10": 39, "ignor": 39, "589": 39, "attention_scal": 39, "162": 39, "169": [39, 47, 48], "188": 39, "984": 39, "624": 39, "519": [39, 48], "632": 39, "645": [39, 50], "243": 39, "303": 39, "179": 39, "knowledg": 40, "vector": 40, "mvq": 40, "kd": 40, "pruned_transducer_stateless4": [40, 42, 54, 57], "theoret": 40, "applic": 40, "minor": 40, "stop_stag": [40, 60, 61], "thing": 40, "distillation_with_hubert": 40, "Of": 40, "cours": 40, "xl": 40, "proce": 40, "960h": [40, 44], "use_extracted_codebook": 40, "augment": 40, "th": [40, 47, 48], "embedding_lay": 40, "num_codebook": 40, "under": [40, 52], "vq_fbank_layer36_cb8": 40, "whola": 40, "snippet": 40, "echo": 40, "awk": 40, "split": 40, "pruned_transducer_stateless6": 40, "12359": 40, "spec": 40, "warp": 40, "paid": 40, "suitabl": [42, 56, 57, 58], 
"pruned_transducer_stateless2": [42, 54, 57], "pruned_transducer_stateless5": [42, 54, 57], "scroll": [42, 44, 45, 56, 57, 58], "arxiv": [42, 56, 57, 58], "2206": [42, 56, 57, 58], "13236": [42, 56, 57, 58], "rework": [42, 54, 57], "daniel": [42, 57, 58], "joint": [42, 56, 57, 58], "contrari": [42, 56, 57, 58], "convent": [42, 56, 57, 58], "recurr": [42, 56, 57, 58], "2x": [42, 57, 58], "littl": [42, 57], "436000": [42, 44, 45, 56, 57, 58], "438000": [42, 44, 45, 56, 57, 58], "qogspbgsr8kzcrmmie9jgw": 42, "20t15": [42, 56, 57], "4468": [42, 56, 57], "210171": [42, 56, 57], "access": [42, 44, 45, 56, 57, 58], "googl": [42, 44, 45, 56, 57, 58], "6008": [42, 44, 45, 56, 57, 58], "localhost": [42, 44, 45, 56, 57, 58], "expos": [42, 44, 45, 56, 57, 58], "proxi": [42, 44, 45, 56, 57, 58], "bind_al": [42, 44, 45, 56, 57, 58], "fast_beam_search": [42, 44, 56, 57, 58], "474000": [42, 56, 57, 58], "largest": [42, 57, 58], "posterior": [42, 44, 57, 58], "algorithm": [42, 57, 58], "pdf": [42, 45, 57, 58], "1211": [42, 57, 58], "3711": [42, 57, 58], "espnet": [42, 57, 58], "net": [42, 57, 58], "beam_search_transduc": [42, 57, 58], "basic": [42, 57], "topk": [42, 57, 58], "expand": [42, 57, 58], "mode": [42, 57, 58], "being": [42, 57, 58], "hardcod": [42, 57, 58], "composit": [42, 57, 58], "log_prob": [42, 57, 58], "hard": [42, 54, 57, 58], "2211": [42, 57, 58], "00484": [42, 57, 58], "fast_beam_search_lg": [42, 57, 58], "trivial": [42, 57, 58], "fast_beam_search_nbest": [42, 57, 58], "random_path": [42, 57, 58], "shortest": [42, 57, 58], "fast_beam_search_nbest_lg": [42, 57, 58], "logic": [42, 57, 58], "smallest": [42, 56, 57, 58], "normal": [43, 47, 48, 50, 57], "icefall_asr_librispeech_tdnn": 43, "lstm_ctc": 43, "flac": 43, "116k": 43, "140k": 43, "343k": 43, "164k": 43, "105k": 43, "174k": 43, "pretraind": 43, "584": [43, 48], "791": 43, "245": 43, "098": 43, "099": 43, "methond": [43, 47, 48], "631": 43, "010": 43, "guidanc": 44, "bigger": 44, "simpli": 44, "discard": 44, "prevent": 44, "lconv": 44, "encourag": [44, 45, 56], "stabil": [44, 45], "doesn": 44, "warm": [44, 45], "xyozukpeqm62hbilud4upa": [44, 45], "ctc_guide_decode_b": 44, "pretrained_ctc": 44, "jit_pretrained_ctc": 44, "100h": 44, "yfyeung": 44, "wechat": 45, "zipformer_mmi": 45, "worker": [45, 56], "hp": 45, "tdnn_ligru_ctc": 47, "enough": [47, 48, 50, 52], "luomingshuang": [47, 48], "icefall_asr_timit_tdnn_ligru_ctc": 47, "pretrained_average_9_25": 47, "fdhc0_si1559": [47, 48], "felc0_si756": [47, 48], "fmgd0_si1564": [47, 48], "ffprobe": [47, 48], "show_format": [47, 48], "nistspher": [47, 48], "database_id": [47, 48], "database_vers": [47, 48], "utterance_id": [47, 48], "dhc0_si1559": [47, 48], "sample_min": [47, 48], "4176": [47, 48], "sample_max": [47, 48], "5984": [47, 48], "bitrat": [47, 48], "pcm_s16le": [47, 48], "s16": [47, 48], "elc0_si756": [47, 48], "1546": [47, 48], "1989": [47, 48], "mgd0_si1564": [47, 48], "7626": [47, 48], "10573": [47, 48], "660": 47, "695": 47, "697": 47, "819": 47, "829": 47, "sil": [47, 48], "dh": [47, 48], "ih": [47, 48], "uw": [47, 48], "ah": [47, 48], "ii": [47, 48], "z": [47, 48], "aa": [47, 48], "ei": [47, 48], "dx": [47, 48], "d": [47, 48, 52], "uh": [47, 48], "ng": [47, 48, 60], "eh": [47, 48], "jh": [47, 48], "er": [47, 48], "ai": [47, 48], "hh": [47, 48], "aw": 47, "ae": [47, 48], "705": 47, "715": 47, "720": 47, "251": [47, 48], "ch": 47, "icefall_asr_timit_tdnn_lstm_ctc": 48, "pretrained_average_16_25": 48, "816": 48, "827": 48, "unk": 48, "739": 48, "977": 48, "978": 48, "981": 48, 
"ow": 48, "ykubhb5wrmosxykid1z9eg": 50, "23t23": 50, "icefall_asr_yesno_tdnn": 50, "0_0_1_0_0_1_1_1": 50, "0_0_1_0_1_0_0_1": 50, "0_0_1_1_0_0_0_1": 50, "0_0_1_1_0_1_1_0": 50, "0_0_1_1_1_0_0_0": 50, "0_0_1_1_1_1_0_0": 50, "0_1_0_0_0_1_0_0": 50, "0_1_0_0_1_0_1_0": 50, "0_1_0_1_0_0_0_0": 50, "0_1_0_1_1_1_0_0": 50, "0_1_1_0_0_1_1_1": 50, "0_1_1_1_0_0_1_0": 50, "0_1_1_1_1_0_1_0": 50, "1_0_0_0_0_0_0_0": 50, "1_0_0_0_0_0_1_1": 50, "1_0_0_1_0_1_1_1": 50, "1_0_1_1_0_1_1_1": 50, "1_0_1_1_1_1_0_1": 50, "1_1_0_0_0_1_1_1": 50, "1_1_0_0_1_0_1_1": 50, "1_1_0_1_0_1_0_0": 50, "1_1_0_1_1_0_0_1": 50, "1_1_0_1_1_1_1_0": 50, "1_1_1_0_0_1_0_1": 50, "1_1_1_0_1_0_1_0": 50, "1_1_1_1_0_0_1_0": 50, "1_1_1_1_1_0_0_0": 50, "1_1_1_1_1_1_1_1": 50, "54080": 50, "507": 50, "108k": 50, "119": 50, "650": 50, "139": 50, "143": 50, "198": 50, "181": 50, "186": 50, "187": 50, "correctli": 50, "simplest": 50, "nnlm": 52, "complet": 52, "wget": [52, 60], "resourc": 52, "norm": 52, "gzip": 52, "prepare_lm_training_data": 52, "lm_data": 52, "grab": 52, "cup": 52, "coffe": 52, "sort_lm_training_data": 52, "sorted_lm_data": 52, "statist": 52, "lm_data_stat": 52, "aforement": 52, "repeat": 52, "rnn_lm": 52, "tie": 52, "hyper": [52, 60, 61], "coupl": [52, 60, 61], "dai": [52, 60, 61], "former": 54, "mask": [54, 57, 58], "wenet": 54, "did": 54, "request": 54, "complic": 54, "techniqu": 54, "bank": 54, "memor": 54, "histori": 54, "introduc": 54, "variant": 54, "pruned_stateless_emformer_rnnt2": 54, "conv_emformer_transducer_stateless": 54, "ourself": 54, "mechan": 54, "onlin": 56, "lstm_transducer_stateless": 56, "prepare_giga_speech": 56, "cj2vtpiwqhkn9q1tx6ptpg": 56, "dynam": [57, 58], "short": [57, 58], "2012": 57, "05481": 57, "flag": 57, "indic": [57, 58], "whether": 57, "sequenc": [57, 58], "uniformli": [57, 58], "seen": [57, 58], "97vkxf80ru61cnp2alwzzg": 57, "streaming_decod": [57, 58], "wise": [57, 58], "parallel": [57, 58], "bath": [57, 58], "parallelli": [57, 58], "seem": 57, "benefit": 57, "320m": 58, "550": 58, "basicli": 58, "scriptmodul": 58, "jit_trace_export": 58, "jit_trace_pretrain": 58, "monoton": 59, "align": 59, "condit": [60, 61], "variat": [60, 61], "autoencod": [60, 61], "adversari": [60, 61], "piper_phonem": 60, "numba": 60, "espnet_tts_frontend": 60, "monotonic_align": [60, 61], "build_ext": [60, 61], "inplac": [60, 61], "medium": 60, "ground": [60, 61], "truth": [60, 61], "test_onnx": [60, 61], "program": 60, "kotlin": 60, "java": 60, "swift": 60, "offlin": 60, "espeak": 60, "bz2": 60, "xf": 60, "thread": 60, "countri": 60, "plai": 60, "350": 61, "zrjin": 61, "synthesi": 62, "task": 62}, "objects": {}, "objtypes": {}, "objnames": {}, "titleterms": {"follow": 0, "code": [0, 9], "style": 0, "contribut": [1, 3], "document": 1, "how": [2, 22, 28, 29], "creat": [2, 13, 21], "recip": [2, 62], "data": [2, 9, 11, 21, 31, 32, 34, 36, 37, 39, 40, 42, 43, 44, 45, 47, 48, 50, 56, 57, 58, 60, 61], "prepar": [2, 9, 11, 21, 31, 32, 34, 36, 37, 39, 40, 42, 43, 44, 45, 47, 48, 50, 56, 57, 58, 60, 61], "train": [2, 9, 16, 18, 21, 24, 25, 26, 27, 31, 32, 33, 34, 36, 37, 39, 40, 42, 43, 44, 45, 47, 48, 50, 52, 56, 57, 58, 60, 61], "decod": [2, 5, 6, 7, 9, 12, 21, 22, 27, 31, 34, 36, 37, 39, 40, 42, 43, 44, 45, 47, 48, 50, 56, 57, 58], "pre": [2, 18, 24, 25, 26, 27, 31, 32, 33, 34, 36, 37, 39, 42, 43, 44, 45, 47, 48, 50, 56, 57, 58], "model": [2, 5, 15, 18, 22, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 36, 37, 39, 42, 43, 44, 45, 47, 48, 50, 52, 56, 57, 58, 60, 61], "lodr": [4, 6], "rnn": [4, 51, 52], "transduc": [4, 6, 7, 24, 
25, 26, 36, 42, 56, 57, 58], "wer": [4, 6, 7, 39], "differ": [4, 6, 7], "beam": [4, 6, 7, 36], "size": [4, 6, 7], "languag": [5, 52], "lm": [6, 39, 51], "rescor": [6, 34, 39], "base": 6, "method": 6, "v": 6, "shallow": [6, 7], "fusion": [6, 7], "The": [6, 36], "number": 6, "each": 6, "field": 6, "i": 6, "test": [6, 7, 21, 24, 25, 26], "clean": [6, 7], "other": 6, "time": [6, 7], "docker": [8, 9], "introduct": [9, 54], "view": 9, "avail": 9, "tag": 9, "cuda": [9, 21], "enabl": 9, "imag": 9, "cpu": 9, "onli": 9, "download": [9, 11, 21, 24, 25, 26, 27, 34, 36, 37, 39, 42, 43, 44, 45, 47, 48, 50, 56, 57, 58, 60, 61], "run": [9, 22, 60], "gpu": 9, "yesno": [9, 49], "within": 9, "contain": 9, "updat": 9, "frequent": 10, "ask": 10, "question": 10, "faq": 10, "oserror": 10, "libtorch_hip": 10, "so": 10, "cannot": 10, "open": 10, "share": 10, "object": 10, "file": [10, 11, 27, 60], "directori": 10, "attributeerror": 10, "modul": 10, "distutil": 10, "ha": 10, "attribut": 10, "version": 10, "importerror": 10, "libpython3": 10, "10": 10, "1": [10, 21, 24, 25, 26, 34, 36, 37, 39], "0": [10, 21], "No": 10, "For": [11, 12, 13, 15, 16], "more": [11, 12, 13, 15, 16], "curiou": [11, 12, 13, 15, 16], "A": 11, "quick": 11, "look": 11, "gener": 11, "environ": [13, 21], "setup": 13, "virtual": [13, 21], "instal": [13, 21, 24, 25, 26, 34, 36, 37, 39, 43, 47, 48, 60], "depend": [13, 60], "icefal": [13, 14, 20, 21, 24, 25, 26], "dummi": 14, "tutori": 14, "export": [15, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 42, 44, 45, 56, 57, 58, 60, 61], "paramet": 15, "via": [15, 24, 25, 26], "state_dict": [15, 22, 42, 44, 45, 56, 57, 58], "torch": [15, 21, 24, 25, 26, 28, 29, 42, 44, 45, 56, 57, 58], "jit": [15, 24, 25, 26, 28, 29, 42, 44, 45, 56, 57, 58], "script": [15, 28, 42, 44, 45, 57, 58], "onnx": [15, 27, 60], "huggingfac": [17, 19], "space": 19, "youtub": [19, 21], "video": [19, 21], "content": [20, 33, 62], "toolkit": 21, "cudnn": 21, "torchaudio": 21, "2": [21, 24, 25, 26, 34, 36, 37, 39], "k2": 21, "3": [21, 24, 25, 26, 34, 36, 39], "lhots": 21, "4": [21, 24, 25, 26], "exampl": [21, 27, 34, 36, 37, 39, 42, 44, 45, 56, 57, 58], "5": [21, 24, 25, 26], "6": [21, 24, 25, 26], "your": 21, "when": [22, 28, 29], "us": [22, 28, 29, 42, 44, 45, 56, 57, 58], "py": 22, "ncnn": [23, 24, 25, 26], "convemform": 24, "pnnx": [24, 25, 26], "trace": [24, 25, 26, 29, 56, 58], "torchscript": [24, 25, 26], "modifi": [24, 25, 26, 36], "encod": [24, 25, 26], "sherpa": [24, 25, 26, 27, 42, 57, 58, 60], "7": [24, 25], "option": [24, 25, 34, 37, 39, 42, 44, 45, 56, 57, 58], "int8": [24, 25], "quantiz": [24, 25], "lstm": [25, 37, 43, 48, 56], "stream": [26, 38, 53, 54, 57, 58], "zipform": [26, 31, 32, 44, 45, 58], "sound": 27, "finetun": [31, 32], "from": [31, 32], "adapt": 31, "fine": [31, 32, 33], "tune": [31, 32, 33], "supervis": 32, "tabl": [33, 62], "conform": [34, 39, 54], "ctc": [34, 37, 39, 43, 44, 47, 48, 50], "configur": [34, 37, 39, 42, 44, 45, 56, 57, 58], "log": [34, 36, 37, 39, 42, 44, 45, 56, 57, 58], "usag": [34, 36, 37, 39, 42, 44, 45, 56, 57, 58, 60], "case": [34, 36, 37, 39], "kaldifeat": [34, 36, 37, 39, 43, 47, 48, 50], "hlg": [34, 37, 39], "attent": [34, 39], "colab": [34, 36, 37, 39, 43, 47, 48, 50], "notebook": [34, 36, 37, 39, 43, 47, 48, 50], "deploy": [34, 39], "c": [34, 39], "aishel": 35, "stateless": 36, "loss": 36, "todo": 36, "greedi": 36, "search": [36, 60, 61], "tdnn": [37, 43, 47, 48, 50], "non": 38, "asr": [38, 53], "comput": 39, "n": 39, "gram": 39, "distil": 40, "hubert": 40, "codebook": 40, "index": 40, 
"librispeech": [41, 55], "prune": [42, 57], "statelessx": [42, 57], "pretrain": [42, 44, 45, 56, 57, 58, 60, 61], "deploi": [42, 57, 58], "infer": [43, 47, 48, 50, 60, 61], "blank": 44, "skip": 44, "mmi": 45, "timit": 46, "ligru": 47, "an": 52, "emform": 54, "which": 56, "simul": [57, 58], "real": [57, 58], "tt": 59, "vit": [60, 61], "ljspeech": 60, "extra": 60, "build": [60, 61], "monoton": [60, 61], "align": [60, 61], "lexicon": 60, "vctk": 61}, "envversion": {"sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.todo": 2, "sphinx": 58}, "alltitles": {"Follow the code style": [[0, "follow-the-code-style"]], "Contributing to Documentation": [[1, "contributing-to-documentation"]], "How to create a recipe": [[2, "how-to-create-a-recipe"]], "Data Preparation": [[2, "data-preparation"], [11, "data-preparation"], [36, "data-preparation"]], "Training": [[2, "training"], [9, "training"], [16, "training"], [21, "training"], [34, "training"], [36, "training"], [37, "training"], [39, "training"], [40, "training"], [42, "training"], [43, "training"], [44, "training"], [45, "training"], [47, "training"], [48, "training"], [50, "training"], [56, "training"], [57, "training"], [58, "training"], [60, "training"], [61, "training"]], "Decoding": [[2, "decoding"], [9, "decoding"], [12, "decoding"], [21, "decoding"], [31, "decoding"], [34, "decoding"], [36, "decoding"], [37, "decoding"], [39, "decoding"], [40, "decoding"], [42, "decoding"], [43, "decoding"], [44, "decoding"], [45, "decoding"], [47, "decoding"], [48, "decoding"], [50, "decoding"], [56, "decoding"], [57, "decoding"], [58, "decoding"]], "Pre-trained model": [[2, "pre-trained-model"]], "Contributing": [[3, "contributing"]], "LODR for RNN Transducer": [[4, "lodr-for-rnn-transducer"]], "WER of LODR with different beam sizes": [[4, "id1"]], "Decoding with language models": [[5, "decoding-with-language-models"]], "LM rescoring for Transducer": [[6, "lm-rescoring-for-transducer"]], "WERs of LM rescoring with different beam sizes": [[6, "id1"]], "WERs of LM rescoring + LODR with different beam sizes": [[6, "id2"]], "LM-rescoring-based methods vs shallow-fusion-based methods (The numbers in each field is WER on test-clean, WER on test-other and decoding time on test-clean)": [[6, "id3"]], "Shallow fusion for Transducer": [[7, "shallow-fusion-for-transducer"]], "WERs and decoding time (on test-clean) of shallow fusion with different beam sizes": [[7, "id2"]], "Docker": [[8, "docker"]], "Introduction": [[9, "introduction"], [54, "introduction"]], "View available tags": [[9, "view-available-tags"]], "CUDA-enabled docker images": [[9, "cuda-enabled-docker-images"]], "CPU-only docker images": [[9, "cpu-only-docker-images"]], "Download a docker image (CUDA)": [[9, "download-a-docker-image-cuda"]], "Download a docker image (CPU)": [[9, "download-a-docker-image-cpu"]], "Run a docker image with GPU": [[9, "run-a-docker-image-with-gpu"]], "Run a docker image with CPU": [[9, "run-a-docker-image-with-cpu"]], "Run yesno within a docker container": [[9, "run-yesno-within-a-docker-container"]], "Update the code": [[9, "update-the-code"]], "Data preparation": [[9, "data-preparation"], [21, "data-preparation"], [31, "data-preparation"], [32, "data-preparation"], [34, "data-preparation"], [37, "data-preparation"], [39, "data-preparation"], 
[40, "data-preparation"], [42, "data-preparation"], [43, "data-preparation"], [44, "data-preparation"], [45, "data-preparation"], [47, "data-preparation"], [48, "data-preparation"], [50, "data-preparation"], [56, "data-preparation"], [57, "data-preparation"], [58, "data-preparation"], [60, "data-preparation"], [61, "data-preparation"]], "Frequently Asked Questions (FAQs)": [[10, "frequently-asked-questions-faqs"]], "OSError: libtorch_hip.so: cannot open shared object file: no such file or directory": [[10, "oserror-libtorch-hip-so-cannot-open-shared-object-file-no-such-file-or-directory"]], "AttributeError: module \u2018distutils\u2019 has no attribute \u2018version\u2019": [[10, "attributeerror-module-distutils-has-no-attribute-version"]], "ImportError: libpython3.10.so.1.0: cannot open shared object file: No such file or directory": [[10, "importerror-libpython3-10-so-1-0-cannot-open-shared-object-file-no-such-file-or-directory"]], "For the more curious": [[11, "for-the-more-curious"], [12, "for-the-more-curious"], [13, "for-the-more-curious"], [15, "for-the-more-curious"], [16, "for-the-more-curious"]], "A quick look to the generated files": [[11, "a-quick-look-to-the-generated-files"]], "download": [[11, "download"]], "data": [[11, "data"]], "Environment setup": [[13, "environment-setup"]], "Create a virtual environment": [[13, "create-a-virtual-environment"]], "Install dependencies": [[13, "install-dependencies"]], "Install icefall": [[13, "install-icefall"]], "Icefall for dummies tutorial": [[14, "icefall-for-dummies-tutorial"]], "Model Export": [[15, "model-export"]], "Export the model parameters via model.state_dict()": [[15, "export-the-model-parameters-via-model-state-dict"]], "Export via torch.jit.script()": [[15, "export-via-torch-jit-script"]], "Export via torch.onnx.export()": [[15, "export-via-torch-onnx-export"]], "Huggingface": [[17, "huggingface"]], "Pre-trained models": [[18, "pre-trained-models"]], "Huggingface spaces": [[19, "huggingface-spaces"]], "YouTube Video": [[19, "youtube-video"], [21, "youtube-video"]], "Icefall": [[20, "icefall"]], "Contents:": [[20, null]], "Installation": [[21, "installation"]], "(0) Install CUDA toolkit and cuDNN": [[21, "install-cuda-toolkit-and-cudnn"]], "(1) Install torch and torchaudio": [[21, "install-torch-and-torchaudio"]], "(2) Install k2": [[21, "install-k2"]], "(3) Install lhotse": [[21, "install-lhotse"]], "(4) Download icefall": [[21, "download-icefall"]], "Installation example": [[21, "installation-example"]], "(1) Create a virtual environment": [[21, "create-a-virtual-environment"]], "(2) Install CUDA toolkit and cuDNN": [[21, "id1"]], "(3) Install torch and torchaudio": [[21, "id2"]], "(4) Install k2": [[21, "id3"]], "(5) Install lhotse": [[21, "id5"]], "(6) Download icefall": [[21, "id6"]], "Test Your Installation": [[21, "test-your-installation"]], "Export model.state_dict()": [[22, "export-model-state-dict"], [42, "export-model-state-dict"], [44, "export-model-state-dict"], [45, "export-model-state-dict"], [56, "export-model-state-dict"], [57, "export-model-state-dict"], [58, "export-model-state-dict"]], "When to use it": [[22, "when-to-use-it"], [28, "when-to-use-it"], [29, "when-to-use-it"]], "How to export": [[22, "how-to-export"], [28, "how-to-export"], [29, "how-to-export"]], "How to use the exported model": [[22, "how-to-use-the-exported-model"], [28, "how-to-use-the-exported-model"]], "Use the exported model to run decode.py": [[22, "use-the-exported-model-to-run-decode-py"]], "Export to ncnn": [[23, 
"export-to-ncnn"]], "Export ConvEmformer transducer models to ncnn": [[24, "export-convemformer-transducer-models-to-ncnn"]], "1. Download the pre-trained model": [[24, "download-the-pre-trained-model"], [25, "download-the-pre-trained-model"], [26, "download-the-pre-trained-model"]], "2. Install ncnn and pnnx": [[24, "install-ncnn-and-pnnx"], [25, "install-ncnn-and-pnnx"], [26, "install-ncnn-and-pnnx"]], "3. Export the model via torch.jit.trace()": [[24, "export-the-model-via-torch-jit-trace"], [25, "export-the-model-via-torch-jit-trace"], [26, "export-the-model-via-torch-jit-trace"]], "4. Export torchscript model via pnnx": [[24, "export-torchscript-model-via-pnnx"], [25, "export-torchscript-model-via-pnnx"], [26, "export-torchscript-model-via-pnnx"]], "5. Test the exported models in icefall": [[24, "test-the-exported-models-in-icefall"], [25, "test-the-exported-models-in-icefall"], [26, "test-the-exported-models-in-icefall"]], "6. Modify the exported encoder for sherpa-ncnn": [[24, "modify-the-exported-encoder-for-sherpa-ncnn"], [25, "modify-the-exported-encoder-for-sherpa-ncnn"], [26, "modify-the-exported-encoder-for-sherpa-ncnn"]], "7. (Optional) int8 quantization with sherpa-ncnn": [[24, "optional-int8-quantization-with-sherpa-ncnn"], [25, "optional-int8-quantization-with-sherpa-ncnn"]], "Export LSTM transducer models to ncnn": [[25, "export-lstm-transducer-models-to-ncnn"]], "Export streaming Zipformer transducer models to ncnn": [[26, "export-streaming-zipformer-transducer-models-to-ncnn"]], "Export to ONNX": [[27, "export-to-onnx"]], "sherpa-onnx": [[27, "sherpa-onnx"]], "Example": [[27, "example"]], "Download the pre-trained model": [[27, "download-the-pre-trained-model"], [34, "download-the-pre-trained-model"], [36, "download-the-pre-trained-model"], [37, "download-the-pre-trained-model"], [39, "download-the-pre-trained-model"], [43, "download-the-pre-trained-model"], [47, "download-the-pre-trained-model"], [48, "download-the-pre-trained-model"], [50, "download-the-pre-trained-model"]], "Export the model to ONNX": [[27, "export-the-model-to-onnx"]], "Decode sound files with exported ONNX models": [[27, "decode-sound-files-with-exported-onnx-models"]], "Export model with torch.jit.script()": [[28, "export-model-with-torch-jit-script"]], "Export model with torch.jit.trace()": [[29, "export-model-with-torch-jit-trace"]], "How to use the exported models": [[29, "how-to-use-the-exported-models"]], "Model export": [[30, "model-export"]], "Finetune from a pre-trained Zipformer model with adapters": [[31, "finetune-from-a-pre-trained-zipformer-model-with-adapters"]], "Model preparation": [[31, "model-preparation"], [32, "model-preparation"]], "Fine-tune with adapter": [[31, "fine-tune-with-adapter"]], "Export the model": [[31, "export-the-model"]], "Finetune from a supervised pre-trained Zipformer model": [[32, "finetune-from-a-supervised-pre-trained-zipformer-model"]], "Fine-tune": [[32, "fine-tune"]], "Fine-tune a pre-trained model": [[33, "fine-tune-a-pre-trained-model"]], "Table of Contents": [[33, null], [62, null]], "Conformer CTC": [[34, "conformer-ctc"], [39, "conformer-ctc"]], "Configurable options": [[34, "configurable-options"], [37, "configurable-options"], [39, "configurable-options"], [42, "configurable-options"], [44, "configurable-options"], [45, "configurable-options"], [56, "configurable-options"], [57, "configurable-options"], [58, "configurable-options"]], "Pre-configured options": [[34, "pre-configured-options"], [37, "pre-configured-options"], [39, 
"pre-configured-options"], [42, "pre-configured-options"], [44, "pre-configured-options"], [45, "pre-configured-options"], [56, "pre-configured-options"], [57, "pre-configured-options"], [58, "pre-configured-options"]], "Training logs": [[34, "training-logs"], [36, "training-logs"], [37, "training-logs"], [39, "training-logs"], [42, "training-logs"], [44, "training-logs"], [45, "training-logs"], [56, "training-logs"], [57, "training-logs"], [58, "training-logs"]], "Usage examples": [[34, "usage-examples"], [36, "usage-examples"], [37, "usage-examples"], [39, "usage-examples"]], "Case 1": [[34, "case-1"], [36, "case-1"], [37, "case-1"], [39, "case-1"]], "Case 2": [[34, "case-2"], [36, "case-2"], [37, "case-2"], [39, "case-2"]], "Case 3": [[34, "case-3"], [36, "case-3"], [39, "case-3"]], "Pre-trained Model": [[34, "pre-trained-model"], [36, "pre-trained-model"], [37, "pre-trained-model"], [39, "pre-trained-model"], [43, "pre-trained-model"], [47, "pre-trained-model"], [48, "pre-trained-model"], [50, "pre-trained-model"]], "Install kaldifeat": [[34, "install-kaldifeat"], [36, "install-kaldifeat"], [37, "install-kaldifeat"], [39, "install-kaldifeat"], [43, "install-kaldifeat"], [47, "install-kaldifeat"], [48, "install-kaldifeat"]], "Usage": [[34, "usage"], [36, "usage"], [37, "usage"], [39, "usage"]], "CTC decoding": [[34, "ctc-decoding"], [39, "ctc-decoding"], [39, "id2"]], "HLG decoding": [[34, "hlg-decoding"], [34, "id2"], [37, "hlg-decoding"], [39, "hlg-decoding"], [39, "id3"]], "HLG decoding + attention decoder rescoring": [[34, "hlg-decoding-attention-decoder-rescoring"]], "Colab notebook": [[34, "colab-notebook"], [36, "colab-notebook"], [37, "colab-notebook"], [39, "colab-notebook"], [43, "colab-notebook"], [47, "colab-notebook"], [48, "colab-notebook"], [50, "colab-notebook"]], "Deployment with C++": [[34, "deployment-with-c"], [39, "deployment-with-c"]], "aishell": [[35, "aishell"]], "Stateless Transducer": [[36, "stateless-transducer"]], "The Model": [[36, "the-model"]], "The Loss": [[36, "the-loss"]], "Todo": [[36, "id1"]], "Greedy search": [[36, "greedy-search"]], "Beam search": [[36, "beam-search"]], "Modified Beam search": [[36, "modified-beam-search"]], "TDNN-LSTM CTC": [[37, "tdnn-lstm-ctc"]], "Non Streaming ASR": [[38, "non-streaming-asr"]], "HLG decoding + LM rescoring": [[39, "hlg-decoding-lm-rescoring"]], "HLG decoding + LM rescoring + attention decoder rescoring": [[39, "hlg-decoding-lm-rescoring-attention-decoder-rescoring"]], "Compute WER with the pre-trained model": [[39, "compute-wer-with-the-pre-trained-model"]], "HLG decoding + n-gram LM rescoring": [[39, "hlg-decoding-n-gram-lm-rescoring"]], "HLG decoding + n-gram LM rescoring + attention decoder rescoring": [[39, "hlg-decoding-n-gram-lm-rescoring-attention-decoder-rescoring"]], "Distillation with HuBERT": [[40, "distillation-with-hubert"]], "Codebook index preparation": [[40, "codebook-index-preparation"]], "LibriSpeech": [[41, "librispeech"], [55, "librispeech"]], "Pruned transducer statelessX": [[42, "pruned-transducer-statelessx"], [57, "pruned-transducer-statelessx"]], "Usage example": [[42, "usage-example"], [44, "usage-example"], [45, "usage-example"], [56, "usage-example"], [57, "usage-example"], [58, "usage-example"]], "Export Model": [[42, "export-model"], [57, "export-model"], [58, "export-model"]], "Export model using torch.jit.script()": [[42, "export-model-using-torch-jit-script"], [44, "export-model-using-torch-jit-script"], [45, "export-model-using-torch-jit-script"], [57, 
"export-model-using-torch-jit-script"], [58, "export-model-using-torch-jit-script"]], "Download pretrained models": [[42, "download-pretrained-models"], [44, "download-pretrained-models"], [45, "download-pretrained-models"], [56, "download-pretrained-models"], [57, "download-pretrained-models"], [58, "download-pretrained-models"], [60, "download-pretrained-models"], [61, "download-pretrained-models"]], "Deploy with Sherpa": [[42, "deploy-with-sherpa"], [57, "deploy-with-sherpa"], [58, "deploy-with-sherpa"]], "TDNN-LSTM-CTC": [[43, "tdnn-lstm-ctc"], [48, "tdnn-lstm-ctc"]], "Inference with a pre-trained model": [[43, "inference-with-a-pre-trained-model"], [47, "inference-with-a-pre-trained-model"], [48, "inference-with-a-pre-trained-model"], [50, "inference-with-a-pre-trained-model"]], "Zipformer CTC Blank Skip": [[44, "zipformer-ctc-blank-skip"]], "Export models": [[44, "export-models"], [45, "export-models"], [56, "export-models"], [60, "export-models"], [61, "export-models"]], "Zipformer MMI": [[45, "zipformer-mmi"]], "TIMIT": [[46, "timit"]], "TDNN-LiGRU-CTC": [[47, "tdnn-ligru-ctc"]], "YesNo": [[49, "yesno"]], "TDNN-CTC": [[50, "tdnn-ctc"]], "Download kaldifeat": [[50, "download-kaldifeat"]], "RNN-LM": [[51, "rnn-lm"]], "Train an RNN language model": [[52, "train-an-rnn-language-model"]], "Streaming ASR": [[53, "streaming-asr"]], "Streaming Conformer": [[54, "streaming-conformer"]], "Streaming Emformer": [[54, "streaming-emformer"]], "LSTM Transducer": [[56, "lstm-transducer"]], "Which model to use": [[56, "which-model-to-use"]], "Export model using torch.jit.trace()": [[56, "export-model-using-torch-jit-trace"], [58, "export-model-using-torch-jit-trace"]], "Simulate streaming decoding": [[57, "simulate-streaming-decoding"], [58, "simulate-streaming-decoding"]], "Real streaming decoding": [[57, "real-streaming-decoding"], [58, "real-streaming-decoding"]], "Zipformer Transducer": [[58, "zipformer-transducer"]], "TTS": [[59, "tts"]], "VITS-LJSpeech": [[60, "vits-ljspeech"]], "Install extra dependencies": [[60, "install-extra-dependencies"]], "Build Monotonic Alignment Search": [[60, "build-monotonic-alignment-search"], [61, "build-monotonic-alignment-search"]], "Inference": [[60, "inference"], [61, "inference"]], "Usage in sherpa-onnx": [[60, "usage-in-sherpa-onnx"]], "Install sherpa-onnx": [[60, "install-sherpa-onnx"]], "Download lexicon files": [[60, "download-lexicon-files"]], "Run sherpa-onnx": [[60, "run-sherpa-onnx"]], "VITS-VCTK": [[61, "vits-vctk"]], "Recipes": [[62, "recipes"]]}, "indexentries": {}})
\ No newline at end of file
+Search.setIndex({"docnames": ["contributing/code-style", "contributing/doc", "contributing/how-to-create-a-recipe", "contributing/index", "decoding-with-langugage-models/LODR", "decoding-with-langugage-models/index", "decoding-with-langugage-models/rescoring", "decoding-with-langugage-models/shallow-fusion", "docker/index", "docker/intro", "faqs", "for-dummies/data-preparation", "for-dummies/decoding", "for-dummies/environment-setup", "for-dummies/index", "for-dummies/model-export", "for-dummies/training", "fst-based-forced-alignment/diff", "fst-based-forced-alignment/index", "fst-based-forced-alignment/k2-based", "fst-based-forced-alignment/kaldi-based", "huggingface/index", "huggingface/pretrained-models", "huggingface/spaces", "index", "installation/index", "model-export/export-model-state-dict", "model-export/export-ncnn", "model-export/export-ncnn-conv-emformer", "model-export/export-ncnn-lstm", "model-export/export-ncnn-zipformer", "model-export/export-onnx", "model-export/export-with-torch-jit-script", "model-export/export-with-torch-jit-trace", "model-export/index", "recipes/Finetune/adapter/finetune_adapter", "recipes/Finetune/from_supervised/finetune_zipformer", "recipes/Finetune/index", "recipes/Non-streaming-ASR/aishell/conformer_ctc", "recipes/Non-streaming-ASR/aishell/index", "recipes/Non-streaming-ASR/aishell/stateless_transducer", "recipes/Non-streaming-ASR/aishell/tdnn_lstm_ctc", "recipes/Non-streaming-ASR/index", "recipes/Non-streaming-ASR/librispeech/conformer_ctc", "recipes/Non-streaming-ASR/librispeech/distillation", "recipes/Non-streaming-ASR/librispeech/index", "recipes/Non-streaming-ASR/librispeech/pruned_transducer_stateless", "recipes/Non-streaming-ASR/librispeech/tdnn_lstm_ctc", "recipes/Non-streaming-ASR/librispeech/zipformer_ctc_blankskip", "recipes/Non-streaming-ASR/librispeech/zipformer_mmi", "recipes/Non-streaming-ASR/timit/index", "recipes/Non-streaming-ASR/timit/tdnn_ligru_ctc", "recipes/Non-streaming-ASR/timit/tdnn_lstm_ctc", "recipes/Non-streaming-ASR/yesno/index", "recipes/Non-streaming-ASR/yesno/tdnn", "recipes/RNN-LM/index", "recipes/RNN-LM/librispeech/lm-training", "recipes/Streaming-ASR/index", "recipes/Streaming-ASR/introduction", "recipes/Streaming-ASR/librispeech/index", "recipes/Streaming-ASR/librispeech/lstm_pruned_stateless_transducer", "recipes/Streaming-ASR/librispeech/pruned_transducer_stateless", "recipes/Streaming-ASR/librispeech/zipformer_transducer", "recipes/TTS/index", "recipes/TTS/ljspeech/vits", "recipes/TTS/vctk/vits", "recipes/index"], "filenames": ["contributing/code-style.rst", "contributing/doc.rst", "contributing/how-to-create-a-recipe.rst", "contributing/index.rst", "decoding-with-langugage-models/LODR.rst", "decoding-with-langugage-models/index.rst", "decoding-with-langugage-models/rescoring.rst", "decoding-with-langugage-models/shallow-fusion.rst", "docker/index.rst", "docker/intro.rst", "faqs.rst", "for-dummies/data-preparation.rst", "for-dummies/decoding.rst", "for-dummies/environment-setup.rst", "for-dummies/index.rst", "for-dummies/model-export.rst", "for-dummies/training.rst", "fst-based-forced-alignment/diff.rst", "fst-based-forced-alignment/index.rst", "fst-based-forced-alignment/k2-based.rst", "fst-based-forced-alignment/kaldi-based.rst", "huggingface/index.rst", "huggingface/pretrained-models.rst", "huggingface/spaces.rst", "index.rst", "installation/index.rst", "model-export/export-model-state-dict.rst", "model-export/export-ncnn.rst", "model-export/export-ncnn-conv-emformer.rst", 
"model-export/export-ncnn-lstm.rst", "model-export/export-ncnn-zipformer.rst", "model-export/export-onnx.rst", "model-export/export-with-torch-jit-script.rst", "model-export/export-with-torch-jit-trace.rst", "model-export/index.rst", "recipes/Finetune/adapter/finetune_adapter.rst", "recipes/Finetune/from_supervised/finetune_zipformer.rst", "recipes/Finetune/index.rst", "recipes/Non-streaming-ASR/aishell/conformer_ctc.rst", "recipes/Non-streaming-ASR/aishell/index.rst", "recipes/Non-streaming-ASR/aishell/stateless_transducer.rst", "recipes/Non-streaming-ASR/aishell/tdnn_lstm_ctc.rst", "recipes/Non-streaming-ASR/index.rst", "recipes/Non-streaming-ASR/librispeech/conformer_ctc.rst", "recipes/Non-streaming-ASR/librispeech/distillation.rst", "recipes/Non-streaming-ASR/librispeech/index.rst", "recipes/Non-streaming-ASR/librispeech/pruned_transducer_stateless.rst", "recipes/Non-streaming-ASR/librispeech/tdnn_lstm_ctc.rst", "recipes/Non-streaming-ASR/librispeech/zipformer_ctc_blankskip.rst", "recipes/Non-streaming-ASR/librispeech/zipformer_mmi.rst", "recipes/Non-streaming-ASR/timit/index.rst", "recipes/Non-streaming-ASR/timit/tdnn_ligru_ctc.rst", "recipes/Non-streaming-ASR/timit/tdnn_lstm_ctc.rst", "recipes/Non-streaming-ASR/yesno/index.rst", "recipes/Non-streaming-ASR/yesno/tdnn.rst", "recipes/RNN-LM/index.rst", "recipes/RNN-LM/librispeech/lm-training.rst", "recipes/Streaming-ASR/index.rst", "recipes/Streaming-ASR/introduction.rst", "recipes/Streaming-ASR/librispeech/index.rst", "recipes/Streaming-ASR/librispeech/lstm_pruned_stateless_transducer.rst", "recipes/Streaming-ASR/librispeech/pruned_transducer_stateless.rst", "recipes/Streaming-ASR/librispeech/zipformer_transducer.rst", "recipes/TTS/index.rst", "recipes/TTS/ljspeech/vits.rst", "recipes/TTS/vctk/vits.rst", "recipes/index.rst"], "titles": ["Follow the code style", "Contributing to Documentation", "How to create a recipe", "Contributing", "LODR for RNN Transducer", "Decoding with language models", "LM rescoring for Transducer", "Shallow fusion for Transducer", "Docker", "Introduction", "Frequently Asked Questions (FAQs)", "Data Preparation", "Decoding", "Environment setup", "Icefall for dummies tutorial", "Model Export", "Training", "Two approaches", "FST-based forced alignment", "k2-based forced alignment", "Kaldi-based forced alignment", "Huggingface", "Pre-trained models", "Huggingface spaces", "Icefall", "Installation", "Export model.state_dict()", "Export to ncnn", "Export ConvEmformer transducer models to ncnn", "Export LSTM transducer models to ncnn", "Export streaming Zipformer transducer models to ncnn", "Export to ONNX", "Export model with torch.jit.script()", "Export model with torch.jit.trace()", "Model export", "Finetune from a pre-trained Zipformer model with adapters", "Finetune from a supervised pre-trained Zipformer model", "Fine-tune a pre-trained model", "Conformer CTC", "aishell", "Stateless Transducer", "TDNN-LSTM CTC", "Non Streaming ASR", "Conformer CTC", "Distillation with HuBERT", "LibriSpeech", "Pruned transducer statelessX", "TDNN-LSTM-CTC", "Zipformer CTC Blank Skip", "Zipformer MMI", "TIMIT", "TDNN-LiGRU-CTC", "TDNN-LSTM-CTC", "YesNo", "TDNN-CTC", "RNN-LM", "Train an RNN language model", "Streaming ASR", "Introduction", "LibriSpeech", "LSTM Transducer", "Pruned transducer statelessX", "Zipformer Transducer", "TTS", "VITS-LJSpeech", "VITS-VCTK", "Recipes"], "terms": {"we": [0, 1, 2, 3, 4, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 22, 23, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 
43, 44, 46, 47, 48, 49, 51, 52, 54, 56, 58, 60, 61, 62, 64, 65, 66], "us": [0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 21, 23, 24, 25, 27, 28, 29, 30, 31, 34, 35, 36, 38, 39, 40, 41, 43, 44, 47, 51, 52, 54, 56, 58, 64, 65], "tool": [0, 10, 25, 28], "make": [0, 1, 3, 20, 28, 29, 30, 35, 38, 40, 43, 58], "consist": [0, 40, 46, 60, 61, 62], "possibl": [0, 2, 3, 38, 43], "black": 0, "format": [0, 28, 29, 30, 35, 38, 40, 41, 43, 46, 47, 48, 49, 51, 52, 54, 56, 60, 61, 62], "flake8": 0, "check": [0, 25, 43, 56, 64], "qualiti": [0, 39], "isort": 0, "sort": [0, 25, 56], "import": [0, 9, 10, 15, 20, 25, 28, 61, 62], "The": [0, 1, 2, 4, 5, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17, 20, 23, 25, 26, 28, 29, 30, 35, 36, 38, 39, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 56, 58, 60, 61, 62, 64, 65], "version": [0, 9, 13, 15, 24, 25, 26, 28, 29, 30, 38, 40, 41, 43, 46, 47, 51, 52, 61], "abov": [0, 4, 6, 7, 10, 13, 15, 20, 26, 28, 29, 30, 31, 38, 39, 40, 41, 43, 46, 48, 49, 54, 58, 60, 61, 62, 64], "ar": [0, 1, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 15, 16, 25, 26, 28, 29, 30, 35, 36, 37, 38, 39, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 56, 60, 61, 62, 64, 65], "22": [0, 9, 15, 20, 25, 28, 29, 35, 43, 51, 52, 54], "3": [0, 4, 6, 7, 9, 10, 11, 15, 20, 24, 26, 27, 31, 34, 35, 41, 44, 46, 47, 48, 49, 54, 56, 60, 61, 62, 64, 65], "0": [0, 1, 4, 6, 7, 9, 11, 13, 15, 20, 24, 26, 28, 29, 30, 31, 35, 36, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 56, 60, 61, 62, 64, 65], "5": [0, 7, 15, 20, 27, 34, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 60, 61, 62, 64], "4": [0, 4, 5, 6, 7, 9, 10, 11, 13, 15, 20, 24, 26, 27, 34, 35, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 56, 60, 61, 62, 64, 65], "10": [0, 7, 9, 15, 20, 24, 25, 26, 28, 29, 30, 35, 36, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 56, 60, 61, 62], "1": [0, 4, 6, 7, 9, 11, 13, 15, 20, 24, 26, 27, 31, 32, 33, 34, 35, 36, 44, 46, 47, 48, 49, 51, 52, 54, 56, 60, 61, 62, 64, 65], "after": [0, 1, 6, 9, 11, 12, 13, 16, 23, 25, 26, 28, 29, 30, 35, 36, 37, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 56, 58, 60, 61, 62, 64], "run": [0, 2, 8, 10, 11, 13, 14, 15, 23, 24, 25, 28, 29, 30, 31, 34, 35, 36, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 56, 60, 61, 62, 65], "command": [0, 1, 4, 6, 7, 9, 10, 11, 12, 13, 15, 16, 20, 25, 26, 28, 29, 33, 35, 36, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 56, 60, 61, 62, 64, 65], "git": [0, 4, 6, 7, 9, 13, 15, 25, 26, 28, 29, 30, 31, 35, 36, 38, 40, 41, 43, 47, 51, 52, 54, 56], "clone": [0, 4, 6, 7, 9, 13, 25, 26, 28, 29, 30, 31, 35, 36, 38, 40, 41, 43, 47, 51, 52, 54, 56], "http": [0, 1, 2, 4, 6, 7, 9, 10, 11, 13, 15, 20, 22, 23, 25, 26, 27, 28, 29, 30, 31, 32, 33, 35, 36, 38, 39, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 56, 60, 61, 62, 64, 65], "github": [0, 2, 6, 9, 11, 13, 15, 22, 25, 26, 27, 28, 29, 30, 31, 32, 33, 38, 40, 41, 43, 46, 47, 48, 49, 51, 52, 54, 60, 61, 62, 64], "com": [0, 2, 6, 9, 11, 13, 22, 23, 25, 26, 28, 29, 32, 33, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 60, 61, 62, 64], "k2": [0, 2, 9, 10, 13, 15, 17, 18, 22, 23, 24, 26, 27, 28, 29, 30, 31, 32, 33, 38, 40, 41, 43, 46, 47, 48, 49, 51, 52, 60, 61, 62, 64], "fsa": [0, 2, 9, 13, 15, 22, 23, 25, 26, 27, 28, 29, 30, 31, 32, 33, 38, 40, 43, 46, 48, 49, 60, 61, 62, 64], "icefal": [0, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 15, 16, 20, 22, 23, 26, 27, 31, 32, 33, 34, 35, 36, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 56, 58, 60, 61, 62, 64, 65, 66], "cd": [0, 1, 2, 4, 6, 7, 9, 10, 11, 12, 13, 15, 16, 25, 26, 28, 29, 
30, 31, 32, 33, 35, 36, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 56, 60, 61, 62, 64, 65], "pip": [0, 1, 6, 10, 13, 15, 25, 28, 31, 40, 64], "instal": [0, 1, 4, 6, 10, 14, 15, 17, 20, 21, 23, 24, 26, 27, 31, 34, 35, 36, 44, 46, 48, 49, 54, 60, 61, 62, 63], "pre": [0, 3, 4, 6, 7, 8, 9, 15, 21, 23, 24, 25, 27, 34, 44, 64, 66], "commit": [0, 25], "whenev": 0, "you": [0, 1, 2, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 20, 22, 23, 25, 26, 28, 29, 30, 31, 32, 33, 35, 36, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 56, 58, 60, 61, 62, 64, 65], "automat": [0, 14, 23, 44], "hook": 0, "invok": 0, "fail": [0, 20], "If": [0, 2, 4, 6, 7, 8, 9, 10, 11, 13, 15, 20, 23, 28, 29, 30, 32, 33, 35, 36, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 56, 58, 60, 61, 62, 64, 65], "ani": [0, 4, 6, 7, 13, 20, 25, 38, 40, 41, 43, 44, 46, 48, 49, 54, 60, 61], "your": [0, 1, 2, 4, 6, 7, 9, 11, 13, 20, 21, 23, 24, 28, 29, 30, 31, 35, 36, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 56, 60, 61, 62, 64], "wa": [0, 26, 43, 47], "success": [0, 25, 28, 29], "pleas": [0, 1, 2, 4, 5, 6, 7, 9, 10, 11, 13, 14, 15, 23, 25, 27, 28, 29, 30, 31, 32, 33, 35, 36, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 56, 58, 60, 61, 62, 64, 65], "fix": [0, 9, 10, 13, 28, 29, 30, 43], "issu": [0, 4, 6, 7, 10, 25, 28, 29, 43, 44, 61, 62], "report": [0, 9, 10, 35, 44], "some": [0, 1, 4, 6, 9, 26, 28, 29, 38, 40, 41, 43, 46, 47, 48, 49, 51, 52, 54, 60, 61, 62], "i": [0, 1, 2, 4, 5, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17, 20, 23, 25, 26, 27, 28, 29, 30, 31, 35, 36, 37, 38, 39, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 56, 58, 60, 61, 62, 64], "e": [0, 2, 4, 5, 6, 7, 13, 20, 28, 29, 30, 36, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 60, 61, 62, 64, 65], "modifi": [0, 20, 27, 34, 38, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 58, 60, 61, 62], "file": [0, 2, 9, 14, 15, 18, 23, 24, 26, 28, 29, 30, 32, 33, 34, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 56, 60, 61, 62, 65], "place": [0, 25, 26, 40, 43, 47], "so": [0, 4, 6, 7, 9, 13, 23, 24, 25, 26, 28, 29, 30, 35, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 60, 61, 62, 64, 65], "statu": 0, "failur": 0, "see": [0, 1, 6, 7, 9, 15, 23, 25, 28, 29, 30, 31, 32, 33, 35, 36, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 58, 60, 61, 62], "which": [0, 2, 4, 6, 7, 9, 11, 12, 15, 17, 23, 25, 26, 28, 29, 30, 31, 38, 39, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 56, 61, 62, 64], "ha": [0, 2, 17, 24, 25, 27, 28, 29, 30, 31, 38, 40, 41, 43, 46, 47, 48, 49, 51, 52, 58, 60, 61, 62], "been": [0, 25, 27, 28, 29, 30, 40], "befor": [0, 1, 11, 13, 15, 20, 25, 26, 28, 29, 30, 31, 32, 35, 36, 38, 40, 41, 43, 44, 46, 48, 49, 60, 61, 62], "further": [0, 4, 6, 7, 15], "chang": [0, 4, 6, 7, 10, 20, 25, 28, 29, 30, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 60, 61, 62], "all": [0, 9, 11, 13, 14, 17, 20, 22, 23, 26, 28, 29, 30, 32, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 58, 60, 61, 62], "again": [0, 28, 29, 54], "should": [0, 2, 4, 6, 11, 13, 20, 28, 29, 30, 35, 36, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 56, 60, 61, 62, 64], "succe": 0, "thi": [0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 18, 20, 21, 25, 26, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 56, 58, 60, 61, 62, 64, 65, 66], "time": [0, 20, 25, 28, 29, 30, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 56, 58, 60, 61, 62, 64, 65], "succeed": [0, 20], "want": [0, 4, 6, 7, 11, 13, 15, 25, 26, 32, 33, 35, 36, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 58, 60, 61, 62, 
64, 65], "can": [0, 1, 2, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 22, 23, 25, 26, 27, 28, 29, 30, 31, 32, 33, 35, 36, 37, 38, 39, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 56, 58, 60, 61, 62, 64, 65], "do": [0, 2, 4, 6, 13, 35, 36, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 58, 60, 61, 62, 64], "Or": 0, "without": [0, 4, 6, 7, 9, 15, 17, 20, 21, 23, 38, 43], "your_changed_fil": 0, "py": [0, 2, 4, 6, 7, 9, 10, 11, 12, 13, 15, 16, 20, 25, 28, 29, 30, 31, 32, 33, 34, 35, 36, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 56, 60, 61, 62, 64, 65], "sphinx": 1, "write": [1, 2, 3, 20], "have": [1, 2, 4, 6, 7, 8, 9, 11, 13, 20, 22, 23, 25, 26, 28, 29, 30, 31, 35, 36, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 56, 58, 60, 61, 62, 64, 65], "prepar": [1, 3, 4, 8, 14, 16, 18, 24, 26, 37, 63], "environ": [1, 10, 11, 12, 14, 16, 18, 24, 28, 29, 30, 35, 36, 38, 39, 40, 41, 43, 44, 46, 47, 51, 52, 54, 61, 62], "doc": [1, 26, 58], "r": [1, 13, 20, 25, 28, 29, 30, 51, 52], "requir": [1, 4, 6, 11, 13, 15, 25, 30, 35, 36, 44, 56, 61, 62, 64, 65], "txt": [1, 4, 9, 11, 13, 15, 20, 25, 26, 28, 29, 30, 31, 32, 33, 35, 38, 40, 41, 43, 47, 51, 52, 54, 56, 64, 65], "set": [1, 4, 6, 7, 10, 12, 13, 16, 20, 25, 28, 29, 30, 35, 36, 37, 38, 40, 41, 43, 44, 46, 48, 49, 54, 56, 60, 61, 62], "up": [1, 25, 26, 28, 29, 30, 35, 38, 41, 43, 44, 46, 47, 48, 49, 61, 62], "readi": [1, 20, 38, 43, 44, 56], "refer": [1, 2, 5, 6, 7, 11, 13, 15, 18, 20, 25, 26, 27, 28, 29, 30, 32, 33, 35, 38, 40, 41, 43, 46, 47, 48, 51, 52, 54, 56, 58, 61, 62, 64], "restructuredtext": 1, "primer": 1, "familiar": 1, "build": [1, 9, 15, 25, 26, 28, 29, 30, 38, 40, 43, 63], "local": [1, 9, 15, 20, 25, 46, 48, 49, 56, 60, 61, 62], "preview": 1, "what": [1, 2, 11, 15, 20, 25, 28, 29, 30, 40, 58, 64], "look": [1, 2, 4, 6, 7, 14, 20, 22, 25, 28, 29, 30, 38, 40, 41, 43, 44], "like": [1, 2, 9, 11, 20, 23, 28, 29, 30, 38, 40, 41, 43, 46, 48, 49, 54, 58, 60, 61], "publish": [1, 26, 39], "html": [1, 2, 10, 11, 13, 15, 20, 25, 27, 28, 29, 30, 31, 32, 33, 46, 60, 61, 62, 64], "gener": [1, 6, 9, 14, 15, 18, 26, 28, 29, 30, 31, 32, 33, 37, 38, 40, 41, 43, 44, 46, 48, 49, 60, 61, 62, 64, 65], "view": [1, 8, 24, 28, 29, 30, 38, 40, 41, 43, 46, 48, 49, 54, 60, 61, 62], "follow": [1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17, 20, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 35, 36, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 56, 60, 61, 62, 64, 65], "python3": [1, 9, 10, 13, 15, 25, 29, 30], "m": [1, 15, 20, 25, 28, 29, 30, 40, 46, 48, 49, 51, 52, 60, 61, 62], "server": [1, 23, 60], "It": [1, 2, 6, 7, 9, 11, 14, 15, 20, 21, 25, 27, 28, 29, 30, 31, 32, 33, 35, 38, 39, 40, 41, 43, 46, 47, 48, 49, 51, 52, 54, 58, 60, 61, 62, 64, 65], "print": [1, 12, 16, 20, 25, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 60, 61, 62], "serv": [1, 46, 48, 49, 60, 61, 62], "port": [1, 14, 17, 35, 36, 44, 46, 48, 49, 60, 61, 62], "8000": [1, 11, 15, 54], "open": [1, 4, 6, 7, 9, 20, 24, 26, 28, 29, 30, 39, 40, 43, 44], "browser": [1, 20, 21, 23, 46, 48, 49, 60, 61, 62], "go": [1, 7, 38, 40, 43, 46, 48, 49, 60, 61, 62, 64], "read": [2, 11, 15, 20, 25, 26, 28, 29, 30, 35, 36, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 60, 61, 62], "code": [2, 3, 8, 10, 13, 15, 17, 20, 24, 25, 28, 29, 30, 38, 43, 44, 46, 47, 51, 52, 54, 58, 61, 62], "style": [2, 3, 24], "adjust": [2, 56, 64, 65], "design": 2, "python": [2, 9, 13, 15, 17, 25, 26, 28, 29, 30, 31, 32, 33, 38, 40, 43, 46, 48, 49, 56, 60, 61, 62, 64, 65], "recommend": [2, 6, 7, 9, 25, 35, 36, 38, 40, 41, 
43, 44, 46, 61, 62], "test": [2, 4, 9, 15, 18, 24, 26, 27, 34, 35, 36, 38, 40, 41, 43, 44, 47, 48, 51, 52, 56, 64, 65], "valid": [2, 25, 30, 38, 40, 41, 43, 46, 47, 48, 49, 51, 52, 54, 56, 60, 61, 62], "dataset": [2, 10, 11, 13, 14, 25, 26, 35, 36, 37, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 58, 60, 61, 62, 64, 65], "lhots": [2, 9, 11, 13, 15, 24, 26, 28, 29, 30, 38, 40, 43], "readthedoc": [2, 11, 25], "io": [2, 9, 11, 13, 15, 25, 27, 28, 29, 30, 31, 32, 33, 46, 60, 61, 62, 64], "en": [2, 11, 25, 28], "latest": [2, 9, 11, 13, 23, 25, 43, 44, 46, 47, 48, 49, 60, 61, 62], "index": [2, 25, 27, 28, 29, 30, 31, 32, 33, 60, 61, 62], "yesno": [2, 8, 10, 11, 12, 13, 14, 15, 16, 24, 25, 42, 54, 66], "veri": [2, 3, 7, 13, 28, 29, 30, 35, 36, 40, 51, 52, 54, 61, 62], "good": [2, 7], "exampl": [2, 11, 13, 23, 24, 26, 28, 29, 30, 32, 33, 34, 44, 47, 51, 52, 54], "speech": [2, 11, 13, 14, 23, 24, 25, 27, 37, 39, 40, 54, 64, 65, 66], "pull": [2, 4, 6, 7, 9, 28, 29, 30, 31, 35, 36, 38, 40, 43, 56, 58], "380": [2, 28, 52], "show": [2, 4, 6, 7, 9, 15, 23, 25, 26, 28, 29, 30, 35, 36, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 56, 58, 60, 61, 62, 64, 65], "add": [2, 11, 28, 29, 30, 38, 40, 41, 61, 66], "new": [2, 3, 9, 13, 23, 25, 28, 29, 30, 35, 36, 37, 38, 39, 40, 41, 43, 44, 46, 47, 48, 49, 54, 60, 61, 62], "suppos": [2, 9, 61, 62], "would": [2, 11, 26, 28, 29, 30, 43, 47, 61, 62], "name": [2, 9, 10, 13, 15, 26, 28, 29, 30, 31, 38, 40, 46, 48, 49, 56, 61, 62], "foo": [2, 33, 38, 43, 46, 48, 49, 60, 61, 62], "eg": [2, 9, 10, 11, 12, 15, 16, 20, 22, 25, 26, 28, 29, 30, 31, 32, 33, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 56, 60, 61, 62, 64, 65], "mkdir": [2, 9, 28, 29, 38, 40, 41, 43, 47, 51, 52, 54], "p": [2, 4, 13, 20, 25, 28, 29, 40, 51, 52], "asr": [2, 4, 6, 7, 9, 10, 11, 12, 14, 15, 16, 20, 22, 24, 25, 26, 28, 29, 30, 31, 32, 33, 35, 36, 37, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 56, 58, 60, 61, 62, 66], "touch": 2, "sh": [2, 9, 11, 25, 26, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 60, 61, 62, 64, 65], "chmod": 2, "x": [2, 4, 20, 30, 58], "simpl": [2, 12, 14, 16, 25, 40, 56], "own": [2, 11, 35, 36, 44, 46, 56, 61, 62], "otherwis": [2, 28, 29, 30, 36, 38, 40, 43, 44, 46, 48, 49, 60, 61, 62], "librispeech": [2, 4, 6, 7, 10, 20, 22, 24, 26, 28, 29, 30, 31, 32, 33, 35, 36, 42, 43, 44, 46, 47, 48, 49, 56, 57, 58, 60, 61, 62, 66], "assum": [2, 4, 15, 25, 26, 28, 29, 30, 31, 35, 36, 38, 40, 41, 43, 44, 46, 47, 51, 52, 54, 56, 60, 61, 62], "fanci": 2, "call": [2, 10, 31, 44, 56], "bar": [2, 33, 38, 43, 46, 48, 49, 60, 61, 62], "organ": 2, "wai": [2, 3, 15, 34, 46, 48, 49, 58, 60, 61, 62], "readm": [2, 38, 40, 41, 43, 47, 51, 52, 54], "md": [2, 22, 26, 38, 40, 41, 43, 46, 47, 48, 49, 51, 52, 54, 60, 61, 62], "asr_datamodul": [2, 9, 10, 15, 25], "pretrain": [2, 4, 6, 7, 15, 26, 28, 29, 30, 31, 33, 35, 36, 38, 40, 41, 43, 47, 51, 52, 54, 63], "For": [2, 4, 6, 7, 9, 10, 14, 20, 22, 25, 26, 28, 29, 30, 35, 36, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 56, 60, 61, 62, 64, 65], "instanc": [2, 9, 10, 12, 16, 22, 28, 29, 38, 40, 41, 43, 46, 47, 48, 49, 51, 52, 54, 60, 61, 62], "tdnn": [2, 9, 10, 12, 15, 16, 25, 39, 42, 45, 50, 53], "its": [2, 4, 20, 26, 27, 28, 29, 30, 33, 40, 48, 56], "directori": [2, 9, 11, 13, 24, 25, 28, 29, 30, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 56, 60, 61, 62, 64, 65], "structur": [2, 30], "descript": [2, 38, 40, 41, 43, 46, 47, 48, 49, 51, 52, 54, 60, 61, 62], "contain": [2, 8, 11, 13, 14, 15, 20, 24, 26, 27, 28, 29, 30, 38, 40, 41, 43, 44, 
46, 47, 48, 49, 51, 52, 54, 56, 60, 61, 62, 66], "inform": [2, 4, 6, 11, 12, 16, 25, 26, 38, 40, 41, 43, 46, 47, 48, 51, 52, 54, 58, 60, 61, 62], "g": [2, 4, 5, 6, 7, 11, 13, 20, 25, 30, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 60, 61, 62, 64, 65], "wer": [2, 5, 9, 12, 15, 25, 26, 35, 36, 46, 47, 48, 49, 51, 52, 54, 56, 60, 61, 62], "etc": [2, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 58, 60, 61, 62, 64], "provid": [2, 11, 15, 23, 25, 26, 27, 28, 29, 30, 38, 39, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 60, 61, 62, 66], "pytorch": [2, 10, 13, 20, 25, 28, 29, 30, 40], "dataload": [2, 25], "take": [2, 7, 9, 26, 44, 46, 54, 56, 61, 62, 64, 65], "input": [2, 26, 28, 29, 30, 38, 40, 41, 43, 47, 51, 52, 54, 58], "checkpoint": [2, 4, 6, 7, 12, 15, 20, 25, 26, 28, 29, 30, 35, 36, 38, 40, 41, 43, 46, 47, 48, 49, 51, 52, 54, 60, 61, 62, 64, 65], "save": [2, 15, 16, 25, 26, 29, 30, 32, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 60, 61, 62, 64, 65], "dure": [2, 4, 5, 7, 10, 13, 20, 23, 26, 35, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 56, 60, 61, 62], "stage": [2, 25, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 60, 61, 62, 64, 65], "": [2, 4, 6, 7, 9, 14, 15, 16, 20, 25, 26, 28, 29, 30, 31, 32, 35, 36, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 56, 60, 61, 62], "definit": [2, 28, 29], "neural": [2, 4, 6, 7, 35, 38, 43, 56], "network": [2, 35, 38, 40, 43, 46, 48, 49, 56, 60, 61, 62], "script": [2, 6, 7, 13, 14, 24, 25, 33, 34, 38, 40, 41, 43, 44, 47, 51, 52, 54, 56, 60], "infer": [2, 26, 28, 29, 63], "tdnn_lstm_ctc": [2, 41, 47, 52], "conformer_ctc": [2, 38, 43], "get": [2, 9, 13, 14, 15, 18, 23, 25, 28, 29, 30, 38, 40, 41, 43, 44, 46, 47, 48, 49, 54, 58, 60, 61, 62, 64], "feel": [2, 44, 56, 60], "result": [2, 4, 7, 9, 16, 22, 23, 26, 28, 29, 30, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 60, 61, 62], "everi": [2, 26, 46, 48, 49, 60, 61, 62], "kept": [2, 46, 61, 62], "self": [2, 27, 30, 58], "toler": 2, "duplic": 2, "among": [2, 25], "differ": [2, 9, 12, 18, 25, 28, 29, 30, 31, 35, 36, 37, 38, 39, 43, 44, 46, 58, 60, 61, 62, 64], "invoc": [2, 28, 29], "help": [2, 12, 14, 16, 36, 38, 40, 41, 43, 46, 47, 48, 49, 51, 52, 54, 56, 60, 61, 62, 64, 65], "blob": [2, 11, 22, 25, 26, 33, 46, 48, 49, 60, 61, 62], "master": [2, 6, 9, 11, 15, 22, 25, 26, 29, 30, 32, 33, 35, 36, 40, 44, 46, 48, 49, 60, 61, 62], "transform": [2, 6, 7, 38, 43, 60], "conform": [2, 32, 39, 40, 42, 45, 46, 48, 60, 61, 62], "base": [2, 4, 7, 13, 17, 24, 30, 35, 36, 38, 40, 41, 43, 44, 46, 48, 49, 56, 60, 61, 62], "lstm": [2, 27, 33, 34, 39, 42, 45, 50, 57, 59], "attent": [2, 30, 40, 41, 44, 58, 61, 62], "lm": [2, 4, 5, 7, 9, 11, 24, 25, 40, 46, 47, 51, 52, 54, 56, 61, 62, 66], "rescor": [2, 5, 24, 41, 47, 49, 51, 52, 54, 56], "demonstr": [2, 14, 15, 21, 23, 26, 31, 35], "consid": [2, 4, 30, 36], "colab": [2, 20, 25], "notebook": [2, 20, 25], "welcom": 3, "There": [3, 4, 15, 28, 29, 30, 31, 38, 40, 41, 43, 44, 46, 48, 49, 60, 61, 62], "mani": [3, 12, 25, 61, 62], "two": [3, 4, 11, 14, 15, 18, 20, 24, 28, 29, 30, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 58, 60, 61, 62, 65], "them": [3, 5, 6, 21, 22, 23, 28, 30, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 60, 61, 62], "To": [3, 4, 5, 6, 7, 9, 11, 15, 20, 23, 25, 35, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 60, 61, 62, 64, 65], "document": [3, 24, 26, 27, 28, 29, 30, 31, 49, 64], "repositori": [3, 9, 28, 29, 30, 31], "recip": [3, 4, 6, 7, 9, 11, 15, 22, 24, 25, 26, 31, 35, 36, 38, 40, 41, 43, 44, 46, 47, 51, 52, 54, 56, 58, 60, 61, 62, 
64, 65], "In": [3, 4, 6, 10, 15, 23, 25, 26, 28, 29, 30, 31, 32, 33, 34, 37, 38, 40, 41, 43, 44, 47, 51, 52, 54, 58], "page": [3, 23, 32, 35, 36, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 58, 60, 61, 62, 66], "describ": [3, 5, 8, 9, 17, 18, 20, 21, 26, 28, 29, 31, 32, 33, 34, 38, 40, 41, 43, 46, 47, 51, 52, 61, 62, 64], "how": [3, 4, 5, 6, 7, 8, 9, 11, 12, 14, 15, 18, 20, 21, 23, 24, 25, 28, 29, 30, 31, 34, 35, 36, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 56, 58, 60, 61, 62, 64, 65], "creat": [3, 4, 6, 7, 14, 15, 18, 24, 26, 28, 29, 30, 35, 36, 38, 40, 41, 43, 46, 47, 48, 49, 51, 52, 54, 56, 60, 61], "data": [3, 4, 6, 7, 8, 13, 14, 15, 16, 18, 24, 26, 28, 29, 30, 31, 32, 33, 37, 39, 56, 63], "train": [3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15, 18, 20, 21, 23, 24, 26, 27, 32, 33, 34, 55, 58, 63, 66], "decod": [3, 4, 8, 10, 11, 14, 15, 17, 20, 23, 24, 28, 29, 30, 33, 34, 36, 37, 56], "model": [3, 4, 6, 7, 9, 11, 12, 14, 17, 18, 20, 21, 23, 24, 25, 27, 44, 55, 58, 63, 66], "As": [4, 5, 6, 7, 28, 40, 43, 44, 56], "type": [4, 6, 7, 9, 11, 15, 20, 25, 26, 28, 29, 30, 38, 40, 43, 46, 48, 49, 54, 58, 60, 61, 62, 64], "e2": [4, 7, 25, 56], "usual": [4, 6, 7, 12, 38, 40, 41, 43, 44, 46, 48, 49, 56, 60, 61, 62, 64, 65], "an": [4, 5, 6, 7, 9, 11, 13, 15, 18, 23, 24, 25, 26, 28, 29, 30, 31, 32, 33, 38, 39, 40, 43, 44, 46, 49, 54, 55, 60, 61, 62, 64, 65, 66], "intern": [4, 5], "languag": [4, 7, 11, 23, 24, 38, 40, 41, 55, 64, 66], "learn": [4, 36, 38, 40, 41, 43, 46, 47, 48, 49, 51, 52, 54, 60, 61, 62, 64, 65], "level": [4, 5, 15, 20, 56], "corpu": [4, 6, 7, 39, 56], "real": 4, "life": 4, "scenario": 4, "often": [4, 38, 40, 41, 43, 46, 48, 49, 60, 61, 62], "mismatch": [4, 36, 61], "between": [4, 7, 18, 35, 46, 61, 62], "target": [4, 23, 25, 35], "space": [4, 21, 24, 56], "problem": [4, 6, 7, 25, 44], "when": [4, 6, 9, 10, 15, 23, 28, 29, 30, 34, 40, 43, 44, 46, 48, 49, 56, 61, 62], "act": 4, "against": [4, 25], "extern": [4, 5, 6, 7], "tutori": [4, 5, 6, 7, 13, 15, 18, 20, 24, 35, 36, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 56, 60, 61, 62, 64, 65], "low": [4, 28, 29, 37, 64], "order": [4, 13, 17, 25, 28, 29, 30, 38, 41, 43, 47, 51, 52], "densiti": 4, "ratio": [4, 20], "allevi": 4, "effect": [4, 7, 30, 35], "improv": [4, 5, 6, 7, 35, 37, 40, 56], "perform": [4, 6, 7, 18, 27, 35, 36, 37, 40, 44, 61], "languga": 4, "integr": [4, 23, 35], "pruned_transducer_stateless7_stream": [4, 6, 7, 30, 31, 62], "stream": [4, 6, 7, 15, 17, 24, 27, 28, 29, 31, 34, 38, 43, 51, 52, 60, 66], "howev": [4, 6, 7, 26, 29, 37, 44], "easili": [4, 6, 7, 35, 38, 41, 43], "appli": [4, 6, 7, 40, 58], "other": [4, 7, 9, 13, 14, 15, 20, 26, 29, 30, 31, 35, 40, 43, 44, 46, 47, 51, 52, 54, 58, 61, 62, 66], "encount": [4, 6, 7, 10, 25, 30, 38, 40, 41, 43, 44, 46, 48, 49, 60, 61, 62], "here": [4, 6, 7, 20, 26, 28, 29, 30, 38, 40, 41, 43, 44, 47, 58, 61], "simplic": [4, 6, 7], "same": [4, 6, 7, 20, 25, 26, 28, 29, 30, 35, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 58, 60, 61, 62], "domain": [4, 6, 7, 35, 36, 37], "gigaspeech": [4, 6, 7, 22, 32, 35, 36, 60], "first": [4, 6, 9, 10, 11, 25, 28, 29, 30, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 56, 60, 61, 62, 64, 65], "let": [4, 6, 7, 14, 25, 28, 29, 30, 35, 36, 38, 43, 56], "background": 4, "predecessor": 4, "dr": 4, "propos": [4, 40, 58, 62], "address": [4, 9, 15, 23, 25, 26, 28, 29, 30, 40, 46, 49, 60, 61, 62], "sourc": [4, 11, 13, 25, 26, 28, 29, 30, 38, 39, 40, 43], "acoust": [4, 61, 62], "similar": [4, 5, 36, 44, 48, 61, 62], "deriv": 4, "formula": 4, "bay": 4, 
"theorem": 4, "text": [4, 6, 7, 11, 16, 20, 28, 29, 30, 38, 40, 41, 43, 46, 47, 48, 49, 51, 52, 54, 56, 60, 61, 62, 64, 65], "score": [4, 5, 7, 38, 43, 46, 61, 62], "left": [4, 28, 30, 35, 40, 61, 62], "y_u": 4, "mathit": 4, "y": [4, 20], "right": [4, 28, 40, 58, 61], "log": [4, 9, 10, 12, 15, 16, 25, 28, 29, 30, 35, 47, 51, 52, 54, 64, 65], "y_": 4, "u": [4, 20, 25, 28, 29, 30, 38, 40, 41, 43, 44, 54], "lambda_1": 4, "p_": 4, "lambda_2": 4, "where": [4, 9, 10, 61], "weight": [4, 15, 38, 41, 43, 48, 49, 56, 60], "respect": 4, "onli": [4, 6, 8, 11, 13, 14, 15, 20, 26, 28, 29, 30, 35, 36, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 58, 60, 61, 62, 64, 65], "compar": [4, 17, 28, 29, 30, 36, 61], "shallow": [4, 5, 24, 56], "fusion": [4, 5, 24, 56], "subtract": [4, 5], "work": [4, 9, 13, 15, 28, 29, 30, 43], "treat": [4, 29, 30], "predictor": 4, "joiner": [4, 28, 29, 30, 31, 33, 35, 36, 40, 46, 60, 61, 62], "weak": 4, "captur": 4, "therefor": [4, 10], "n": [4, 5, 6, 11, 20, 25, 38, 44, 46, 48, 49, 51, 52, 60, 61, 62], "gram": [4, 6, 25, 38, 40, 41, 46, 47, 49, 51, 52, 61, 62], "approxim": [4, 5], "ilm": 4, "lead": [4, 7, 12], "rnnt": [4, 46, 61, 62], "bi": [4, 6], "addit": [4, 37], "estim": 4, "li": 4, "choic": 4, "accord": [4, 56], "origin": [4, 5, 35, 36, 37], "paper": [4, 5, 35, 44, 46, 60, 61, 62, 64, 65], "achiev": [4, 6, 7, 35, 36, 56, 58], "both": [4, 36, 46, 48, 49, 58, 60, 61, 62], "intra": 4, "cross": 4, "much": [4, 28, 29, 35, 36], "faster": [4, 6, 35, 64], "evalu": 4, "now": [4, 6, 9, 13, 15, 20, 25, 28, 29, 30, 38, 43, 44, 46, 47, 48, 49, 51, 52, 56, 60, 61, 62], "illustr": [4, 6, 7, 35, 36, 56], "purpos": [4, 6, 7, 28, 29, 35, 36, 56], "from": [4, 6, 7, 9, 10, 11, 13, 14, 15, 17, 18, 20, 21, 23, 24, 25, 26, 28, 29, 30, 31, 37, 38, 39, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 56, 58, 60, 61, 62, 64, 65, 66], "link": [4, 6, 7, 22, 25, 26, 27, 46, 48, 49, 60, 61, 62, 64, 65], "scratch": [4, 6, 7, 36, 46, 48, 49, 56, 60, 61, 62, 64, 65], "prune": [4, 6, 7, 26, 30, 31, 40, 42, 44, 45, 57, 58, 59, 60, 62], "statelessx": [4, 6, 7, 42, 44, 45, 57, 58, 59], "initi": [4, 6, 7, 9, 35, 36, 38, 41], "step": [4, 6, 7, 11, 14, 20, 25, 26, 28, 29, 30, 36, 38, 40, 41, 43, 44, 46, 48, 49, 54, 56, 60, 61, 62], "download": [4, 6, 7, 8, 10, 13, 15, 20, 23, 24, 27, 34, 35, 36, 39, 44, 56, 63], "git_lfs_skip_smudg": [4, 6, 7, 28, 29, 30, 31, 35, 36, 56], "huggingfac": [4, 6, 7, 13, 22, 24, 25, 26, 28, 29, 30, 31, 35, 36, 38, 40, 41, 43, 47, 48, 49, 51, 52, 54, 56, 60, 64, 65], "co": [4, 6, 7, 22, 23, 25, 26, 28, 29, 30, 31, 35, 36, 38, 39, 40, 41, 43, 47, 48, 49, 51, 52, 54, 56, 60, 64, 65], "zengwei": [4, 6, 7, 28, 30, 31, 35, 36, 49, 56, 60, 64], "stateless7": [4, 6, 7, 30, 31], "2022": [4, 6, 7, 26, 28, 29, 30, 31, 40, 46, 48, 49, 60, 61], "12": [4, 6, 7, 9, 14, 20, 25, 26, 28, 29, 30, 31, 35, 38, 40, 41, 43, 46, 48, 49, 51, 54, 60, 61, 62, 64, 65], "29": [4, 6, 7, 20, 25, 30, 31, 38, 40, 41, 43, 47, 48, 51, 52], "exp": [4, 6, 7, 9, 15, 16, 25, 26, 28, 29, 30, 31, 32, 33, 35, 36, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 56, 60, 61, 62, 64, 65], "lf": [4, 6, 7, 26, 28, 29, 30, 31, 35, 36, 38, 40, 41, 43, 47, 49, 51, 52, 54, 56], "includ": [4, 6, 7, 28, 29, 30, 31, 35, 36, 46, 48, 49, 56, 60, 61, 62], "pt": [4, 6, 7, 9, 11, 15, 20, 25, 26, 28, 29, 30, 31, 32, 33, 35, 36, 38, 40, 41, 43, 46, 47, 48, 49, 51, 52, 54, 56, 60, 61, 62], "ln": [4, 6, 7, 9, 15, 26, 28, 29, 30, 31, 35, 36, 38, 43, 46, 48, 49, 56, 60, 61, 62], "epoch": [4, 6, 7, 9, 12, 15, 16, 25, 26, 28, 29, 30, 31, 32, 35, 36, 
38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 56, 60, 61, 62, 64, 65], "99": [4, 6, 7, 15, 20, 25, 28, 29, 30, 31, 35, 36], "symbol": [4, 5, 6, 7, 20, 25, 40, 46, 61, 62], "load": [4, 6, 7, 9, 15, 20, 25, 28, 29, 30, 38, 40, 41, 43, 46, 47, 48, 49, 51, 52, 54, 60, 61, 62], "lang_bpe_500": [4, 6, 7, 26, 28, 29, 30, 31, 32, 33, 35, 36, 43, 46, 48, 49, 56, 60, 61, 62], "bpe": [4, 5, 6, 7, 26, 28, 29, 30, 31, 33, 35, 36, 43, 46, 48, 49, 56, 60, 61, 62], "done": [4, 6, 7, 9, 13, 15, 25, 26, 38, 40, 41, 43, 46, 47, 48, 49, 51, 52, 54, 56, 60, 61, 62], "via": [4, 6, 7, 14, 25, 27, 32, 33, 34, 35, 36, 56], "exp_dir": [4, 6, 7, 9, 15, 25, 28, 29, 30, 40, 43, 44, 46, 48, 49, 61, 62], "avg": [4, 6, 7, 9, 12, 15, 25, 26, 28, 29, 30, 31, 32, 33, 35, 36, 40, 43, 44, 46, 47, 48, 49, 51, 52, 54, 60, 61, 62], "averag": [4, 6, 7, 9, 12, 15, 25, 26, 28, 29, 30, 31, 35, 36, 38, 40, 41, 43, 46, 47, 48, 49, 51, 52, 54, 60, 61, 62], "fals": [4, 6, 7, 9, 15, 20, 25, 26, 28, 29, 30, 35, 36, 38, 40, 43, 44], "dir": [4, 6, 7, 20, 26, 28, 29, 30, 31, 32, 33, 35, 36, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 56, 60, 61, 62, 64, 65], "max": [4, 6, 7, 25, 26, 28, 29, 35, 36, 38, 40, 41, 43, 44, 46, 48, 49, 60, 61, 62, 64, 65], "durat": [4, 6, 7, 11, 26, 35, 36, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 60, 61, 62, 64, 65], "600": [4, 6, 7, 25, 26, 35, 43, 46, 48, 60, 61, 62], "chunk": [4, 6, 7, 28, 30, 31, 35, 61, 62], "len": [4, 6, 7, 20, 30, 31, 62], "32": [4, 6, 7, 20, 25, 28, 29, 30, 31, 35, 38, 40, 41, 62], "method": [4, 5, 7, 15, 23, 26, 35, 36, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 60, 61, 62], "modified_beam_search": [4, 5, 6, 7, 23, 40, 44, 46, 48, 60, 61, 62], "clean": [4, 9, 15, 25, 30, 35, 38, 40, 43, 44, 46, 47, 48, 49, 60, 61, 62], "beam_size_4": [4, 6, 7], "11": [4, 6, 7, 9, 10, 11, 15, 20, 25, 28, 29, 31, 38, 40, 41, 43, 46, 47, 48, 49, 51, 52, 54, 56, 60, 61, 62], "best": [4, 5, 6, 7, 28, 29, 30, 35, 36, 38, 41, 43], "7": [4, 6, 7, 9, 20, 25, 26, 27, 30, 34, 38, 41, 43, 46, 47, 51, 52, 60, 61], "93": [4, 6, 7, 15, 20], "Then": [4, 6], "necessari": [4, 44, 56], "note": [4, 5, 6, 7, 10, 11, 15, 17, 20, 26, 28, 35, 36, 38, 40, 41, 43, 46, 47, 48, 49, 51, 52, 54, 60, 61, 62], "960": [4, 35, 36, 43, 46, 48, 49, 60, 61, 62], "hour": [4, 13, 35, 36, 38, 40, 41, 43, 46, 48, 49, 60, 61, 62], "ezerhouni": [4, 6, 7], "pushd": [4, 6, 7, 31], "popd": [4, 6, 7, 31], "marcoyang": [4, 6], "librispeech_bigram": [4, 6], "2gram": [4, 6], "fst": [4, 11, 17, 24, 25, 40, 54], "modified_beam_search_lm_lodr": 4, "lm_dir": [4, 6, 7, 9, 25, 43], "lm_scale": [4, 6, 7], "42": [4, 9, 15, 20, 25, 29, 35, 38, 43, 54], "lodr_scal": 4, "24": [4, 9, 10, 13, 15, 20, 25, 28, 29, 41, 47, 51, 52, 54], "modified_beam_search_lodr": [4, 5, 6], "scale": [4, 6, 7, 28, 29, 38, 43, 44, 47, 49, 51, 52], "embed": [4, 6, 7, 40, 46, 56, 60, 61, 62], "dim": [4, 6, 7, 28, 29, 30, 35, 40, 46, 56, 61], "2048": [4, 6, 7, 26, 28, 29, 30, 40, 56], "hidden": [4, 6, 7, 29, 56, 60], "num": [4, 6, 7, 28, 29, 30, 35, 36, 38, 40, 41, 43, 44, 46, 48, 49, 56, 60, 61, 62, 64, 65], "layer": [4, 6, 7, 28, 29, 30, 35, 40, 44, 46, 56, 58, 60, 61, 62], "vocab": [4, 6, 7, 43], "500": [4, 6, 7, 26, 28, 29, 30, 40, 43, 49, 60, 64, 65], "token": [4, 11, 20, 26, 28, 29, 30, 31, 32, 33, 35, 38, 40, 41, 43, 47, 51, 52, 54, 56, 64, 65], "ngram": [4, 43, 47, 51, 52], "2": [4, 6, 7, 9, 11, 13, 15, 20, 24, 26, 27, 34, 35, 36, 44, 46, 47, 48, 49, 51, 52, 54, 60, 61, 62, 64, 65], "extra": [4, 28, 29, 30, 40, 58, 61, 63], "argument": [4, 7, 15, 35, 36, 44, 58], 
"need": [4, 6, 11, 13, 14, 15, 17, 20, 23, 25, 26, 27, 28, 29, 30, 35, 36, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 56, 58, 60, 61, 62], "given": [4, 9, 11, 12, 13, 15, 20, 25, 26, 28, 29, 30, 38, 40, 41, 43, 46, 47, 48, 49, 61, 62, 64, 65], "specifi": [4, 7, 10, 12, 15, 16, 28, 29, 30, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 60, 61, 62], "neg": [4, 40], "number": [4, 7, 16, 23, 26, 28, 29, 30, 35, 36, 38, 40, 41, 43, 46, 47, 48, 49, 51, 52, 54, 60, 61, 62], "obtain": [4, 7, 38, 40, 41, 43, 47, 51, 52], "shown": [4, 7, 35], "below": [4, 7, 9, 11, 12, 13, 14, 15, 16, 20, 25, 28, 29, 30, 36, 38, 40, 41, 43, 46, 47, 48, 49, 51, 52, 54, 60, 61, 64], "61": [4, 6, 20], "6": [4, 6, 7, 9, 10, 11, 15, 20, 27, 34, 38, 40, 43, 46, 47, 51, 52, 60, 65], "74": [4, 6, 20, 25, 26], "recal": 4, "lowest": [4, 12, 15, 46, 48, 49, 60, 61, 62], "77": [4, 6, 7, 20, 25, 43], "08": [4, 6, 7, 9, 15, 20, 30, 43, 47, 49, 51, 52, 54, 60], "inde": 4, "even": [4, 23, 25, 29], "better": [4, 6], "increas": [4, 6, 38, 40, 41, 43, 46, 48, 49, 60, 61, 62], "8": [4, 6, 7, 9, 10, 15, 20, 25, 26, 28, 29, 30, 35, 38, 40, 43, 44, 46, 47, 48, 49, 54, 60, 61, 62], "45": [4, 6, 15, 20, 25, 28, 30, 38, 40, 43], "38": [4, 6, 20, 25, 28, 38, 40, 43, 51], "23": [4, 6, 9, 10, 11, 15, 20, 25, 28, 29, 30, 35, 38, 40, 41, 43, 51, 52, 54], "section": [5, 8, 9, 10, 18, 20, 21, 25, 26, 31, 32, 33, 34, 38, 43], "langugag": 5, "transduc": [5, 24, 26, 27, 31, 34, 35, 36, 39, 42, 44, 45, 56, 57, 58, 59], "rnn": [5, 6, 7, 24, 29, 40, 46, 48, 60, 61, 62, 66], "avail": [5, 6, 8, 15, 24, 25, 26, 28, 29, 30, 36, 37, 38, 40, 43, 47, 51, 52, 54, 60], "beam": [5, 26, 60], "search": [5, 6, 7, 22, 23, 63], "realli": [5, 38, 41, 43, 46, 48, 49, 60, 61, 62], "valu": [5, 7, 28, 29, 30, 35, 36, 38, 40, 41, 43, 46, 48, 49, 60, 61, 62], "t": [5, 13, 14, 15, 17, 20, 25, 28, 29, 30, 31, 32, 36, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 56, 60, 61, 62, 64, 65], "doe": [5, 15, 17, 20, 28, 29, 30, 38, 40, 43, 54], "modified_beam_search_lm_shallow_fus": [5, 6, 7], "interpol": 5, "also": [5, 6, 7, 11, 13, 14, 15, 21, 22, 25, 26, 27, 28, 29, 30, 31, 33, 35, 38, 40, 41, 43, 46, 48, 49, 54, 56, 58, 60, 61, 62, 64], "known": 5, "bigram": 5, "backoff": 5, "modified_beam_search_lm_rescor": [5, 6], "hypothes": [5, 6], "rnnlm": [5, 6, 56], "re": [5, 6, 10, 38, 41, 43, 44, 46, 48, 49, 58, 60, 61, 62], "rank": [5, 6], "modified_beam_search_lm_rescore_lodr": [5, 6], "lodr": [5, 24, 56], "commonli": [6, 7, 38, 40, 41, 43, 47, 51, 52, 54], "approach": [6, 18, 20, 24], "incorpor": 6, "unlik": 6, "more": [6, 14, 25, 28, 29, 30, 35, 38, 43, 44, 54, 56, 58, 60, 61, 64, 65], "effici": [6, 7, 35, 46, 61, 62], "than": [6, 25, 26, 29, 35, 38, 40, 41, 43, 46, 47, 48, 49, 54, 60, 61, 62], "sinc": [6, 13, 20, 25, 28, 29, 30, 36, 44, 54, 60], "less": [6, 26, 35, 43, 47, 54, 61, 62], "comput": [6, 15, 18, 25, 26, 28, 29, 30, 38, 40, 41, 44, 46, 47, 49, 51, 52, 54, 60, 61, 62], "gpu": [6, 7, 8, 13, 14, 24, 25, 28, 29, 35, 36, 38, 40, 41, 43, 44, 46, 48, 49, 51, 52, 54, 60, 61, 62], "try": [6, 10, 12, 15, 21, 23, 44, 46, 48, 49, 60, 61, 62], "might": [6, 7, 29, 30, 61, 62], "ideal": [6, 7], "mai": [6, 7, 9, 25, 28, 29, 30, 36, 38, 40, 41, 43, 46, 48, 49, 60, 61, 62, 66], "With": [6, 25], "43": [6, 9, 20, 29, 30, 43], "great": 6, "made": [6, 28], "boost": [6, 7], "tabl": [6, 17, 23, 28, 29, 30], "67": [6, 20, 25], "59": [6, 15, 20, 25, 28, 41, 43], "86": [6, 20], "fact": 6, "arpa": [6, 11, 54], "performn": 6, "depend": [6, 14, 15, 17, 25, 38, 43, 63], "kenlm": 6, "kpu": 6, 
"archiv": [6, 56], "zip": 6, "execut": [6, 7, 13, 28, 35, 38, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 56, 60, 61, 62], "9": [6, 9, 20, 25, 28, 29, 30, 38, 40, 41, 43, 46, 47, 48, 49, 51, 54, 60, 61, 62], "57": [6, 20, 25, 29, 43, 47], "slightli": 6, "63": [6, 20, 40], "04": [6, 28, 29, 30, 38, 40, 41, 43, 47, 51, 52], "52": [6, 20, 25, 38, 43], "73": [6, 20], "mention": [6, 58], "earlier": 6, "benchmark": [6, 40], "speed": [6, 28, 38, 40, 41, 43, 46, 48, 49, 60, 61, 62], "132": [6, 20], "95": [6, 20, 39], "177": [6, 25, 26, 29, 30, 40, 41, 43], "96": [6, 20, 25, 35], "210": [6, 51, 52], "262": [6, 7, 15], "62": [6, 7, 20, 25, 43, 47], "65": [6, 7, 20, 25, 28], "352": [6, 7, 43], "58": [6, 7, 10, 20, 25, 43], "488": [6, 7, 28, 29, 30], "400": [6, 9, 39], "610": 6, "870": 6, "156": [6, 15, 20], "203": [6, 15, 26, 43], "255": [6, 29, 30], "160": [6, 15, 20], "263": [6, 9, 15, 25, 29], "singl": [6, 38, 40, 41, 43, 46, 47, 48, 49, 51, 52, 54, 60, 61, 62], "32g": 6, "v100": [6, 38, 40, 41, 43], "vari": 6, "word": [7, 11, 12, 15, 18, 38, 40, 41, 43, 47, 51, 52, 54, 56], "error": [7, 9, 10, 12, 13, 15, 25, 28, 29, 30, 43], "rate": [7, 12, 20, 36, 38, 40, 41, 43, 46, 47, 48, 49, 51, 52, 54, 60, 61, 62], "These": [7, 38, 40, 41, 43, 46, 47, 48, 49, 51, 52, 54, 60, 61, 62], "alreadi": [7, 11, 13, 25, 26, 37], "But": [7, 28, 46, 48, 49, 60, 61, 62], "long": [7, 28, 56, 64, 65], "true": [7, 9, 15, 25, 26, 28, 29, 30, 35, 36, 38, 40, 43, 44, 46, 47, 48, 49, 51, 52, 54, 60, 61, 62], "either": [7, 15, 23, 38, 40, 41, 43, 61, 62], "choos": [7, 23, 25, 44, 46, 48, 49, 60, 61, 62], "three": [7, 15, 28, 29, 30, 33, 38, 40, 58], "associ": 7, "dimens": [7, 35, 46, 56, 61, 62], "obviou": 7, "rel": [7, 37], "reduct": [7, 15, 25, 28, 29, 48], "around": [7, 36], "A": [7, 14, 26, 28, 29, 30, 35, 36, 38, 40, 41, 43, 46, 47, 48, 49, 60, 61, 62], "few": [7, 11, 28, 29, 30, 44], "paramet": [7, 14, 26, 28, 29, 30, 32, 35, 38, 40, 41, 43, 46, 47, 48, 49, 51, 52, 56, 60, 61, 62, 64, 65], "tune": [7, 24, 28, 29, 30, 38, 40, 41, 43, 44, 46, 48, 49, 60, 61, 62, 66], "control": [7, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 56, 60, 61, 62, 64, 65], "too": 7, "small": [7, 35, 36, 37, 40, 51, 52, 54], "fulli": 7, "util": [7, 9, 10, 15, 20, 25, 43], "larg": [7, 13], "domin": 7, "bad": 7, "typic": [7, 35, 38, 40, 41, 43], "activ": [7, 13, 23, 25], "path": [7, 9, 15, 23, 25, 26, 28, 29, 30, 33, 36, 38, 40, 41, 43, 44, 46, 48, 49, 60, 61, 62], "trade": 7, "off": [7, 28], "accuraci": [7, 28, 29, 37, 39], "larger": [7, 29, 38, 40, 41, 43, 46, 48, 49, 60, 61, 62], "slower": 7, "built": [8, 9, 25, 64], "imag": [8, 24], "cpu": [8, 12, 13, 14, 15, 16, 17, 20, 24, 25, 26, 28, 29, 30, 32, 38, 46, 48, 49, 54, 61, 62, 64], "still": [8, 28, 29, 30, 37], "introduct": [8, 24, 57, 66], "tag": [8, 24], "cuda": [8, 10, 15, 17, 20, 24, 26, 28, 29, 30, 38, 40, 41, 43, 46, 47, 48, 49, 51, 52, 60, 61, 62], "enabl": [8, 25, 44], "within": [8, 14, 21, 23, 24, 28, 29], "updat": [8, 28, 29, 30, 35], "host": [9, 26], "hub": [9, 20], "k2fsa": 9, "find": [9, 10, 16, 21, 22, 23, 26, 28, 29, 30, 33, 38, 40, 41, 43, 46, 47, 48, 49, 51, 52, 54, 60, 61, 62], "dockerfil": 9, "tree": [9, 11, 32, 33, 38, 40, 41, 43, 47, 51, 52, 54, 60], "item": [9, 14, 20], "curl": 9, "registri": 9, "v2": [9, 30, 38, 43], "jq": 9, "give": [9, 11, 15, 20, 40], "someth": [9, 38, 40, 41, 43, 46, 48, 49, 54, 60, 61], "torch2": [9, 13, 15], "cuda12": 9, "cuda11": [9, 10, 25], "torch1": [9, 10, 25], "cuda10": 9, "13": [9, 10, 15, 20, 25, 26, 28, 29, 30, 36, 40, 41, 43, 47, 48, 51], 
"releas": [9, 15, 25, 26, 28, 29, 30, 38, 40, 43, 64], "torch": [9, 10, 13, 14, 20, 24, 26, 27, 34, 38, 40, 43], "select": [9, 12, 13, 14, 23, 25, 28, 29, 30, 46, 47, 51, 52, 54, 60, 61, 62], "appropri": [9, 25], "combin": [9, 12, 28, 29, 30], "visit": [9, 22, 23, 46, 48, 49, 60, 61, 62, 64, 65], "pkg": 9, "py3": [9, 10, 25], "v1": [9, 38, 41, 43, 47, 51, 52], "current": [9, 23, 28, 29, 40, 44, 58, 60, 61, 62, 64, 65, 66], "ghcr": 9, "alwai": [9, 25, 26], "sudo": [9, 38, 41], "rm": 9, "bin": [9, 13, 25, 28, 29, 30, 38, 43], "bash": 9, "start": [9, 11, 12, 14, 15, 16, 20, 23, 25, 26, 30, 35, 36, 38, 40, 41, 43, 46, 47, 48, 49, 51, 52, 54, 56, 60, 61, 62, 64, 65], "interfac": 9, "present": [9, 38, 40, 41, 43, 46, 48, 49, 60, 61, 62], "root": [9, 20, 28, 29, 30, 56], "60c947eac59c": 9, "workspac": 9, "export": [9, 10, 11, 12, 13, 14, 16, 24, 25, 37, 38, 40, 41, 43, 44, 47, 51, 52, 54, 63], "pythonpath": [9, 11, 12, 13, 15, 16, 25, 28, 29, 30], "user": [9, 10, 13, 25], "copi": [9, 20, 25, 58], "switch": [9, 25, 28, 29, 30, 38, 43, 49], "opt": 9, "conda": [9, 10], "lib": [9, 10, 15, 25, 30], "site": [9, 10, 15, 25, 30], "packag": [9, 10, 15, 25, 30, 64, 65], "__init__": [9, 10, 15, 25, 26, 28, 29, 30, 38, 40, 43], "line": [9, 10, 11, 28, 29, 30, 46, 56, 61, 62], "modul": [9, 13, 24, 28, 30, 35, 48, 61], "_k2": [9, 10, 25], "determinizeweightpushingtyp": [9, 10], "importerror": [9, 24], "libcuda": 9, "cannot": [9, 24, 28, 29, 30], "share": [9, 24, 25], "object": [9, 24, 25, 38, 40, 41, 46, 54, 60, 61], "No": [9, 13, 17, 24, 28, 29, 30, 54], "stub": 9, "list": [9, 15, 20, 28, 29, 30, 38, 40, 41, 43, 47, 51, 52], "16": [9, 15, 20, 25, 26, 28, 29, 30, 33, 35, 38, 40, 41, 43, 46, 47, 51, 52, 54, 60, 61, 62], "second": [9, 14, 38, 40, 41, 43, 44, 46, 48, 49, 54, 60, 61, 62], "2023": [9, 15, 25, 28, 29, 30, 35, 36, 48, 56, 65], "01": [9, 11, 15, 25, 28, 40, 41, 43, 44, 48], "02": [9, 11, 25, 26, 28, 29, 30, 35, 40, 43, 46, 52, 60, 61, 64], "06": [9, 15, 25, 26, 28, 35, 36, 41, 43, 47, 54], "info": [9, 15, 25, 26, 28, 29, 30, 35, 38, 40, 41, 43, 47, 51, 52, 54], "264": [9, 25, 30], "posixpath": [9, 15, 25, 28, 29, 30, 40, 43], "lang_dir": [9, 15, 25, 40, 43], "lang_phon": [9, 11, 15, 25, 41, 47, 51, 52, 54], "feature_dim": [9, 15, 25, 26, 28, 29, 30, 38, 40, 43, 54], "search_beam": [9, 15, 25, 38, 43, 54], "20": [9, 14, 15, 20, 25, 26, 28, 30, 35, 36, 38, 40, 41, 43, 46, 47, 51, 52, 54, 56, 61], "output_beam": [9, 15, 25, 38, 43, 54], "min_active_st": [9, 15, 25, 38, 43, 54], "30": [9, 10, 15, 20, 25, 28, 29, 30, 38, 40, 41, 43, 44, 46, 48, 49, 54, 60, 61, 62], "max_active_st": [9, 15, 25, 38, 43, 54], "10000": [9, 15, 25, 38, 43, 54], "use_double_scor": [9, 15, 25, 38, 43, 54], "14": [9, 10, 15, 20, 25, 26, 28, 29, 32, 38, 43, 46, 47, 48, 51, 60, 61, 62], "feature_dir": [9, 15, 25, 43], "fbank": [9, 11, 15, 25, 26, 28, 29, 30, 38, 40, 41, 43, 47, 51, 52, 54], "max_dur": [9, 15, 25, 43], "bucketing_sampl": [9, 15, 25, 43], "num_bucket": [9, 15, 25, 43], "concatenate_cut": [9, 15, 25, 43], "duration_factor": [9, 15, 25, 43], "gap": [9, 15, 25, 43], "on_the_fly_feat": [9, 15, 25, 43], "shuffl": [9, 15, 25, 43], "return_cut": [9, 15, 25, 43], "num_work": [9, 15, 25, 43], "env_info": [9, 15, 25, 26, 28, 29, 30, 38, 40, 43], "sha1": [9, 15, 25, 26, 28, 29, 30, 38, 40, 43], "4c05309499a08454997adf500b56dcc629e35ae5": [9, 25], "date": [9, 15, 25, 26, 28, 29, 30, 38, 40, 43], "tue": [9, 25, 28, 43], "jul": [9, 15, 25], "25": [9, 15, 20, 25, 26, 28, 29, 38, 43, 46, 51, 52, 54, 61], "36": [9, 20, 25, 28, 40, 43, 
44], "dev": [9, 10, 15, 25, 26, 28, 29, 30, 35, 36, 38, 40, 41, 43, 46, 47, 48, 49, 51, 52, 54, 60, 61, 62], "7640d663": 9, "branch": [9, 15, 25, 26, 28, 29, 30, 38, 40, 43, 48], "375520d": 9, "fri": [9, 26], "28": [9, 20, 25, 28, 29, 40, 43, 47, 64], "07": [9, 25, 28, 29, 30, 38, 40, 41, 43], "hostnam": [9, 15, 25, 26, 28, 29, 30, 40], "ip": [9, 15, 25, 26, 28, 29, 30, 40], "172": 9, "17": [9, 20, 25, 26, 28, 29, 30, 38, 43, 51, 52, 60], "401": 9, "lexicon": [9, 11, 15, 18, 25, 38, 40, 41, 43, 44, 46, 48, 49, 54, 60, 61, 62], "168": [9, 15, 20, 25, 47], "compil": [9, 15, 25, 28, 29, 38, 40, 43], "linv": [9, 11, 15, 25, 40, 43, 54], "403": [9, 47], "273": [9, 15, 25, 26, 40], "devic": [9, 15, 20, 25, 26, 28, 29, 30, 38, 40, 41, 43, 46, 47, 48, 49, 51, 52, 54, 61, 62], "406": [9, 43], "291": [9, 25], "424": 9, "218": [9, 15, 25, 29], "about": [9, 11, 12, 14, 15, 16, 20, 25, 28, 29, 30, 35, 40, 44, 46, 49, 60, 61, 62], "cut": [9, 15, 25, 43], "425": [9, 29, 43], "252": [9, 25], "504": 9, "204": [9, 25, 30, 43], "batch": [9, 15, 17, 25, 28, 29, 30, 38, 40, 41, 43, 46, 48, 49, 56, 60, 61, 62], "process": [9, 15, 17, 25, 26, 28, 29, 38, 40, 41, 43, 46, 48, 49, 60, 61, 62], "until": [9, 15, 25, 43, 48], "w": [9, 20, 25, 43, 51, 52], "nnpack": 9, "cpp": [9, 28, 32], "53": [9, 15, 20, 25, 30, 38, 46, 47, 52, 60, 61], "could": [9, 28, 29, 30, 35, 36, 37, 38, 41, 56], "reason": [9, 14, 26, 28, 29, 30, 36, 61], "unsupport": 9, "hardwar": 9, "687": 9, "241": [9, 25, 38], "transcript": [9, 15, 18, 25, 38, 39, 40, 41, 43, 46, 47, 51, 52, 60, 61, 62], "store": [9, 11, 15, 25, 43, 56], "recog": [9, 15, 25, 40, 43], "test_set": [9, 15, 25, 54], "688": 9, "564": [9, 15, 25], "240": [9, 15, 25, 38, 54], "ins": [9, 15, 25, 43, 54], "del": [9, 15, 20, 25, 43, 54], "sub": [9, 15, 25, 43, 54], "690": 9, "249": [9, 25, 29], "wrote": [9, 15, 25, 43], "detail": [9, 11, 15, 20, 25, 27, 31, 35, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 56, 58, 60, 61, 62, 64, 65], "stat": [9, 15, 25, 43], "err": [9, 15, 25, 40, 43], "316": [9, 25, 43], "congratul": [9, 13, 20, 25, 28, 29, 30, 38, 41, 43, 47, 51, 52, 54, 64], "finish": [9, 14, 38, 40, 41, 43, 44, 46, 47, 51, 52, 54, 61, 62], "successfulli": [9, 13, 25, 28, 29, 30, 64], "collect": [10, 13, 25, 56], "post": 10, "correspond": [10, 22, 23], "solut": 10, "One": 10, "torchaudio": [10, 13, 18, 20, 24, 58], "cu111": 10, "torchvis": 10, "f": [10, 13, 15, 20, 25, 51, 52, 64], "org": [10, 13, 20, 25, 39, 40, 46, 56, 60, 61, 62], "whl": [10, 13, 25], "torch_stabl": [10, 13, 25], "throw": [10, 28, 29, 30], "while": [10, 16, 25, 28, 29, 30, 35, 38, 40, 41, 43, 44, 46, 48, 49, 56, 60, 61, 62], "That": [10, 11, 14, 15, 16, 17, 28, 29, 44, 46, 60, 61, 62], "cu11": 10, "correct": 10, "traceback": 10, "most": [10, 61, 62], "recent": 10, "last": 10, "yesnoasrdatamodul": 10, "home": [10, 20, 28, 29, 38, 43], "xxx": [10, 20, 26, 28, 29, 30], "next": [10, 13, 14, 23, 25, 28, 29, 30, 43, 44, 46, 47, 48, 49, 56, 60, 61, 62], "gen": [10, 13, 14, 23, 25, 43, 44, 46, 47, 48, 49, 60, 61, 62], "kaldi": [10, 11, 13, 14, 17, 18, 23, 24, 25, 43, 44, 46, 47, 48, 49, 60, 61, 62], "34": [10, 20, 28, 29], "datamodul": 10, "add_eo": 10, "add_so": 10, "get_text": 10, "39": [10, 20, 25, 28, 30, 40, 43, 47, 51], "tensorboard": [10, 16, 25, 38, 40, 41, 43, 46, 47, 48, 49, 51, 52, 54, 60, 61, 62, 64, 65], "summarywrit": 10, "miniconda3": 10, "env": 10, "yyi": 10, "loosevers": 10, "uninstal": 10, "setuptool": [10, 13, 25], "yangyifan": 10, "anaconda3": 10, "dev20230112": 10, "linux": [10, 13, 14, 23, 25, 
27, 28, 29, 30, 31], "x86_64": [10, 25, 28], "egg": 10, "handl": [10, 38, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 60, 61, 62], "except": [10, 26], "anoth": 10, "occur": 10, "pruned_transducer_stateless7_ctc_b": [10, 48], "104": [10, 15, 20, 25], "rais": 10, "anaconda": 10, "maco": [10, 13, 14, 23, 27, 28, 29, 30, 31], "probabl": [10, 40, 46, 48, 60, 61, 62], "variabl": [10, 12, 13, 16, 25, 28, 29, 30, 38, 41, 43, 44, 46, 48, 49, 60, 61, 62], "dyld_library_path": 10, "conda_prefix": 10, "locat": [10, 16, 28], "libpython": 10, "abl": 10, "insid": [10, 33], "codna_prefix": 10, "ld_library_path": 10, "setup": [11, 14, 20, 24, 25, 28, 35, 36, 38, 40, 41, 43, 44, 46, 47, 51, 52, 54, 61, 62, 64, 65], "everyth": [11, 20, 27], "tmp": [11, 12, 13, 15, 16, 25, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 60, 61, 62, 64], "each": [11, 15, 18, 26, 28, 29, 31, 35, 38, 40, 41, 43, 46, 48, 49, 56, 58, 60, 61, 62], "exist": 11, "anyth": [11, 21, 23], "els": [11, 20], "wonder": [11, 15], "url": [11, 38, 40, 41, 43, 46, 48, 49, 54, 60, 61], "varieti": 11, "folder": [11, 25, 26, 38, 40, 41, 43, 46, 47, 48, 49, 51, 52, 54, 60, 61, 62], "wav": [11, 15, 20, 26, 28, 29, 30, 31, 33, 38, 40, 41, 43, 46, 48, 49, 51, 52, 54, 60, 61, 62, 64, 65], "scp": 11, "feat": 11, "put": [11, 13, 25, 28, 29, 48, 61], "l": [11, 20, 25, 28, 29, 30, 40, 51, 52, 54], "waves_yesno": [11, 15, 25], "tar": [11, 25, 64], "gz": [11, 25, 56], "l41": 11, "extract": [11, 25, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 60, 61, 62], "yesno_cuts_test": 11, "jsonl": [11, 26], "yesno_cuts_train": 11, "yesno_feats_test": 11, "lca": 11, "yesno_feats_train": 11, "hlg": [11, 15, 25, 47, 51, 52, 54], "l_disambig": [11, 54], "lexicon_disambig": [11, 20, 54], "manifest": [11, 25, 35, 36, 44], "yesno_recordings_test": 11, "yesno_recordings_train": 11, "yesno_supervisions_test": 11, "yesno_supervisions_train": 11, "18": [11, 20, 25, 28, 29, 30, 38, 40, 41, 43, 46, 47, 51, 52, 60, 61, 62], "thei": [11, 38, 40, 41, 43, 44, 46, 48, 49, 60, 61, 62], "idea": [11, 15, 20, 58], "examin": 11, "relat": [11, 18, 26, 35, 36, 38, 40, 43, 47, 51, 52, 54, 64, 65], "gunzip": 11, "c": [11, 17, 20, 25, 40, 41, 46, 48, 49, 54, 60, 61, 62, 64], "head": [11, 20, 25, 35, 40, 58], "output": [11, 12, 13, 15, 20, 26, 28, 29, 30, 36, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 58, 60, 61, 62, 64], "id": [11, 38, 41, 43, 47, 51, 52], "0_0_0_0_1_1_1_1": 11, "channel": [11, 23, 25, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 60, 61, 62], "sampling_r": 11, "num_sampl": 11, "50800": 11, "35": [11, 20, 25, 26, 28, 29, 30, 40, 43, 60], "channel_id": 11, "0_0_0_1_0_1_1_0": 11, "48880": 11, "0_0_1_0_0_1_1_0": 11, "48160": 11, "audio": [11, 20, 25, 51, 52, 64], "l300": 11, "mean": [11, 14, 15, 28, 29, 30, 38, 40, 41, 43, 46, 47, 48, 49, 51, 52, 54, 58, 60, 61, 62], "field": [11, 39], "per": [11, 40, 46, 61, 62], "recording_id": 11, "NO": [11, 15, 54], "ye": [11, 15, 17, 54], "hebrew": [11, 54], "supervis": [11, 24, 37, 66], "l510": 11, "furthermor": [11, 40], "featur": [11, 17, 25, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 58, 60, 61, 62], "compress": [11, 25], "lilcom": [11, 25], "cutset": [11, 36], "recordingset": 11, "supervisionset": 11, "featureset": 11, "num_fram": [11, 20], "635": 11, "num_featur": 11, "frame_shift": 11, "storage_typ": 11, "lilcom_chunki": 11, "storage_path": 11, "storage_kei": 11, "13000": 11, "3570": 11, "record": [11, 23, 29, 30, 38, 39, 40, 41, 43, 46, 47, 48, 49, 51, 52, 54, 60, 61, 62], "monocut": 11, "611": 11, "16570": 11, "12964": 11, 
"2929": 11, "602": 11, "32463": 11, "12936": 11, "2696": 11, "actual": [11, 38, 40, 41, 43, 46, 48, 49, 60, 61, 62], "separ": [11, 31, 56], "lang": [11, 20, 25, 26, 40, 43, 49], "quit": [12, 14, 16, 37, 38, 40, 41, 43, 46, 48, 49, 56, 60, 61, 62], "cuda_visible_devic": [12, 16, 25, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 60, 61, 62, 64, 65], "usag": [12, 15, 16, 26, 28, 29, 30, 32, 33, 47, 51, 52, 54, 63], "one": [12, 23, 26, 28, 29, 30, 38, 40, 41, 43, 46, 47, 48, 49, 51, 52, 54, 58, 60, 61, 62, 64], "tini": [13, 14], "well": [13, 26, 35, 54, 66], "hundr": 13, "thousand": 13, "virtualenv": [13, 25], "icefall_env": [13, 15], "interpret": 13, "usr": 13, "prefix": [13, 26], "pkg_resourc": 13, "wheel": [13, 25, 28], "remeb": 13, "continu": [13, 15, 20, 28, 29, 30, 31, 38, 40, 41, 43, 46, 48, 49, 54, 60, 61], "caution": [13, 38, 43], "matter": [13, 25, 28], "torchaduio": 13, "from_wheel": [13, 15, 25], "dev20231220": 13, "china": [13, 25, 39], "\u4e2d\u56fd\u56fd\u5185\u7528\u6237": [13, 25], "\u5982\u679c\u8bbf\u95ee\u4e0d\u4e86": [13, 25], "\u8bf7\u4f7f\u7528": [13, 25], "cn": [13, 25], "anytim": 13, "modulenotfounderror": 13, "don": [13, 14, 15, 17, 20, 25, 28, 29, 30, 32, 36, 38, 41, 43, 46, 47, 48, 49, 51, 52, 54, 56, 60, 61, 62, 64, 65], "walk": [14, 20], "recognit": [14, 23, 24, 27, 28, 29, 37, 39, 40, 54, 66], "system": [14, 56], "out": [14, 44, 56], "minut": [14, 56], "sequenti": 14, "part": [14, 15, 23, 25, 38, 40, 41, 43, 46, 47, 48, 49, 51, 52, 54, 58, 60, 61, 62, 64, 65], "window": [14, 23, 27, 28, 29, 30, 31], "commun": 14, "appreci": 14, "virtual": 14, "curiou": 14, "quick": 14, "state_dict": [14, 24, 34, 38, 40, 41, 43, 47, 51, 52, 54], "jit": [14, 24, 27, 34, 43], "onnx": [14, 24, 26, 34, 35, 63, 65], "torchscript": [15, 27, 32, 33, 34], "trace": [15, 24, 27, 32, 34], "explain": 15, "kind": [15, 43, 46, 48, 49, 60, 61, 62], "produc": [15, 27, 46, 48, 49, 60, 61, 62], "03": [15, 25, 26, 29, 35, 40, 43, 51, 52, 60, 64], "912": [15, 26], "76": [15, 20, 25, 54], "lr": [15, 25, 35, 36, 40, 60], "weight_decai": [15, 25], "1e": [15, 25], "start_epoch": [15, 25], "best_train_loss": [15, 25, 26, 28, 29, 30], "inf": [15, 25, 26, 28, 29, 30], "best_valid_loss": [15, 25, 26, 28, 29, 30], "best_train_epoch": [15, 25, 26, 28, 29, 30], "best_valid_epoch": [15, 25, 26, 29, 30], "batch_idx_train": [15, 25, 26, 28, 29, 30], "log_interv": [15, 25, 26, 28, 29, 30], "reset_interv": [15, 25, 26, 28, 29, 30], "valid_interv": [15, 25, 26, 28, 29, 30], "beam_siz": [15, 25, 26, 40], "sum": [15, 20, 25], "913": 15, "950": 15, "971": [15, 52], "106": [15, 20, 25, 29, 43], "Not": 15, "974": 15, "111": [15, 20, 25, 43], "kei": [15, 28, 29, 30, 43], "bia": 15, "running_mean": 15, "running_var": 15, "num_batches_track": 15, "output_linear": 15, "48": [15, 20, 25, 28, 29, 35, 38, 40], "089": 15, "090": 15, "ad79f1c699c684de9785ed6ca5edb805a41f78c3": 15, "wed": [15, 25, 28, 38, 40, 43], "26": [15, 20, 25, 28, 29, 30, 40, 43, 52], "09": [15, 26, 29, 38, 40, 41, 43, 60], "aa073f6": 15, "none": [15, 20, 25, 38, 43], "9a47c08": 15, "mon": [15, 29, 30], "aug": [15, 44], "50": [15, 20, 25, 26, 28, 29, 30, 43, 46, 51, 60, 61, 62], "privat": 15, "fangjun": [15, 19, 25, 26, 28, 29, 30, 40, 43], "macbook": 15, "pro": [15, 38, 43], "127": [15, 20, 25, 28, 29, 54], "092": 15, "103": [15, 20], "272": 15, "109": [15, 20, 25, 38, 43], "112": [15, 20, 28, 29, 30], "115": [15, 20, 28, 29, 38, 43], "253": 15, "386": 15, "556": 15, "557": 15, "558": 15, "248": [15, 40], "559": 15, "315": [15, 28, 38, 40, 41, 43, 47], 
"ident": [15, 20], "kaldifeat": 15, "csukuangfj": [15, 25, 26, 28, 29, 31, 38, 40, 41, 43, 47, 51, 52, 54, 60, 64], "dev20231221": 15, "0_0_0_1_0_0_0_1": [15, 54], "0_0_1_0_0_0_1_0": [15, 54], "19": [15, 20, 26, 28, 29, 30, 35, 36, 38, 43, 47, 51, 52], "208": [15, 43], "136": [15, 20, 43], "num_class": [15, 38, 43, 54], "sample_r": [15, 20, 26, 38, 40, 43, 54], "words_fil": [15, 38, 43, 54], "sound_fil": [15, 26, 38, 40, 43, 54], "142": [15, 20, 28, 38, 41, 43], "144": [15, 20, 43], "212": 15, "213": [15, 54], "construct": [15, 20, 26, 28, 29, 30, 38, 40, 41, 43, 47, 51, 52, 54], "170": [15, 47], "sound": [15, 26, 28, 29, 30, 33, 34, 38, 40, 41, 43, 47, 51, 52, 54], "224": 15, "176": [15, 28, 40, 43], "304": [15, 29], "214": [15, 40, 43], "47": [15, 20, 25, 28, 29, 30, 36, 38, 43], "44": [15, 20, 25, 28, 29, 35, 43, 51, 52], "666": 15, "667": 15, "670": 15, "677": [15, 28], "100": [15, 20, 25, 38, 40, 41, 43, 44, 46, 48, 49, 60, 61, 62], "843": 15, "cpu_jit": [15, 32, 38, 43, 46, 48, 49, 61, 62], "confus": [15, 32], "move": [15, 32, 46, 48, 49, 61, 62], "map_loc": 15, "resid": 15, "default": [15, 28, 29, 30, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 60, 61, 62], "jit_pretrain": [15, 33, 48, 49, 60], "nn": [15, 40, 46, 48, 49, 60, 61, 62], "56": [15, 20, 25, 28, 29, 43, 51], "00": [15, 25, 28, 38, 40, 41, 43, 47, 51, 52, 54], "603": 15, "121": [15, 20, 47], "nn_model": [15, 38, 43], "129": [15, 20, 41], "640": [15, 25, 30], "134": [15, 20, 38], "641": 15, "138": [15, 20, 38, 40], "148": [15, 20, 35], "642": 15, "154": [15, 20, 41], "727": 15, "190": [15, 47], "192": [15, 30, 35, 43], "export_onnx": 15, "onnxruntim": [15, 31], "888": [15, 38], "83": [15, 20, 43, 47], "892": 15, "diagnost": 15, "verbos": 15, "warn": 15, "21": [15, 20, 25, 26, 28, 35, 38, 40, 43, 51, 52], "047": [15, 40], "meta_data": 15, "model_typ": 15, "model_author": 15, "comment": 15, "non": [15, 24, 43, 58, 61, 66], "vocab_s": [15, 26, 28, 29, 30, 40], "049": 15, "140": [15, 20, 25, 41], "int8": [15, 27, 34, 65], "quantiz": [15, 27, 34, 44], "075": 15, "onnx_quant": 15, "538": [15, 43], "tensor": [15, 25, 29, 30, 38, 40, 41, 43, 46, 54, 60, 61], "transpose_1_output_0": 15, "081": 15, "151": [15, 20, 28], "float32": [15, 28, 29, 30], "onnx_pretrain": [15, 31], "260": [15, 30, 43], "166": [15, 20], "171": [15, 25, 41, 43, 51, 52], "173": 15, "267": [15, 29, 40, 51, 52], "270": 15, "180": [15, 29, 38, 43], "279": [15, 43], "196": 15, "318": [15, 28, 29], "232": 15, "234": [15, 43], "deploi": [15, 31, 38, 43], "sherpa": [15, 23, 27, 32, 33, 34, 60, 63], "framework": [15, 23, 46, 61], "_": [15, 20, 44], "ncnn": [15, 24, 34], "forc": [17, 24], "align": [17, 24, 63], "instead": [17, 30, 40, 61], "support": [17, 20, 25, 27, 28, 29, 30, 38, 40, 43, 46, 48, 49, 58, 60, 61, 62, 64, 65], "api": [17, 18, 20], "ctc": [18, 20, 39, 42, 45, 49, 50, 53], "loss": [18, 20, 25, 28, 29, 38, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 60, 61, 62], "log_prob": [18, 46, 61, 62], "token2id": 18, "id2token": 18, "word2id": 18, "id2word": 18, "convert": [18, 25, 28, 29, 30, 43], "graph": [18, 38, 41, 43, 46, 47, 51, 52, 61, 62], "segment": [18, 25], "summari": 18, "todo": 19, "through": [20, 39], "sure": [20, 28, 29, 30], "NOT": [20, 38, 40, 43, 54], "wave": [20, 26, 28, 29, 30, 38, 43], "speech_fil": 20, "download_asset": 20, "asset": 20, "lab41": 20, "sri": 20, "voic": 20, "src": [20, 28, 30], "sp0307": 20, "ch127535": 20, "sg0042": 20, "waveform": 20, "sr": 20, "had": [20, 26, 43, 47], "curios": 20, "besid": 20, "me": 20, "moment": [20, 23], 
"split": [20, 44], "shape": [20, 25, 30], "assert": [20, 30], "ndim": 20, "16000": [20, 26, 38, 40, 41, 43, 47, 48, 51, 52], "cach": [20, 25, 30], "filenam": [20, 25, 28, 29, 30, 31, 32, 33, 48, 49, 60, 62, 64, 65], "content": [20, 28, 29, 30], "element": [20, 30], "bundl": [20, 25], "pipelin": 20, "mms_fa": 20, "is_avail": 20, "get_model": 20, "with_star": 20, "inference_mod": 20, "emiss": 20, "size": [20, 25, 26, 28, 29, 30, 35, 36, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 56, 60, 61, 62, 64, 65], "169": [20, 43, 51, 52], "get_dict": 20, "star": [20, 25, 28, 29, 30], "ep": 20, "dict": [20, 26, 30], "enumer": [20, 25], "prepare_lang": 20, "add_disambig_symbol": 20, "max_disambig_id": 20, "encod": [20, 27, 31, 33, 34, 35, 38, 40, 41, 43, 46, 47, 48, 54, 58, 60, 61, 62], "utf": 20, "join": 20, "k": [20, 30, 46, 51, 52, 60, 61, 62], "rang": 20, "o": [20, 25], "b": [20, 40, 43, 51, 52], "d": [20, 51, 52, 56], "h": [20, 25], "15": [20, 25, 26, 28, 29, 30, 35, 36, 40, 41, 43, 51, 54, 56], "v": [20, 28, 29, 30, 43, 51, 52], "j": [20, 28, 29, 38, 43], "z": [20, 51, 52], "q": 20, "27": [20, 25, 28, 29, 30, 35, 36, 38, 40, 47, 52], "charact": [20, 40], "unit": [20, 40], "prepare_lang_fst": 20, "hl": 20, "rw": [20, 28, 29, 30], "13k": 20, "jun": 20, "7k": 20, "kaldi_decod": 20, "decodablectc": 20, "fasterdecod": 20, "fasterdecoderopt": 20, "kaldifst": 20, "def": 20, "force_align": 20, "stdvectorfst": 20, "contigu": 20, "numpi": [20, 25], "decoder_opt": 20, "max_act": 20, "3000": [20, 26, 28, 29, 30], "reached_fin": 20, "return": 20, "ok": 20, "best_path": 20, "get_best_path": 20, "isymbols_out": 20, "osymbols_out": 20, "total_weight": 20, "get_linear_symbol_sequ": 20, "linear": [20, 28, 29, 40], "sequenc": [20, 61, 62], "increment": [20, 28, 29, 30], "main": [20, 25, 38, 43, 58], "ctc_forced_alignment_api_tutori": 20, "frame": [20, 25, 35, 40, 46, 48, 61, 62], "eas": [20, 28, 29, 30], "31": [20, 28, 29, 30, 35, 43], "33": [20, 25, 28, 29, 38, 39, 40, 43, 51], "37": [20, 29, 38, 40, 43, 51], "40": [20, 25, 28, 29, 30, 41, 43, 47, 51, 52], "41": [20, 25, 28, 30, 38, 40, 51, 54], "46": [20, 25, 29, 38, 43], "49": [20, 25, 28, 29, 43, 52, 54], "51": [20, 25, 28, 38, 43, 54], "54": [20, 25, 29, 30, 43, 47, 51, 52], "55": [20, 25, 28, 41, 43, 51], "60": 20, "64": [20, 25, 26, 28, 35, 40, 61], "66": [20, 25, 29, 36], "68": [20, 25, 43], "69": [20, 30], "70": [20, 25], "71": [20, 25, 43, 47], "72": [20, 40, 43], "75": [20, 25, 28], "78": 20, "79": [20, 25], "80": [20, 26, 28, 29, 30, 38, 40, 43], "81": 20, "82": 20, "84": [20, 29, 38], "85": 20, "87": [20, 25, 28], "88": [20, 28, 40], "89": [20, 25, 38], "90": [20, 25, 28], "91": [20, 25], "92": [20, 25, 43], "94": 20, "97": [20, 25, 28, 38], "98": [20, 38], "101": [20, 29], "102": [20, 30, 38], "105": [20, 43], "107": [20, 29, 47], "108": 20, "110": [20, 43], "113": [20, 40, 43], "114": 20, "116": 20, "117": [20, 43], "118": [20, 25, 43], "119": [20, 54], "120": 20, "122": [20, 43], "123": 20, "124": [20, 25, 38, 43], "125": [20, 43, 54], "126": [20, 43], "128": [20, 35, 43], "130": 20, "131": [20, 38, 43], "133": [20, 30], "135": [20, 43, 54], "137": 20, "139": [20, 54], "141": [20, 28], "143": [20, 54], "145": 20, "146": [20, 25], "147": [20, 29, 30], "149": [20, 25, 28, 43], "150": [20, 38, 43], "152": 20, "153": [20, 43, 54], "155": 20, "157": [20, 25], "158": [20, 29], "159": [20, 29, 43, 54], "161": [20, 41, 43], "162": [20, 43], "163": [20, 40, 43], "164": 20, "165": [20, 38, 43], "167": [20, 25], "merg": 20, "merge_token": 20, "token_span": 
20, "span": 20, "end": [20, 40, 46, 48, 49, 54, 60, 61, 62, 64, 65], "unflatten": 20, "list_": 20, "length": [20, 28, 30, 40, 56, 61, 62], "ret": 20, "append": 20, "word_span": 20, "tokenspan": 20, "preview_word": 20, "x0": 20, "int": [20, 38, 43], "x1": 20, "3f": 20, "sec": 20, "ipython": 20, "displai": [20, 38, 40, 41, 43], "along": 20, "stamp": [20, 40], "644": 20, "664": 20, "704": [20, 25, 38, 51], "845": 20, "885": 20, "026": [20, 30], "086": 20, "790": 20, "871": 20, "314": [20, 25], "334": 20, "414": 20, "495": [20, 25], "575": 20, "595": [20, 29], "756": 20, "837": 20, "repost": 20, "whole": [20, 35, 36, 43, 47, 51, 52, 61, 62], "youtub": [21, 24, 43, 44, 46, 47, 48, 49, 60, 61, 62], "video": [21, 24, 43, 44, 46, 47, 48, 49, 60, 61, 62], "upload": [22, 23, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 60, 61, 62], "specif": [22, 31, 40], "aishel": [22, 24, 38, 40, 41, 42, 66], "wenetspeech": [22, 32], "ipad": 23, "phone": 23, "screenshot": [23, 38, 40, 41, 43, 44, 46, 54, 60, 61], "chines": [23, 39, 40], "english": [23, 36, 54, 60], "greedi": 23, "click": [23, 25, 38, 40, 41, 43, 46, 48, 49, 54, 60, 61], "button": 23, "submit": 23, "wait": 23, "bottom": [23, 46, 48, 49, 60, 61, 62], "subscrib": [23, 25, 43, 44, 46, 47, 48, 49, 60, 61, 62], "nadira": [23, 25, 43, 44, 46, 47, 48, 49, 60, 61, 62], "povei": [23, 25, 43, 44, 46, 47, 48, 49, 60, 61, 62], "www": [23, 25, 39, 43, 44, 46, 47, 48, 49, 56, 60, 61, 62], "uc_vaumpkminz1pnkfxan9mw": [23, 25, 43, 44, 46, 47, 48, 49, 60, 61, 62], "dummi": [24, 43], "toolkit": 24, "cudnn": 24, "docker": [24, 25], "frequent": 24, "ask": [24, 64], "question": 24, "faq": 24, "oserror": 24, "libtorch_hip": 24, "attributeerror": 24, "distutil": 24, "attribut": [24, 30, 43], "libpython3": 24, "timit": [24, 42, 51, 52, 66], "tt": [24, 64, 65, 66], "vit": [24, 63, 66], "ljspeech": [24, 63, 66], "vctk": [24, 63, 66], "fine": [24, 44, 66], "finetun": [24, 37, 66], "zipform": [24, 27, 31, 34, 37, 42, 45, 56, 57, 59, 66], "adapt": [24, 37, 66], "contribut": 24, "guid": 25, "suggest": [25, 36, 46, 48, 49, 60, 61, 62], "strongli": 25, "point": [25, 26, 38, 41, 43, 44, 46, 48, 49, 60, 61, 62], "sever": [25, 26, 35, 36, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 58, 60, 61, 62], "just": [25, 28, 29, 30, 56, 58], "kuangfangjun": [25, 28, 29, 30], "cpython3": 25, "final": [25, 26, 28, 29, 43, 47], "9422m": 25, "creator": 25, "cpython3posix": 25, "dest": 25, "fj": [25, 26, 28, 29, 30, 40, 43], "clear": 25, "no_vcs_ignor": 25, "global": 25, "seeder": 25, "fromappdata": 25, "app_data_dir": 25, "ad": [25, 28, 29, 30, 38, 40, 41, 43, 46, 48, 49, 54, 58, 60, 61, 62], "seed": 25, "bashactiv": 25, "cshellactiv": 25, "fishactiv": 25, "nushellactiv": 25, "powershellactiv": 25, "pythonactiv": 25, "determin": 25, "nvidia": [25, 38, 40, 41, 43], "smi": 25, "510": 25, "driver": 25, "greater": 25, "our": [25, 28, 29, 30, 32, 33, 43, 44, 46, 58, 61, 62], "case": [25, 26, 28, 29, 30, 37, 46, 48, 49, 60, 61, 62], "verifi": 25, "nvcc": 25, "copyright": 25, "2005": 25, "2019": 25, "corpor": 25, "wed_oct_23_19": 25, "38_pdt_2019": 25, "v10": 25, "cu116": 25, "compat": 25, "stabl": 25, "matrix": 25, "2bcu116": 25, "cp38": 25, "linux_x86_64": 25, "1983": 25, "mb": [25, 28, 29, 30], "________________________________________": 25, "gb": [25, 40], "764": 25, "kb": [25, 28, 29, 30, 51, 52], "eta": 25, "satisfi": 25, "extens": 25, "__version__": 25, "dev20230725": 25, "pypi": 25, "tuna": 25, "tsinghua": 25, "edu": 25, "resolv": 25, "ubuntu": [25, 28, 29, 30], "2bcuda11": 25, 
"manylinux_2_17_x86_64": 25, "manylinux2014_x86_64": 25, "graphviz": 25, "de": [25, 26, 28, 29, 30, 40], "5e": 25, "fcbb22c68208d39edff467809d06c9d81d7d27426460ebc598e55130c1aa": 25, "cento": 25, "2009": 25, "core": 25, "cmake": [25, 28, 29, 38, 43], "gcc": 25, "cmake_cuda_flag": 25, "wno": 25, "deprec": [25, 40], "lineinfo": 25, "expt": 25, "extend": 25, "lambda": 25, "use_fast_math": 25, "xptxa": 25, "gencod": 25, "arch": 25, "compute_35": 25, "sm_35": 25, "compute_50": 25, "sm_50": 25, "compute_60": 25, "sm_60": 25, "compute_61": 25, "sm_61": 25, "compute_70": 25, "sm_70": 25, "compute_75": 25, "sm_75": 25, "compute_80": 25, "sm_80": 25, "compute_86": 25, "sm_86": 25, "donnx_namespac": 25, "onnx_c2": 25, "compute_52": 25, "sm_52": 25, "xcudaf": 25, "diag_suppress": 25, "cc_clobber_ignor": 25, "integer_sign_chang": 25, "useless_using_declar": 25, "set_but_not_us": 25, "field_without_dll_interfac": 25, "base_class_has_different_dll_interfac": 25, "dll_interface_conflict_none_assum": 25, "dll_interface_conflict_dllexport_assum": 25, "implicit_return_from_non_void_funct": 25, "unsigned_compare_with_zero": 25, "declared_but_not_referenc": 25, "bad_friend_decl": 25, "relax": 25, "constexpr": 25, "d_glibcxx_use_cxx11_abi": 25, "option": [25, 27, 31, 34, 40, 44, 47, 51, 52, 54], "wall": 25, "strict": [25, 30, 39], "overflow": 25, "unknown": 25, "pragma": 25, "cmake_cxx_flag": 25, "unus": 25, "nvtx": 25, "disabl": [25, 26, 28, 29], "debug": 25, "sync": 25, "kernel": [25, 28, 30, 35, 40], "memori": [25, 28, 35, 38, 40, 43, 58], "alloc": 25, "214748364800": 25, "byte": [25, 28, 29, 30], "200": [25, 26, 28, 29, 30, 38, 43, 44, 51, 52, 54], "abort": 25, "__file__": 25, "cpython": [25, 28], "gnu": [25, 28], "req": 25, "vq12fd5i": 25, "filter": 25, "quiet": [25, 39], "7640d663469b22cd0b36f3246ee9b849cd25e3b7": 25, "metadata": [25, 51, 52], "pyproject": 25, "toml": 25, "cytoolz": 25, "3b": 25, "a7828d575aa17fb7acaf1ced49a3655aa36dad7e16eb7e6a2e4df0dda76f": 25, "pyyaml": 25, "c8": 25, "6b": 25, "6600ac24725c7388255b2f5add93f91e58a5d7efaf4af244fdbcc11a541b": 25, "ma": 25, "nylinux_2_17_x86_64": 25, "736": 25, "dataclass": 25, "2f": 25, "1095cdc2868052dd1e64520f7c0d5c8c550ad297e944e641dbf1ffbb9a5d": 25, "dev0": 25, "7640d66": 25, "a8": 25, "df0a69c52bd085ca1ad4e5c4c1a5c680e25f9477d8e49316c4ff1e5084a4": 25, "linux_2_17_x86_64": 25, "tqdm": 25, "e6": 25, "a2cff6306177ae6bc73bc0665065de51dfb3b9db7373e122e2735faf0d97": 25, "audioread": 25, "5d": 25, "cb": 25, "82a002441902dccbe427406785db07af10182245ee639ea9f4d92907c923": 25, "377": 25, "tabul": 25, "4a5f08c96eb108af5cb50b41f76142f0afa346dfa99d5296fe7202a11854": 25, "1a": 25, "e63223f8116931d365993d4a6b7ef653a4d920b41d03de7c59499962821f": 25, "ab": [25, 46, 60, 61, 62], "c3": 25, "57f0601a2d4fe15de7a553c00adbc901425661bf048f2a22dfc500caf121": 25, "intervaltre": 25, "fb": 25, "396d568039d21344639db96d940d40eb62befe704ef849b27949ded5c3bb": 25, "soundfil": 25, "bd": 25, "0602167a213d9184fc688b1086dc6d374b7ae8c33eccf169f9b50ce6568c": 25, "py2": 25, "toolz": 25, "7f": 25, "5c": 25, "922a3508f5bda2892be3df86c74f9cf1e01217c2b1f8a0ac4841d903e3e9": 25, "sortedcontain": 25, "9cb0e58b2deb7f82b84065f37f3bffeb12413f947f9388e4cac22c4621c": 25, "cffi": 25, "b7": 25, "8b": 25, "06f30caa03b5b3ac006de4f93478dbd0239e2a16566d81a106c322dc4f79": 25, "442": 25, "pycpars": 25, "d5": 25, "5f610ebe421e85889f2e55e33b7f9a6795bd982198517d912eb1c76e1a53": 25, "687627": 25, "sha256": 25, "cbf0a4d2d0b639b33b91637a4175bc251d6a021a069644ecb1a9f2b3a83d072a": 25, "ephem": 25, "wwtk90_m": 25, 
"7a": 25, "8e": 25, "a0bf241336e2e3cb573e1e21e5600952d49f5162454f2e612f": 25, "23704": 25, "5e2d3537c96ce9cf0f645a654c671163707bf8cb8d9e358d0e2b0939a85ff4c2": 25, "9c": 25, "f19ae5a03f8862d9f0776b0c0570f1fdd60a119d90954e3f39": 25, "26098": 25, "2604170976cfffe0d2f678cb1a6e5b525f561cd50babe53d631a186734fec9f9": 25, "f3": 25, "ed": 25, "2b": 25, "c179ebfad4e15452d6baef59737f27beb9bfb442e0620f7271": 25, "remot": 25, "12942": 25, "count": 25, "total": [25, 29, 30, 35, 38, 40, 41, 43, 44, 46, 47, 54, 60, 61], "delta": 25, "reus": 25, "pack": [25, 56, 61, 62], "12875": 25, "receiv": 25, "mib": 25, "8835": 25, "dl_dir": [25, 38, 41, 43, 44, 46, 48, 49, 60, 61, 62], "___________________________________________________": 25, "70m": 25, "1mb": 25, "718": 25, "compute_fbank_yesno": 25, "_______________________________________________________________________________": 25, "82it": 25, "778": 25, "______________________________________________________________________________": 25, "256": [25, 30, 35, 51, 52], "92it": 25, "project": 25, "kaldilm": 25, "csrc": [25, 43], "arpa_file_pars": 25, "cc": 25, "void": 25, "arpafilepars": 25, "std": 25, "istream": 25, "275": [25, 38], "compile_hlg": 25, "276": 25, "309": 25, "ctc_topo": 25, "max_token_id": 25, "310": 25, "intersect": [25, 46, 61, 62], "323": 25, "lg": [25, 46, 49, 61, 62], "connect": [25, 26, 35, 43, 46, 47, 60, 61, 62], "class": [25, 43], "341": 25, "rag": 25, "raggedtensor": 25, "remov": [25, 38, 40, 41, 43, 47, 51, 52], "disambigu": 25, "354": 25, "remove_epsilon": 25, "445": 25, "arc": 25, "compos": 25, "446": 25, "447": 25, "fault": 25, "dump": 25, "protocol_buffers_python_implement": 25, "674": 25, "interest": [25, 44, 46, 48, 49, 60, 61, 62], "936": 25, "481": 25, "482": 25, "world_siz": [25, 44], "master_port": 25, "12354": 25, "num_epoch": 25, "3fb0a43": 25, "thu": [25, 26, 28, 29, 30, 40, 43, 47], "05": [25, 26, 28, 29, 35, 36, 38, 40, 41, 43, 52, 56, 65], "74279": [25, 26, 28, 29, 30, 40], "1220091118": 25, "57c4d55446": 25, "sph26": 25, "941": 25, "949": 25, "965": [25, 38], "244": 25, "967": 25, "199": [25, 43, 47], "singlecutsampl": 25, "205": [25, 43], "968": 25, "565": [25, 43], "422": 25, "065": 25, "over": [25, 38, 40, 41, 43, 46, 48, 49, 60, 61, 62], "2436": 25, "tot_loss": 25, "681": [25, 28], "4561": 25, "2828": 25, "7076": 25, "22192": 25, "444": 25, "9002": 25, "18067": 25, "011": 25, "2555": 25, "2695": 25, "484": 25, "34971": 25, "331": [25, 28, 29, 43, 47], "4688": 25, "368": 25, "633": 25, "2532": 25, "242": [25, 38, 43], "1139": 25, "1592": 25, "522": [25, 43], "1627": 25, "209": [25, 47], "07055": 25, "1175": 25, "07091": 25, "847": 25, "07731": 25, "427": [25, 29, 43], "04391": 25, "05341": 25, "884": 25, "04384": 25, "387": [25, 52], "03458": 25, "04616": 25, "707": [25, 38, 43], "03379": 25, "758": [25, 43], "433": [25, 43], "01054": 25, "980": [25, 43], "009014": 25, "009974": 25, "489": [25, 38], "01085": 25, "258": [25, 51, 52], "01172": 25, "01055": 25, "621": [25, 54], "01074": 25, "699": 25, "866": 25, "01044": 25, "844": 25, "008942": 25, "221": [25, 43], "01082": 25, "970": [25, 43], "01169": 25, "247": 25, "01073": 25, "326": [25, 29], "555": 25, "840": 25, "841": 25, "855": 25, "868": 25, "882": 25, "883": 25, "701": 25, "702": [25, 43], "fun": [25, 28, 29], "variou": [25, 31, 34, 66], "period": [26, 28], "disk": 26, "optim": [26, 38, 40, 41, 43, 46, 47, 48, 49, 51, 52, 54, 60, 61, 62], "resum": [26, 35, 36, 38, 40, 41, 43, 46, 47, 48, 49, 51, 52, 54, 60, 61, 62], "strip": 26, "reduc": [26, 38, 40, 41, 43, 
46, 47, 48, 49, 51, 52, 54, 60, 61, 62], "pruned_transducer_stateless3": [26, 32, 58], "almost": [26, 46, 58, 61, 62], "stateless3": [26, 28], "repo": [26, 31], "those": 26, "iter": [26, 28, 29, 30, 33, 46, 48, 49, 60, 61, 62], "1224000": 26, "greedy_search": [26, 35, 36, 40, 46, 48, 60, 61, 62], "test_wav": [26, 28, 29, 30, 31, 38, 40, 41, 43, 47, 51, 52, 54], "1089": [26, 28, 29, 30, 31, 43, 47], "134686": [26, 28, 29, 30, 31, 43, 47], "0001": [26, 28, 29, 30, 31, 43, 47], "1221": [26, 28, 29, 43, 47], "135766": [26, 28, 29, 43, 47], "0002": [26, 28, 29, 43, 47], "multipl": [26, 38, 40, 41, 43, 47, 51, 52, 54], "Its": [26, 28, 29, 30, 43], "233": [26, 28, 29], "265": 26, "subsampling_factor": [26, 29, 30, 38, 40, 43], "encoder_dim": [26, 28, 29, 30], "512": [26, 28, 29, 30, 35, 38, 40, 43], "nhead": [26, 28, 30, 38, 40, 43, 46, 61], "dim_feedforward": [26, 28, 29, 40], "num_encoder_lay": [26, 28, 29, 30, 40], "decoder_dim": [26, 28, 29, 30], "joiner_dim": [26, 28, 29, 30], "model_warm_step": [26, 28, 29], "4810e00d8738f1a21278b0156a42ff396a2d40ac": 26, "oct": [26, 43], "miss": [26, 28, 29, 30, 40, 43], "cu102": [26, 28, 29, 30], "1013": 26, "c39cba5": 26, "dirti": [26, 28, 29, 38, 43], "ceph": [26, 38, 40, 43], "0324160024": 26, "65bfd8b584": 26, "jjlbn": 26, "bpe_model": [26, 28, 29, 30, 43], "max_context": 26, "max_stat": 26, "context_s": [26, 28, 29, 30, 40], "max_sym_per_fram": [26, 40], "simulate_stream": 26, "decode_chunk_s": 26, "left_context": 26, "dynamic_chunk_train": 26, "causal_convolut": 26, "short_chunk_s": [26, 30, 61, 62], "num_left_chunk": [26, 30], "blank_id": [26, 28, 29, 30, 40], "unk_id": 26, "271": [26, 29], "612": 26, "458": 26, "giga": [26, 29, 60], "623": 26, "277": 26, "78648040": 26, "951": [26, 43], "285": [26, 40, 43], "952": 26, "295": [26, 38, 40, 41, 43], "957": 26, "301": [26, 43], "700": 26, "329": [26, 29, 43], "388": 26, "earli": [26, 28, 29, 30, 43, 47], "nightfal": [26, 28, 29, 30, 43, 47], "THE": [26, 28, 29, 30, 43, 47], "yellow": [26, 28, 29, 30, 43, 47], "lamp": [26, 28, 29, 30, 43, 47], "light": [26, 28, 29, 30, 43, 47], "AND": [26, 28, 29, 30, 43, 47], "THERE": [26, 28, 29, 30, 43, 47], "squalid": [26, 28, 29, 30, 43, 47], "quarter": [26, 28, 29, 30, 43, 47], "OF": [26, 28, 29, 30, 43, 47], "brothel": [26, 28, 29, 30, 43, 47], "god": [26, 43, 47], "AS": [26, 43, 47], "direct": [26, 43, 47], "consequ": [26, 43, 47], "sin": [26, 43, 47], "man": [26, 43, 47], "punish": [26, 43, 47], "her": [26, 43, 47], "love": [26, 43, 47], "child": [26, 43, 47], "whose": [26, 40, 43, 47], "ON": [26, 28, 43, 47], "THAT": [26, 43, 47], "dishonor": [26, 43, 47], "bosom": [26, 43, 47], "TO": [26, 43, 47], "parent": [26, 43, 47], "forev": [26, 43, 47], "WITH": [26, 43, 47], "race": [26, 43, 47], "descent": [26, 43, 47], "mortal": [26, 43, 47], "BE": [26, 43, 47], "bless": [26, 43, 47], "soul": [26, 43, 47], "IN": [26, 43, 47], "heaven": [26, 43, 47], "yet": [26, 28, 29, 43, 47], "THESE": [26, 43, 47], "thought": [26, 43, 47], "affect": [26, 43, 47], "hester": [26, 43, 47], "prynn": [26, 43, 47], "hope": [26, 39, 43, 47], "apprehens": [26, 43, 47], "390": 26, "down": [26, 38, 43, 46, 48, 49, 60, 61, 62], "reproduc": [26, 43], "9999": [26, 48, 49, 60], "symlink": 26, "pass": [26, 30, 38, 40, 41, 43, 46, 48, 49, 58, 60, 61, 62], "convemform": [27, 34, 58], "platform": [27, 31], "android": [27, 28, 29, 30, 31, 64], "raspberri": [27, 31], "pi": [27, 31], "\u7231\u82af\u6d3e": 27, "maix": 27, "iii": 27, "axera": 27, "rv1126": 27, "static": 27, "binari": [27, 28, 29, 30, 
38, 40, 41, 43, 46, 54, 60, 61, 64], "pnnx": [27, 34], "conv": [28, 29], "emform": [28, 29, 32], "stateless2": [28, 29, 60], "pretrained_model": [28, 29, 30], "online_transduc": 28, "jit_xxx": [28, 29, 30], "anywher": [28, 29], "submodul": 28, "recurs": 28, "init": 28, "dcmake_build_typ": [28, 38, 43], "dncnn_python": 28, "dncnn_build_benchmark": 28, "dncnn_build_exampl": 28, "dncnn_build_tool": 28, "j4": 28, "pwd": 28, "compon": [28, 58], "ncnn2int8": [28, 29], "am": 28, "sai": [28, 29, 30, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 56, 60, 61, 62], "later": [28, 29, 30, 38, 41, 43, 46, 47, 48, 49, 51, 52, 60, 61, 62], "termin": 28, "tencent": [28, 29], "modif": [28, 40], "offici": 28, "synchron": 28, "renam": [28, 29, 30], "conv_emformer_transducer_stateless2": [28, 58], "cnn": [28, 30, 35], "context": [28, 35, 40, 46, 58, 60, 61, 62], "configur": [28, 30, 40, 44, 47, 51, 52, 54, 64, 65], "accordingli": [28, 29, 30], "yourself": [28, 29, 30, 44, 61, 62], "220": [28, 40, 41, 43], "229": [28, 38], "best_v": 28, "alid_epoch": 28, "subsampl": [28, 61, 62], "ing_factor": 28, "a34171ed85605b0926eebbd0463d059431f4f74a": 28, "dec": 28, "ver": 28, "ion": 28, "530e8a1": 28, "op": 28, "1220120619": [28, 29, 30], "7695ff496b": [28, 29, 30], "s9n4w": [28, 29, 30], "icefa": 28, "ll": 28, "transdu": 28, "cer": 28, "use_averaged_model": [28, 29, 30], "cnn_module_kernel": [28, 30], "left_context_length": 28, "chunk_length": 28, "right_context_length": 28, "memory_s": 28, "231": [28, 29, 30], "053": 28, "022": 28, "708": [28, 38, 40, 43, 54], "75490012": 28, "320": [28, 40], "682": 28, "lh": [28, 29, 30], "289m": 28, "jan": [28, 29, 30], "289": 28, "roughli": [28, 29, 30], "equal": [28, 29, 30, 61, 62], "1024": [28, 29, 30, 35, 60], "287": [28, 54], "1010k": [28, 29], "decoder_jit_trac": [28, 29, 30, 33, 60, 62], "283m": 28, "encoder_jit_trac": [28, 29, 30, 33, 60, 62], "0m": [28, 29], "joiner_jit_trac": [28, 29, 30, 33, 60, 62], "found": [28, 29, 30, 38, 40, 41, 43, 46, 48, 49, 54, 60, 61], "param": [28, 29, 30], "503k": [28, 29], "437": [28, 29, 30], "142m": 28, "79k": 28, "5m": [28, 29], "architectur": [28, 29, 30, 60], "editor": [28, 29, 30], "283": [28, 30], "1010": [28, 29], "503": [28, 29], "convers": [28, 29, 30], "half": [28, 29, 30, 46, 61, 62], "float16": [28, 29, 30], "occupi": [28, 29, 30], "twice": [28, 29, 30], "smaller": [28, 29, 30, 38, 40, 41, 43, 46, 48, 49, 60, 61, 62], "fp16": [28, 29, 30, 35, 36, 46, 48, 49, 56, 60, 61, 62, 64, 65], "won": [28, 29, 30, 31, 38, 41, 43, 44, 46, 48, 49, 60, 61, 62], "accept": [28, 29, 30], "216": [28, 38, 43, 51, 52], "encoder_param_filenam": [28, 29, 30], "encoder_bin_filenam": [28, 29, 30], "decoder_param_filenam": [28, 29, 30], "decoder_bin_filenam": [28, 29, 30], "joiner_param_filenam": [28, 29, 30], "joiner_bin_filenam": [28, 29, 30], "sound_filenam": [28, 29, 30], "328": 28, "336": 28, "106000": [28, 29, 30, 43, 47], "581": [28, 47], "381": 28, "7767517": [28, 29, 30], "1060": 28, "1342": 28, "in0": [28, 29, 30], "explan": [28, 29, 30], "magic": [28, 29, 30], "intermedi": [28, 29, 30], "1061": 28, "sherpametadata": [28, 29, 30], "sherpa_meta_data1": [28, 29, 30], "newli": [28, 29, 30], "must": [28, 29, 30, 61], "pair": [28, 29, 30], "sad": [28, 29, 30], "rememb": [28, 29, 30], "anymor": [28, 29, 30], "flexibl": [28, 29, 30, 35], "edit": [28, 29, 30], "arm": [28, 29, 30], "aarch64": [28, 29, 30], "onc": [28, 29], "mayb": [28, 29], "year": [28, 29], "_jit_trac": [28, 29], "fp32": [28, 29], "doubl": [28, 29], "py38": [28, 29, 30], "arg": [28, 
29], "wave_filenam": [28, 29], "16k": [28, 29], "hz": [28, 29, 51, 52], "mono": [28, 29], "calibr": [28, 29], "cat": [28, 29], "eof": [28, 29], "calcul": [28, 29, 48, 61, 62], "has_gpu": [28, 29], "config": [28, 29], "use_vulkan_comput": [28, 29], "conv_87": 28, "942385": [28, 29], "threshold": [28, 29, 48], "938493": 28, "968131": 28, "conv_88": 28, "442448": 28, "549335": 28, "167552": 28, "conv_89": 28, "228289": 28, "001738": 28, "871552": 28, "linear_90": 28, "976146": 28, "101789": 28, "267128": 28, "linear_91": 28, "962030": 28, "162033": 28, "602713": 28, "linear_92": 28, "323041": 28, "853959": 28, "953129": 28, "linear_94": 28, "905416": 28, "648006": 28, "323545": 28, "linear_93": 28, "474093": 28, "200188": 28, "linear_95": 28, "888012": 28, "403563": 28, "483986": 28, "linear_96": 28, "856741": 28, "398679": 28, "524273": 28, "linear_97": 28, "635942": 28, "613655": 28, "590950": 28, "linear_98": 28, "460340": 28, "670146": 28, "398010": 28, "linear_99": 28, "532276": 28, "585537": 28, "119396": 28, "linear_101": 28, "585871": 28, "719224": 28, "205809": 28, "linear_100": 28, "751382": 28, "081648": 28, "linear_102": 28, "593344": 28, "450581": 28, "551147": 28, "linear_103": 28, "592681": 28, "705824": 28, "257959": 28, "linear_104": 28, "752957": 28, "980955": 28, "110489": 28, "linear_105": 28, "696240": 28, "877193": 28, "608953": 28, "linear_106": 28, "059659": 28, "643138": 28, "048950": 28, "linear_108": 28, "975461": 28, "589567": 28, "671457": 28, "linear_107": 28, "190381": 28, "515701": 28, "linear_109": 28, "710759": 28, "305635": 28, "082436": 28, "linear_110": 28, "531228": 28, "731162": 28, "159557": 28, "linear_111": 28, "528083": 28, "259322": 28, "211544": 28, "linear_112": 28, "148807": 28, "500842": 28, "087374": 28, "linear_113": 28, "592566": 28, "948851": 28, "166611": 28, "linear_115": 28, "437109": 28, "608947": 28, "642395": 28, "linear_114": 28, "193942": 28, "503904": 28, "linear_116": 28, "966980": 28, "200896": 28, "676392": 28, "linear_117": 28, "451303": 28, "061664": 28, "951344": 28, "linear_118": 28, "077262": 28, "965800": 28, "023804": 28, "linear_119": 28, "671615": 28, "847613": 28, "198460": 28, "linear_120": 28, "625638": 28, "131427": 28, "556595": 28, "linear_122": 28, "274080": 28, "888716": 28, "978189": 28, "linear_121": 28, "420480": 28, "429659": 28, "linear_123": 28, "826197": 28, "599617": 28, "281532": 28, "linear_124": 28, "396383": 28, "325849": 28, "335875": 28, "linear_125": 28, "337198": 28, "941410": 28, "221970": 28, "linear_126": 28, "699965": 28, "842878": 28, "224073": 28, "linear_127": 28, "775370": 28, "884215": 28, "696438": 28, "linear_129": 28, "872276": 28, "837319": 28, "254213": 28, "linear_128": 28, "180057": 28, "687883": 28, "linear_130": 28, "150427": 28, "454298": 28, "765789": 28, "linear_131": 28, "112692": 28, "924847": 28, "025545": 28, "linear_132": 28, "852893": 28, "116593": 28, "749626": 28, "linear_133": 28, "517084": 28, "024665": 28, "275314": 28, "linear_134": 28, "683807": 28, "878618": 28, "743618": 28, "linear_136": 28, "421055": 28, "322729": 28, "086264": 28, "linear_135": 28, "309880": 28, "917679": 28, "linear_137": 28, "827781": 28, "744595": 28, "915554": 28, "linear_138": 28, "422395": 28, "742882": 28, "402161": 28, "linear_139": 28, "527538": 28, "866123": 28, "849449": 28, "linear_140": 28, "128619": 28, "657793": 28, "266134": 28, "linear_141": 28, "839593": 28, "845993": 28, "021378": 28, "linear_143": 28, "442304": 28, "099039": 28, "889746": 28, "linear_142": 28, "325038": 
28, "849592": 28, "linear_144": 28, "929444": 28, "618206": 28, "605080": 28, "linear_145": 28, "382126": 28, "321095": 28, "625010": 28, "linear_146": 28, "894987": 28, "867645": 28, "836517": 28, "linear_147": 28, "915313": 28, "906028": 28, "886522": 28, "linear_148": 28, "614287": 28, "908151": 28, "496181": 28, "linear_150": 28, "724932": 28, "485588": 28, "312899": 28, "linear_149": 28, "161146": 28, "606939": 28, "linear_151": 28, "164453": 28, "847355": 28, "719223": 28, "linear_152": 28, "086471": 28, "984121": 28, "222834": 28, "linear_153": 28, "099524": 28, "991601": 28, "816805": 28, "linear_154": 28, "054585": 28, "489706": 28, "286930": 28, "linear_155": 28, "389185": 28, "100321": 28, "963501": 28, "linear_157": 28, "982999": 28, "154796": 28, "637253": 28, "linear_156": 28, "537706": 28, "875190": 28, "linear_158": 28, "420287": 28, "502287": 28, "531588": 28, "linear_159": 28, "014746": 28, "423280": 28, "477261": 28, "linear_160": 28, "633553": 28, "715335": 28, "220921": 28, "linear_161": 28, "371849": 28, "117830": 28, "815203": 28, "linear_162": 28, "492933": 28, "126283": 28, "623318": 28, "linear_164": 28, "697504": 28, "825712": 28, "317358": 28, "linear_163": 28, "078367": 28, "008038": 28, "linear_165": 28, "023975": 28, "836278": 28, "577358": 28, "linear_166": 28, "860619": 28, "259792": 28, "493614": 28, "linear_167": 28, "380934": 28, "496160": 28, "107042": 28, "linear_168": 28, "691216": 28, "733317": 28, "831076": 28, "linear_169": 28, "723948": 28, "952728": 28, "129707": 28, "linear_171": 28, "034811": 28, "366547": 28, "665123": 28, "linear_170": 28, "356277": 28, "710501": 28, "linear_172": 28, "556884": 28, "729481": 28, "166058": 28, "linear_173": 28, "033039": 28, "207264": 28, "442120": 28, "linear_174": 28, "597379": 28, "658676": 28, "768131": 28, "linear_2": [28, 29], "293503": 28, "305265": 28, "877850": 28, "linear_1": [28, 29], "812222": 28, "766452": 28, "487047": 28, "linear_3": [28, 29], "999999": 28, "999755": 28, "031174": 28, "wish": [28, 29], "955k": 28, "18k": 28, "inparam": [28, 29], "inbin": [28, 29], "outparam": [28, 29], "outbin": [28, 29], "99m": 28, "78k": 28, "774k": [28, 29], "496": [28, 29, 43, 47], "replac": [28, 29], "774": [28, 29], "convolut": [28, 29, 48, 58, 61], "exact": [28, 29], "4x": [28, 29], "comparison": 28, "468000": [29, 33, 60], "lstm_transducer_stateless2": [29, 33, 60], "862": 29, "222": [29, 41, 43], "865": 29, "is_pnnx": 29, "62e404dd3f3a811d73e424199b3408e309c06e1a": [29, 30], "6d7a559": [29, 30], "feb": [29, 30, 40], "rnn_hidden_s": 29, "aux_layer_period": 29, "235": 29, "239": [29, 40], "472": 29, "324": 29, "83137520": 29, "596": 29, "325": 29, "257024": 29, "781812": 29, "327": 29, "84176356": 29, "182": [29, 30, 38, 47], "183": [29, 51, 52], "335": 29, "tracerwarn": [29, 30], "boolean": [29, 30], "caus": [29, 30, 38, 40, 41, 43, 46, 48, 49, 60, 61, 62], "incorrect": [29, 30, 40], "flow": [29, 30], "constant": [29, 30], "futur": [29, 30, 40, 66], "need_pad": 29, "bool": 29, "259": [29, 38], "339": 29, "207": [29, 41, 43], "324m": 29, "321": [29, 38], "318m": 29, "159m": 29, "21k": 29, "861": 29, "266": [29, 30, 43, 47], "431": 29, "342": 29, "343": 29, "379": 29, "268": [29, 43, 47], "317m": 29, "317": 29, "conv_15": 29, "930708": 29, "972025": 29, "conv_16": 29, "978855": 29, "031788": 29, "456645": 29, "conv_17": 29, "868437": 29, "830528": 29, "218575": 29, "linear_18": 29, "107259": 29, "194808": 29, "293236": 29, "linear_19": 29, "193777": 29, "634748": 29, "401705": 29, "linear_20": 29, 
"259933": 29, "606617": 29, "722160": 29, "linear_21": 29, "186600": 29, "790260": 29, "512129": 29, "linear_22": 29, "759041": 29, "265832": 29, "050053": 29, "linear_23": 29, "931209": 29, "099090": 29, "979767": 29, "linear_24": 29, "324160": 29, "215561": 29, "321835": 29, "linear_25": 29, "800708": 29, "599352": 29, "284134": 29, "linear_26": 29, "492444": 29, "153369": 29, "274391": 29, "linear_27": 29, "660161": 29, "720994": 29, "674126": 29, "linear_28": 29, "415265": 29, "174434": 29, "007133": 29, "linear_29": 29, "038418": 29, "118534": 29, "724262": 29, "linear_30": 29, "072084": 29, "936867": 29, "259155": 29, "linear_31": 29, "342712": 29, "599489": 29, "282787": 29, "linear_32": 29, "340535": 29, "120308": 29, "701103": 29, "linear_33": 29, "846987": 29, "630030": 29, "985939": 29, "linear_34": 29, "686298": 29, "204571": 29, "607586": 29, "linear_35": 29, "904821": 29, "575518": 29, "756420": 29, "linear_36": 29, "806659": 29, "585589": 29, "118401": 29, "linear_37": 29, "402340": 29, "047157": 29, "162680": 29, "linear_38": 29, "174589": 29, "923361": 29, "030258": 29, "linear_39": 29, "178576": 29, "556058": 29, "807705": 29, "linear_40": 29, "901954": 29, "301267": 29, "956539": 29, "linear_41": 29, "839805": 29, "597429": 29, "716181": 29, "linear_42": 29, "178945": 29, "651595": 29, "895699": 29, "829245": 29, "627592": 29, "637907": 29, "746186": 29, "255032": 29, "167313": 29, "000000": 29, "999756": 29, "031013": 29, "345k": 29, "17k": 29, "218m": 29, "counterpart": 29, "bit": [29, 38, 40, 41, 43, 47, 54], "4532": 29, "feedforward": [30, 35, 40, 46, 61], "384": [30, 35, 43], "unmask": [30, 35], "downsampl": [30, 35, 39], "factor": [30, 35, 38, 40, 41, 43, 44, 46, 48, 49, 60, 61, 62], "473": [30, 43], "246": [30, 40, 43, 51, 52], "477": 30, "warm_step": 30, "2000": [30, 41], "feedforward_dim": 30, "attention_dim": [30, 38, 40, 43], "encoder_unmasked_dim": 30, "zipformer_downsampling_factor": 30, "decode_chunk_len": 30, "257": [30, 40, 51, 52], "023": 30, "zipformer2": 30, "419": 30, "At": [30, 38, 43], "stack": 30, "downsampling_factor": 30, "037": 30, "655": 30, "346": 30, "68944004": 30, "347": 30, "260096": 30, "348": [30, 51], "716276": 30, "656": [30, 43], "349": 30, "69920376": 30, "351": 30, "353": 30, "174": [30, 43], "175": 30, "1344": 30, "cached_len": 30, "num_lay": 30, "1348": 30, "cached_avg": 30, "1352": 30, "cached_kei": 30, "1356": 30, "cached_v": 30, "1360": 30, "cached_val2": 30, "1364": 30, "cached_conv1": 30, "1368": 30, "cached_conv2": 30, "1373": 30, "left_context_len": 30, "1884": 30, "x_size": 30, "2442": 30, "2449": 30, "2469": 30, "2473": 30, "2483": 30, "kv_len": 30, "2570": 30, "attn_output": 30, "bsz": 30, "num_head": 30, "seq_len": 30, "head_dim": 30, "2926": 30, "lorder": 30, "2652": 30, "2653": 30, "embed_dim": 30, "2666": 30, "1543": 30, "in_x_siz": 30, "1637": 30, "1643": 30, "in_channel": 30, "1571": 30, "1763": 30, "src1": 30, "src2": 30, "1779": 30, "dim1": 30, "1780": 30, "dim2": 30, "_trace": 30, "958": 30, "tracer": 30, "tupl": 30, "namedtupl": 30, "absolut": 30, "know": [30, 44], "side": 30, "allow": [30, 46, 61], "behavior": [30, 40], "_c": 30, "_create_method_from_trac": 30, "646": 30, "357": 30, "embedding_out": 30, "686": 30, "361": [30, 43, 47], "735": 30, "269m": 30, "269": [30, 38, 51, 52], "725": [30, 47], "1022k": 30, "266m": 30, "8m": 30, "509k": 30, "133m": 30, "152k": 30, "4m": 30, "1022": 30, "509": 30, "360": 30, "365": 30, "280": [30, 43], "372": [30, 38], "state": [30, 38, 40, 41, 43, 46, 48, 49, 56, 60, 61, 
62], "410": 30, "411": [30, 43], "2028": 30, "2547": 30, "2029": 30, "23316": 30, "23317": 30, "23318": 30, "23319": 30, "23320": 30, "amount": [30, 37, 39], "pad": [30, 38, 40, 41, 43, 46, 48, 49, 60, 61, 62], "conv2dsubsampl": 30, "arrai": 30, "23300": 30, "repo_url": 31, "basenam": 31, "why": 32, "streaming_asr": [32, 33, 60, 61, 62], "conv_emform": 32, "offline_asr": [32, 46], "baz": 33, "compact": 35, "inject": 35, "competit": 35, "full": [35, 36, 43, 44, 46, 48, 49, 60, 61, 62], "subset": [35, 36, 43, 46, 48, 49, 60, 61, 62], "instruct": [35, 36], "intial": [35, 36], "decode_gigaspeech": [35, 36], "1000": [35, 36, 43, 64, 65], "insert": 35, "residu": 35, "zipformer2encoderlay": 35, "remain": 35, "untouch": 35, "experi": [35, 36, 38, 40, 41, 43, 44, 46, 48, 49, 54, 60, 61, 62], "do_finetun": [35, 36], "use_adapt": 35, "adapter_dim": 35, "zipformer_adapt": 35, "world": [35, 36, 38, 40, 41, 43, 44, 46, 47, 48, 49, 56, 60, 61, 62, 64, 65], "exp_giga_finetune_adapt": 35, "_adapter_dim": 35, "045": 35, "13022": 35, "ckpt": [35, 36], "certain": [35, 36, 37], "bottleneck": 35, "notic": 35, "trainal": 35, "2024": [35, 64], "808": [35, 43, 51], "1277": 35, "761344": 35, "trainabl": 35, "entir": 35, "deactiv": 35, "keep": [35, 40, 46, 61, 62], "768": 35, "1536": 35, "queri": 35, "po": 35, "causal": [35, 61], "previou": [36, 56], "stateless": [36, 39, 42, 46, 60, 61, 62], "due": [36, 38, 40, 41, 43, 46, 48, 49, 60, 61, 62], "vocabulari": [36, 40], "use_mux": 36, "exp_giga_finetun": 36, "_mux": 36, "0045": 36, "mux": 36, "13024": 36, "forget": 36, "quickli": 36, "mix": 36, "maintain": 36, "ones": 36, "lower": [36, 60], "public": 37, "capabl": 37, "high": [37, 39, 64], "label": 37, "1best": [38, 41, 43, 47, 48, 49, 51, 52], "automag": [38, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 60, 61, 62], "stop": [38, 40, 41, 43, 46, 47, 48, 49, 51, 52, 54, 60, 61, 62], "By": [38, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 60, 61, 62], "musan": [38, 41, 43, 44, 46, 48, 49, 60, 61, 62], "apt": [38, 41], "permiss": [38, 41], "commandlin": [38, 40, 41, 43, 46, 48, 49, 60, 61, 62], "multi": [38, 40, 41, 43, 44, 46, 48, 49, 58, 60, 61, 62], "machin": [38, 40, 41, 43, 46, 48, 49, 60, 61, 62], "ddp": [38, 40, 41, 43, 46, 48, 49, 60, 61, 62], "implement": [38, 40, 41, 43, 44, 46, 48, 49, 58, 60, 61, 62], "utter": [38, 40, 41, 43, 46, 48, 49, 60, 61, 62], "oom": [38, 40, 41, 43, 46, 48, 49, 60, 61, 62], "decai": [38, 41, 43, 48, 49, 60], "warmup": [38, 40, 41, 43, 46, 48, 49, 60, 61, 62], "function": [38, 40, 41, 43, 46, 47, 48, 49, 51, 52, 54, 60, 61, 62], "get_param": [38, 40, 41, 43, 46, 47, 48, 49, 51, 52, 54, 60, 61, 62], "directli": [38, 40, 41, 43, 44, 46, 48, 49, 60, 61, 62], "perturb": [38, 40, 41, 43, 46, 48, 49, 60, 61, 62], "3x150": [38, 40, 41], "450": [38, 40, 41], "visual": [38, 40, 41, 43, 46, 47, 48, 49, 51, 52, 54, 60, 61, 62], "logdir": [38, 40, 41, 43, 46, 47, 48, 49, 51, 52, 54, 60, 61, 62], "labelsmooth": 38, "tensorflow": [38, 40, 41, 43, 46, 48, 49, 54, 60, 61], "press": [38, 40, 41, 43, 46, 48, 49, 54, 60, 61, 62], "ctrl": [38, 40, 41, 43, 46, 48, 49, 54, 60, 61, 62], "engw8ksktzqs24zbv5dgcg": 38, "2021": [38, 41, 43, 47, 51, 52, 54], "22t11": 38, "scan": [38, 40, 41, 43, 46, 54, 60, 61], "116068": 38, "scalar": [38, 40, 41, 43, 46, 54, 60, 61], "listen": [38, 40, 41, 46, 54, 60, 61], "xxxx": [38, 40, 41, 43, 46, 47, 48, 49, 51, 52, 54, 60, 61, 62], "saw": [38, 40, 41, 43, 46, 47, 48, 49, 51, 52, 54, 60, 61, 62], "consol": [38, 40, 41, 43, 46, 47, 48, 49, 51, 52, 54, 60, 61, 62], "avoid": [38, 40, 
43], "nbest": [38, 43, 49], "lattic": [38, 41, 43, 46, 47, 51, 52, 61, 62], "uniqu": [38, 43, 46, 61, 62], "pkufool": [38, 41, 47], "icefall_asr_aishell_conformer_ctc": 38, "transcrib": [38, 40, 41, 43], "lang_char": [38, 40], "bac009s0764w0121": [38, 40, 41], "bac009s0764w0122": [38, 40, 41], "bac009s0764w0123": [38, 40, 41], "tran": [38, 41, 43, 47, 51, 52], "conveni": [38, 41, 43, 44], "eo": [38, 41, 43], "soxi": [38, 40, 41, 43, 47, 54], "sampl": [38, 40, 41, 43, 47, 48, 54, 61, 62], "precis": [38, 40, 41, 43, 46, 47, 54, 61, 62], "67263": [38, 40, 41], "cdda": [38, 40, 41, 43, 47, 54], "sector": [38, 40, 41, 43, 47, 54], "135k": [38, 40, 41], "256k": [38, 40, 41, 43], "sign": [38, 40, 41, 43, 54], "integ": [38, 40, 41, 43, 54], "pcm": [38, 40, 41, 43, 54], "65840": [38, 40, 41], "308": [38, 40, 41], "625": [38, 40, 41], "132k": [38, 40, 41], "64000": [38, 40, 41], "300": [38, 40, 41, 43, 44, 46, 56, 61], "128k": [38, 40, 41, 54], "topologi": [38, 43], "num_decoder_lay": [38, 43], "vgg_frontend": [38, 40, 43], "use_feat_batchnorm": [38, 43], "f2fd997f752ed11bbef4c306652c433e83f9cf12": 38, "sun": 38, "sep": 38, "33cfe45": 38, "d57a873": 38, "nov": [38, 43], "hw": 38, "kangwei": 38, "icefall_aishell3": 38, "k2_releas": 38, "tokens_fil": 38, "num_path": [38, 43, 46, 61, 62], "ngram_lm_scal": [38, 43], "attention_decoder_scal": [38, 43], "nbest_scal": [38, 43], "sos_id": [38, 43], "eos_id": [38, 43], "4336": [38, 40], "293": [38, 43], "369": [38, 43], "\u751a": [38, 40], "\u81f3": [38, 40], "\u51fa": [38, 40], "\u73b0": [38, 40], "\u4ea4": [38, 40], "\u6613": [38, 40], "\u51e0": [38, 40], "\u4e4e": [38, 40], "\u505c": [38, 40], "\u6b62": 38, "\u7684": [38, 40, 41], "\u60c5": [38, 40], "\u51b5": [38, 40], "\u4e00": [38, 40], "\u4e8c": [38, 40], "\u7ebf": [38, 40, 41], "\u57ce": [38, 40], "\u5e02": [38, 40], "\u867d": [38, 40], "\u7136": [38, 40], "\u4e5f": [38, 40, 41], "\u5904": [38, 40], "\u4e8e": [38, 40], "\u8c03": [38, 40], "\u6574": [38, 40], "\u4e2d": [38, 40, 41], "\u4f46": [38, 40, 41], "\u56e0": [38, 40], "\u4e3a": [38, 40], "\u805a": [38, 40], "\u96c6": [38, 40], "\u4e86": [38, 40, 41], "\u8fc7": [38, 40], "\u591a": [38, 40], "\u516c": [38, 40], "\u5171": [38, 40], "\u8d44": [38, 40], "\u6e90": [38, 40], "371": 38, "683": 38, "684": [38, 54], "651": [38, 54], "654": 38, "659": 38, "752": 38, "887": 38, "340": 38, "370": 38, "\u751a\u81f3": [38, 41], "\u51fa\u73b0": [38, 41], "\u4ea4\u6613": [38, 41], "\u51e0\u4e4e": [38, 41], "\u505c\u6b62": 38, "\u60c5\u51b5": [38, 41], "\u4e00\u4e8c": [38, 41], "\u57ce\u5e02": [38, 41], "\u867d\u7136": [38, 41], "\u5904\u4e8e": [38, 41], "\u8c03\u6574": [38, 41], "\u56e0\u4e3a": [38, 41], "\u805a\u96c6": [38, 41], "\u8fc7\u591a": [38, 41], "\u516c\u5171": [38, 41], "\u8d44\u6e90": [38, 41], "recor": [38, 43], "highest": [38, 43], "966": 38, "821": 38, "822": 38, "826": 38, "916": 38, "345": 38, "889": 38, "limit": [38, 40, 43, 58, 61], "upgrad": [38, 43], "checkout": [38, 43], "hlg_decod": [38, 43], "four": [38, 43], "messag": [38, 43, 46, 48, 49, 60, 61, 62], "use_gpu": [38, 43], "word_tabl": [38, 43], "forward": [38, 43, 48], "cu": [38, 43], "char": [38, 43], "693": [38, 51], "nnet_output": [38, 43], "185": [38, 43, 54], "217": [38, 43], "mandarin": 39, "beij": 39, "shell": 39, "technologi": 39, "ltd": 39, "peopl": 39, "accent": 39, "area": 39, "invit": 39, "particip": 39, "conduct": 39, "indoor": 39, "fidel": 39, "microphon": 39, "16khz": 39, "manual": 39, "profession": 39, "annot": 39, "inspect": 39, "free": [39, 44, 56, 60], "academ": 39, 
"moder": 39, "research": 39, "openslr": [39, 56], "conv1d": [40, 46, 60, 61, 62], "tanh": 40, "borrow": 40, "ieeexplor": 40, "ieee": 40, "jsp": 40, "arnumb": 40, "9054419": 40, "predict": [40, 44, 46, 60, 61, 62], "87939824": 40, "optimized_transduc": 40, "technqiu": 40, "maximum": 40, "emit": 40, "simplifi": [40, 58], "significantli": 40, "degrad": 40, "exactli": 40, "unprun": 40, "advantag": 40, "minim": 40, "pruned_transducer_stateless": [40, 46, 58, 61], "altern": 40, "though": 40, "transducer_stateless_modifi": 40, "pr": 40, "ram": 40, "tri": 40, "prob": [40, 60], "219": [40, 43], "lagz6hrcqxoigbfd5e0y3q": 40, "03t14": 40, "8477": 40, "250": [40, 47], "sym": [40, 46, 61, 62], "beam_search": [40, 46, 61, 62], "decoding_method": 40, "beam_4": 40, "ensur": 40, "poor": 40, "531": [40, 41], "994": [40, 43], "027": 40, "encoder_out_dim": 40, "f4fefe4882bc0ae59af951da3f47335d5495ef71": 40, "50d2281": 40, "mar": 40, "0815224919": 40, "75d558775b": 40, "mmnv8": 40, "878": [40, 52], "880": 40, "891": 40, "userwarn": 40, "__floordiv__": 40, "round": 40, "toward": 40, "trunc": 40, "floor": 40, "div": 40, "rounding_mod": 40, "divis": 40, "x_len": 40, "\u6ede": 40, "322": 40, "759": 40, "760": 40, "919": 40, "922": 40, "929": 40, "046": 40, "319": [40, 43], "798": 40, "831": [40, 52], "215": [40, 43, 47], "402": 40, "topk_hyp_index": 40, "topk_index": 40, "logit": 40, "583": [40, 52], "lji9mwuorlow3jkdhxwk8a": 41, "13t11": 41, "4454": 41, "icefall_asr_aishell_tdnn_lstm_ctc": 41, "858": [41, 43], "389": [41, 43], "536": 41, "539": 41, "917": 41, "\u505c\u6ede": 41, "mmi": [42, 45], "blank": [42, 45], "skip": [42, 44, 45, 46, 60, 61, 62], "distil": [42, 45], "hubert": [42, 45], "ligru": [42, 50], "libri": [43, 44, 46, 48, 49, 60, 61, 62], "3x960": [43, 46, 48, 49, 60, 61, 62], "2880": [43, 46, 48, 49, 60, 61, 62], "lzgnetjwrxc3yghnmd4kpw": 43, "24t16": 43, "4540": 43, "sentenc": [43, 56], "piec": 43, "And": [43, 46, 48, 49, 60, 61, 62], "neither": 43, "nor": 43, "5000": 43, "033": 43, "537": 43, "full_libri": [43, 44], "464": 43, "548": 43, "776": 43, "652": [43, 54], "109226120": 43, "714": [43, 51], "206": 43, "944": 43, "1328": 43, "443": [43, 47], "2563": 43, "494": 43, "592": 43, "1715": 43, "52576": 43, "1424": 43, "807": 43, "506": 43, "362": 43, "1477": 43, "2922": 43, "4295": 43, "52343": 43, "396": 43, "3584": 43, "432": 43, "680": [43, 51], "_pickl": 43, "unpicklingerror": 43, "invalid": 43, "hlg_modifi": 43, "g_4_gram": [43, 47, 51, 52], "sentencepiec": 43, "875": [43, 47], "212k": 43, "267440": [43, 47], "1253": [43, 47], "535k": 43, "77200": [43, 47], "154k": 43, "554": 43, "7178d67e594bc7fa89c2b331ad7bd1c62a6a9eb4": 43, "8d93169": 43, "601": 43, "025": 43, "broffel": 43, "osom": 43, "723": 43, "775": 43, "881": 43, "571": 43, "857": 43, "979": 43, "055": 43, "051": 43, "363": 43, "959": [43, 52], "546": 43, "598": 43, "599": [43, 47], "833": 43, "834": 43, "915": 43, "076": 43, "397": 43, "999": [43, 46, 61, 62], "concaten": 43, "bucket": 43, "sampler": 43, "ctc_decod": 43, "ngram_lm_rescor": 43, "attention_rescor": 43, "228": 43, "543": 43, "topo": 43, "547": 43, "729": 43, "703": 43, "545": 43, "945": 43, "475": 43, "191": [43, 51, 52], "398": 43, "515": 43, "deseri": 43, "441": 43, "fsaclass": 43, "loadfsa": 43, "const": 43, "string": 43, "c10": 43, "ignor": 43, "589": 43, "attention_scal": 43, "188": 43, "984": 43, "624": 43, "519": [43, 52], "632": 43, "645": [43, 54], "243": 43, "303": 43, "179": 43, "knowledg": 44, "vector": 44, "mvq": 44, "kd": 44, 
"pruned_transducer_stateless4": [44, 46, 58, 61], "theoret": 44, "applic": 44, "minor": 44, "stop_stag": [44, 64, 65], "thing": 44, "distillation_with_hubert": 44, "Of": 44, "cours": 44, "xl": 44, "proce": 44, "960h": [44, 48], "use_extracted_codebook": 44, "augment": 44, "th": [44, 51, 52], "embedding_lay": 44, "num_codebook": 44, "under": [44, 56], "vq_fbank_layer36_cb8": 44, "whola": 44, "snippet": 44, "echo": 44, "awk": 44, "pruned_transducer_stateless6": 44, "12359": 44, "spec": 44, "warp": 44, "paid": 44, "suitabl": [46, 60, 61, 62], "pruned_transducer_stateless2": [46, 58, 61], "pruned_transducer_stateless5": [46, 58, 61], "scroll": [46, 48, 49, 60, 61, 62], "arxiv": [46, 60, 61, 62], "2206": [46, 60, 61, 62], "13236": [46, 60, 61, 62], "rework": [46, 58, 61], "daniel": [46, 61, 62], "joint": [46, 60, 61, 62], "contrari": [46, 60, 61, 62], "convent": [46, 60, 61, 62], "recurr": [46, 60, 61, 62], "2x": [46, 61, 62], "littl": [46, 61], "436000": [46, 48, 49, 60, 61, 62], "438000": [46, 48, 49, 60, 61, 62], "qogspbgsr8kzcrmmie9jgw": 46, "20t15": [46, 60, 61], "4468": [46, 60, 61], "210171": [46, 60, 61], "access": [46, 48, 49, 60, 61, 62], "googl": [46, 48, 49, 60, 61, 62], "6008": [46, 48, 49, 60, 61, 62], "localhost": [46, 48, 49, 60, 61, 62], "expos": [46, 48, 49, 60, 61, 62], "proxi": [46, 48, 49, 60, 61, 62], "bind_al": [46, 48, 49, 60, 61, 62], "fast_beam_search": [46, 48, 60, 61, 62], "474000": [46, 60, 61, 62], "largest": [46, 61, 62], "posterior": [46, 48, 61, 62], "algorithm": [46, 61, 62], "pdf": [46, 49, 61, 62], "1211": [46, 61, 62], "3711": [46, 61, 62], "espnet": [46, 61, 62], "net": [46, 61, 62], "beam_search_transduc": [46, 61, 62], "basic": [46, 61], "topk": [46, 61, 62], "expand": [46, 61, 62], "mode": [46, 61, 62], "being": [46, 61, 62], "hardcod": [46, 61, 62], "composit": [46, 61, 62], "hard": [46, 58, 61, 62], "2211": [46, 61, 62], "00484": [46, 61, 62], "fast_beam_search_lg": [46, 61, 62], "trivial": [46, 61, 62], "fast_beam_search_nbest": [46, 61, 62], "random_path": [46, 61, 62], "shortest": [46, 61, 62], "fast_beam_search_nbest_lg": [46, 61, 62], "logic": [46, 61, 62], "smallest": [46, 60, 61, 62], "normal": [47, 51, 52, 54, 61], "icefall_asr_librispeech_tdnn": 47, "lstm_ctc": 47, "flac": 47, "116k": 47, "140k": 47, "343k": 47, "164k": 47, "105k": 47, "174k": 47, "pretraind": 47, "584": [47, 52], "791": 47, "245": 47, "098": 47, "099": 47, "methond": [47, 51, 52], "631": 47, "010": 47, "guidanc": 48, "bigger": 48, "simpli": 48, "discard": 48, "prevent": 48, "lconv": 48, "encourag": [48, 49, 60], "stabil": [48, 49], "doesn": 48, "warm": [48, 49], "xyozukpeqm62hbilud4upa": [48, 49], "ctc_guide_decode_b": 48, "pretrained_ctc": 48, "jit_pretrained_ctc": 48, "100h": 48, "yfyeung": 48, "wechat": 49, "zipformer_mmi": 49, "worker": [49, 60], "hp": 49, "tdnn_ligru_ctc": 51, "enough": [51, 52, 54, 56], "luomingshuang": [51, 52], "icefall_asr_timit_tdnn_ligru_ctc": 51, "pretrained_average_9_25": 51, "fdhc0_si1559": [51, 52], "felc0_si756": [51, 52], "fmgd0_si1564": [51, 52], "ffprobe": [51, 52], "show_format": [51, 52], "nistspher": [51, 52], "database_id": [51, 52], "database_vers": [51, 52], "utterance_id": [51, 52], "dhc0_si1559": [51, 52], "sample_min": [51, 52], "4176": [51, 52], "sample_max": [51, 52], "5984": [51, 52], "bitrat": [51, 52], "pcm_s16le": [51, 52], "s16": [51, 52], "elc0_si756": [51, 52], "1546": [51, 52], "1989": [51, 52], "mgd0_si1564": [51, 52], "7626": [51, 52], "10573": [51, 52], "660": 51, "695": 51, "697": 51, "819": 51, "829": 51, "sil": [51, 
52], "dh": [51, 52], "ih": [51, 52], "uw": [51, 52], "ah": [51, 52], "ii": [51, 52], "aa": [51, 52], "ei": [51, 52], "dx": [51, 52], "uh": [51, 52], "ng": [51, 52, 64], "eh": [51, 52], "jh": [51, 52], "er": [51, 52], "ai": [51, 52], "hh": [51, 52], "aw": 51, "ae": [51, 52], "705": 51, "715": 51, "720": 51, "251": [51, 52], "ch": 51, "icefall_asr_timit_tdnn_lstm_ctc": 52, "pretrained_average_16_25": 52, "816": 52, "827": 52, "unk": 52, "739": 52, "977": 52, "978": 52, "981": 52, "ow": 52, "ykubhb5wrmosxykid1z9eg": 54, "23t23": 54, "icefall_asr_yesno_tdnn": 54, "0_0_1_0_0_1_1_1": 54, "0_0_1_0_1_0_0_1": 54, "0_0_1_1_0_0_0_1": 54, "0_0_1_1_0_1_1_0": 54, "0_0_1_1_1_0_0_0": 54, "0_0_1_1_1_1_0_0": 54, "0_1_0_0_0_1_0_0": 54, "0_1_0_0_1_0_1_0": 54, "0_1_0_1_0_0_0_0": 54, "0_1_0_1_1_1_0_0": 54, "0_1_1_0_0_1_1_1": 54, "0_1_1_1_0_0_1_0": 54, "0_1_1_1_1_0_1_0": 54, "1_0_0_0_0_0_0_0": 54, "1_0_0_0_0_0_1_1": 54, "1_0_0_1_0_1_1_1": 54, "1_0_1_1_0_1_1_1": 54, "1_0_1_1_1_1_0_1": 54, "1_1_0_0_0_1_1_1": 54, "1_1_0_0_1_0_1_1": 54, "1_1_0_1_0_1_0_0": 54, "1_1_0_1_1_0_0_1": 54, "1_1_0_1_1_1_1_0": 54, "1_1_1_0_0_1_0_1": 54, "1_1_1_0_1_0_1_0": 54, "1_1_1_1_0_0_1_0": 54, "1_1_1_1_1_0_0_0": 54, "1_1_1_1_1_1_1_1": 54, "54080": 54, "507": 54, "108k": 54, "650": 54, "198": 54, "181": 54, "186": 54, "187": 54, "correctli": 54, "simplest": 54, "nnlm": 56, "complet": 56, "wget": [56, 64], "resourc": 56, "norm": 56, "gzip": 56, "prepare_lm_training_data": 56, "lm_data": 56, "grab": 56, "cup": 56, "coffe": 56, "sort_lm_training_data": 56, "sorted_lm_data": 56, "statist": 56, "lm_data_stat": 56, "aforement": 56, "repeat": 56, "rnn_lm": 56, "tie": 56, "hyper": [56, 64, 65], "coupl": [56, 64, 65], "dai": [56, 64, 65], "former": 58, "mask": [58, 61, 62], "wenet": 58, "did": 58, "request": 58, "complic": 58, "techniqu": 58, "bank": 58, "memor": 58, "histori": 58, "introduc": 58, "variant": 58, "pruned_stateless_emformer_rnnt2": 58, "conv_emformer_transducer_stateless": 58, "ourself": 58, "mechan": 58, "onlin": 60, "lstm_transducer_stateless": 60, "prepare_giga_speech": 60, "cj2vtpiwqhkn9q1tx6ptpg": 60, "dynam": [61, 62], "short": [61, 62], "2012": 61, "05481": 61, "flag": 61, "indic": [61, 62], "whether": 61, "uniformli": [61, 62], "seen": [61, 62], "97vkxf80ru61cnp2alwzzg": 61, "streaming_decod": [61, 62], "wise": [61, 62], "parallel": [61, 62], "bath": [61, 62], "parallelli": [61, 62], "seem": 61, "benefit": 61, "320m": 62, "550": 62, "basicli": 62, "scriptmodul": 62, "jit_trace_export": 62, "jit_trace_pretrain": 62, "monoton": 63, "condit": [64, 65], "variat": [64, 65], "autoencod": [64, 65], "adversari": [64, 65], "piper_phonem": 64, "numba": 64, "espnet_tts_frontend": 64, "monotonic_align": [64, 65], "build_ext": [64, 65], "inplac": [64, 65], "medium": 64, "ground": [64, 65], "truth": [64, 65], "test_onnx": [64, 65], "program": 64, "kotlin": 64, "java": 64, "swift": 64, "offlin": 64, "espeak": 64, "bz2": 64, "xf": 64, "thread": 64, "countri": 64, "plai": 64, "350": 65, "zrjin": 65, "synthesi": 66, "task": 66}, "objects": {}, "objtypes": {}, "objnames": {}, "titleterms": {"follow": 0, "code": [0, 9], "style": 0, "contribut": [1, 3], "document": 1, "how": [2, 26, 32, 33], "creat": [2, 13, 20, 25], "recip": [2, 66], "data": [2, 9, 11, 20, 25, 35, 36, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 60, 61, 62, 64, 65], "prepar": [2, 9, 11, 20, 25, 35, 36, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 60, 61, 62, 64, 65], "train": [2, 9, 16, 22, 25, 28, 29, 30, 31, 35, 36, 37, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 
56, 60, 61, 62, 64, 65], "decod": [2, 5, 6, 7, 9, 12, 25, 26, 31, 35, 38, 40, 41, 43, 44, 46, 47, 48, 49, 51, 52, 54, 60, 61, 62], "pre": [2, 22, 28, 29, 30, 31, 35, 36, 37, 38, 40, 41, 43, 46, 47, 48, 49, 51, 52, 54, 60, 61, 62], "model": [2, 5, 15, 22, 26, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 40, 41, 43, 46, 47, 48, 49, 51, 52, 54, 56, 60, 61, 62, 64, 65], "lodr": [4, 6], "rnn": [4, 55, 56], "transduc": [4, 6, 7, 28, 29, 30, 40, 46, 60, 61, 62], "wer": [4, 6, 7, 43], "differ": [4, 6, 7, 17], "beam": [4, 6, 7, 40], "size": [4, 6, 7], "languag": [5, 56], "lm": [6, 43, 55], "rescor": [6, 38, 43], "base": [6, 18, 19, 20], "method": 6, "v": 6, "shallow": [6, 7], "fusion": [6, 7], "The": [6, 40], "number": 6, "each": [6, 20], "field": 6, "i": 6, "test": [6, 7, 20, 25, 28, 29, 30], "clean": [6, 7], "other": 6, "time": [6, 7], "docker": [8, 9], "introduct": [9, 58], "view": 9, "avail": 9, "tag": 9, "cuda": [9, 25], "enabl": 9, "imag": 9, "cpu": 9, "onli": 9, "download": [9, 11, 25, 28, 29, 30, 31, 38, 40, 41, 43, 46, 47, 48, 49, 51, 52, 54, 60, 61, 62, 64, 65], "run": [9, 26, 64], "gpu": 9, "yesno": [9, 53], "within": 9, "contain": 9, "updat": 9, "frequent": 10, "ask": 10, "question": 10, "faq": 10, "oserror": 10, "libtorch_hip": 10, "so": 10, "cannot": 10, "open": 10, "share": 10, "object": 10, "file": [10, 11, 20, 31, 64], "directori": 10, "attributeerror": 10, "modul": 10, "distutil": 10, "ha": 10, "attribut": 10, "version": 10, "importerror": 10, "libpython3": 10, "10": 10, "1": [10, 25, 28, 29, 30, 38, 40, 41, 43], "0": [10, 25], "No": 10, "For": [11, 12, 13, 15, 16], "more": [11, 12, 13, 15, 16], "curiou": [11, 12, 13, 15, 16], "A": 11, "quick": 11, "look": 11, "gener": [11, 20], "environ": [13, 20, 25], "setup": 13, "virtual": [13, 25], "instal": [13, 25, 28, 29, 30, 38, 40, 41, 43, 47, 51, 52, 64], "depend": [13, 64], "icefal": [13, 14, 24, 25, 28, 29, 30], "dummi": 14, "tutori": 14, "export": [15, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 46, 48, 49, 60, 61, 62, 64, 65], "paramet": 15, "via": [15, 28, 29, 30], "state_dict": [15, 26, 46, 48, 49, 60, 61, 62], "torch": [15, 25, 28, 29, 30, 32, 33, 46, 48, 49, 60, 61, 62], "jit": [15, 28, 29, 30, 32, 33, 46, 48, 49, 60, 61, 62], "script": [15, 32, 46, 48, 49, 61, 62], "onnx": [15, 31, 64], "two": 17, "approach": 17, "between": 17, "fst": [18, 20], "forc": [18, 19, 20], "align": [18, 19, 20, 64, 65], "content": [18, 24, 37, 66], "k2": [19, 25], "kaldi": 20, "get": 20, "comput": [20, 43], "log_prob": 20, "token2id": 20, "id2token": 20, "word2id": 20, "id2word": 20, "lexicon": [20, 64], "relat": 20, "convert": 20, "transcript": 20, "an": [20, 56], "graph": 20, "segment": 20, "word": 20, "us": [20, 26, 32, 33, 46, 48, 49, 60, 61, 62], "summari": 20, "huggingfac": [21, 23], "space": 23, "youtub": [23, 25], "video": [23, 25], "toolkit": 25, "cudnn": 25, "torchaudio": 25, "2": [25, 28, 29, 30, 38, 40, 41, 43], "3": [25, 28, 29, 30, 38, 40, 43], "lhots": 25, "4": [25, 28, 29, 30], "exampl": [25, 31, 38, 40, 41, 43, 46, 48, 49, 60, 61, 62], "5": [25, 28, 29, 30], "6": [25, 28, 29, 30], "your": 25, "when": [26, 32, 33], "py": 26, "ncnn": [27, 28, 29, 30], "convemform": 28, "pnnx": [28, 29, 30], "trace": [28, 29, 30, 33, 60, 62], "torchscript": [28, 29, 30], "modifi": [28, 29, 30, 40], "encod": [28, 29, 30], "sherpa": [28, 29, 30, 31, 46, 61, 62, 64], "7": [28, 29], "option": [28, 29, 38, 41, 43, 46, 48, 49, 60, 61, 62], "int8": [28, 29], "quantiz": [28, 29], "lstm": [29, 41, 47, 52, 60], "stream": [30, 42, 57, 58, 61, 62], "zipform": [30, 35, 
36, 48, 49, 62], "sound": 31, "finetun": [35, 36], "from": [35, 36], "adapt": 35, "fine": [35, 36, 37], "tune": [35, 36, 37], "supervis": 36, "tabl": [37, 66], "conform": [38, 43, 58], "ctc": [38, 41, 43, 47, 48, 51, 52, 54], "configur": [38, 41, 43, 46, 48, 49, 60, 61, 62], "log": [38, 40, 41, 43, 46, 48, 49, 60, 61, 62], "usag": [38, 40, 41, 43, 46, 48, 49, 60, 61, 62, 64], "case": [38, 40, 41, 43], "kaldifeat": [38, 40, 41, 43, 47, 51, 52, 54], "hlg": [38, 41, 43], "attent": [38, 43], "colab": [38, 40, 41, 43, 47, 51, 52, 54], "notebook": [38, 40, 41, 43, 47, 51, 52, 54], "deploy": [38, 43], "c": [38, 43], "aishel": 39, "stateless": 40, "loss": 40, "todo": 40, "greedi": 40, "search": [40, 64, 65], "tdnn": [41, 47, 51, 52, 54], "non": 42, "asr": [42, 57], "n": 43, "gram": 43, "distil": 44, "hubert": 44, "codebook": 44, "index": 44, "librispeech": [45, 59], "prune": [46, 61], "statelessx": [46, 61], "pretrain": [46, 48, 49, 60, 61, 62, 64, 65], "deploi": [46, 61, 62], "infer": [47, 51, 52, 54, 64, 65], "blank": 48, "skip": 48, "mmi": 49, "timit": 50, "ligru": 51, "emform": 58, "which": 60, "simul": [61, 62], "real": [61, 62], "tt": 63, "vit": [64, 65], "ljspeech": 64, "extra": 64, "build": [64, 65], "monoton": [64, 65], "vctk": 65}, "envversion": {"sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.todo": 2, "sphinx": 58}, "alltitles": {"Follow the code style": [[0, "follow-the-code-style"]], "Contributing to Documentation": [[1, "contributing-to-documentation"]], "How to create a recipe": [[2, "how-to-create-a-recipe"]], "Data Preparation": [[2, "data-preparation"], [11, "data-preparation"], [40, "data-preparation"]], "Training": [[2, "training"], [9, "training"], [16, "training"], [25, "training"], [38, "training"], [40, "training"], [41, "training"], [43, "training"], [44, "training"], [46, "training"], [47, "training"], [48, "training"], [49, "training"], [51, "training"], [52, "training"], [54, "training"], [60, "training"], [61, "training"], [62, "training"], [64, "training"], [65, "training"]], "Decoding": [[2, "decoding"], [9, "decoding"], [12, "decoding"], [25, "decoding"], [35, "decoding"], [38, "decoding"], [40, "decoding"], [41, "decoding"], [43, "decoding"], [44, "decoding"], [46, "decoding"], [47, "decoding"], [48, "decoding"], [49, "decoding"], [51, "decoding"], [52, "decoding"], [54, "decoding"], [60, "decoding"], [61, "decoding"], [62, "decoding"]], "Pre-trained model": [[2, "pre-trained-model"]], "Contributing": [[3, "contributing"]], "LODR for RNN Transducer": [[4, "lodr-for-rnn-transducer"]], "WER of LODR with different beam sizes": [[4, "id1"]], "Decoding with language models": [[5, "decoding-with-language-models"]], "LM rescoring for Transducer": [[6, "lm-rescoring-for-transducer"]], "WERs of LM rescoring with different beam sizes": [[6, "id1"]], "WERs of LM rescoring + LODR with different beam sizes": [[6, "id2"]], "LM-rescoring-based methods vs shallow-fusion-based methods (The numbers in each field is WER on test-clean, WER on test-other and decoding time on test-clean)": [[6, "id3"]], "Shallow fusion for Transducer": [[7, "shallow-fusion-for-transducer"]], "WERs and decoding time (on test-clean) of shallow fusion with different beam sizes": [[7, "id2"]], "Docker": [[8, "docker"]], "Introduction": [[9, "introduction"], [58, 
"introduction"]], "View available tags": [[9, "view-available-tags"]], "CUDA-enabled docker images": [[9, "cuda-enabled-docker-images"]], "CPU-only docker images": [[9, "cpu-only-docker-images"]], "Download a docker image (CUDA)": [[9, "download-a-docker-image-cuda"]], "Download a docker image (CPU)": [[9, "download-a-docker-image-cpu"]], "Run a docker image with GPU": [[9, "run-a-docker-image-with-gpu"]], "Run a docker image with CPU": [[9, "run-a-docker-image-with-cpu"]], "Run yesno within a docker container": [[9, "run-yesno-within-a-docker-container"]], "Update the code": [[9, "update-the-code"]], "Data preparation": [[9, "data-preparation"], [25, "data-preparation"], [35, "data-preparation"], [36, "data-preparation"], [38, "data-preparation"], [41, "data-preparation"], [43, "data-preparation"], [44, "data-preparation"], [46, "data-preparation"], [47, "data-preparation"], [48, "data-preparation"], [49, "data-preparation"], [51, "data-preparation"], [52, "data-preparation"], [54, "data-preparation"], [60, "data-preparation"], [61, "data-preparation"], [62, "data-preparation"], [64, "data-preparation"], [65, "data-preparation"]], "Frequently Asked Questions (FAQs)": [[10, "frequently-asked-questions-faqs"]], "OSError: libtorch_hip.so: cannot open shared object file: no such file or directory": [[10, "oserror-libtorch-hip-so-cannot-open-shared-object-file-no-such-file-or-directory"]], "AttributeError: module \u2018distutils\u2019 has no attribute \u2018version\u2019": [[10, "attributeerror-module-distutils-has-no-attribute-version"]], "ImportError: libpython3.10.so.1.0: cannot open shared object file: No such file or directory": [[10, "importerror-libpython3-10-so-1-0-cannot-open-shared-object-file-no-such-file-or-directory"]], "For the more curious": [[11, "for-the-more-curious"], [12, "for-the-more-curious"], [13, "for-the-more-curious"], [15, "for-the-more-curious"], [16, "for-the-more-curious"]], "A quick look to the generated files": [[11, "a-quick-look-to-the-generated-files"]], "download": [[11, "download"]], "data": [[11, "data"]], "Environment setup": [[13, "environment-setup"]], "Create a virtual environment": [[13, "create-a-virtual-environment"]], "Install dependencies": [[13, "install-dependencies"]], "Install icefall": [[13, "install-icefall"]], "Icefall for dummies tutorial": [[14, "icefall-for-dummies-tutorial"]], "Model Export": [[15, "model-export"]], "Export the model parameters via model.state_dict()": [[15, "export-the-model-parameters-via-model-state-dict"]], "Export via torch.jit.script()": [[15, "export-via-torch-jit-script"]], "Export via torch.onnx.export()": [[15, "export-via-torch-onnx-export"]], "Two approaches": [[17, "two-approaches"]], "Differences between the two approaches": [[17, "differences-between-the-two-approaches"]], "FST-based forced alignment": [[18, "fst-based-forced-alignment"]], "Contents:": [[18, null], [24, null]], "k2-based forced alignment": [[19, "k2-based-forced-alignment"]], "Kaldi-based forced alignment": [[20, "kaldi-based-forced-alignment"]], "Prepare the environment": [[20, "prepare-the-environment"]], "Get the test data": [[20, "get-the-test-data"]], "Compute log_probs": [[20, "compute-log-probs"]], "Create token2id and id2token": [[20, "create-token2id-and-id2token"]], "Create word2id and id2word": [[20, "create-word2id-and-id2word"]], "Generate lexicon-related files": [[20, "generate-lexicon-related-files"]], "Convert transcript to an FST graph": [[20, "convert-transcript-to-an-fst-graph"]], "Force aligner": [[20, 
"force-aligner"]], "Segment each word using the computed alignments": [[20, "segment-each-word-using-the-computed-alignments"]], "Summary": [[20, "summary"]], "Huggingface": [[21, "huggingface"]], "Pre-trained models": [[22, "pre-trained-models"]], "Huggingface spaces": [[23, "huggingface-spaces"]], "YouTube Video": [[23, "youtube-video"], [25, "youtube-video"]], "Icefall": [[24, "icefall"]], "Installation": [[25, "installation"]], "(0) Install CUDA toolkit and cuDNN": [[25, "install-cuda-toolkit-and-cudnn"]], "(1) Install torch and torchaudio": [[25, "install-torch-and-torchaudio"]], "(2) Install k2": [[25, "install-k2"]], "(3) Install lhotse": [[25, "install-lhotse"]], "(4) Download icefall": [[25, "download-icefall"]], "Installation example": [[25, "installation-example"]], "(1) Create a virtual environment": [[25, "create-a-virtual-environment"]], "(2) Install CUDA toolkit and cuDNN": [[25, "id1"]], "(3) Install torch and torchaudio": [[25, "id2"]], "(4) Install k2": [[25, "id3"]], "(5) Install lhotse": [[25, "id5"]], "(6) Download icefall": [[25, "id6"]], "Test Your Installation": [[25, "test-your-installation"]], "Export model.state_dict()": [[26, "export-model-state-dict"], [46, "export-model-state-dict"], [48, "export-model-state-dict"], [49, "export-model-state-dict"], [60, "export-model-state-dict"], [61, "export-model-state-dict"], [62, "export-model-state-dict"]], "When to use it": [[26, "when-to-use-it"], [32, "when-to-use-it"], [33, "when-to-use-it"]], "How to export": [[26, "how-to-export"], [32, "how-to-export"], [33, "how-to-export"]], "How to use the exported model": [[26, "how-to-use-the-exported-model"], [32, "how-to-use-the-exported-model"]], "Use the exported model to run decode.py": [[26, "use-the-exported-model-to-run-decode-py"]], "Export to ncnn": [[27, "export-to-ncnn"]], "Export ConvEmformer transducer models to ncnn": [[28, "export-convemformer-transducer-models-to-ncnn"]], "1. Download the pre-trained model": [[28, "download-the-pre-trained-model"], [29, "download-the-pre-trained-model"], [30, "download-the-pre-trained-model"]], "2. Install ncnn and pnnx": [[28, "install-ncnn-and-pnnx"], [29, "install-ncnn-and-pnnx"], [30, "install-ncnn-and-pnnx"]], "3. Export the model via torch.jit.trace()": [[28, "export-the-model-via-torch-jit-trace"], [29, "export-the-model-via-torch-jit-trace"], [30, "export-the-model-via-torch-jit-trace"]], "4. Export torchscript model via pnnx": [[28, "export-torchscript-model-via-pnnx"], [29, "export-torchscript-model-via-pnnx"], [30, "export-torchscript-model-via-pnnx"]], "5. Test the exported models in icefall": [[28, "test-the-exported-models-in-icefall"], [29, "test-the-exported-models-in-icefall"], [30, "test-the-exported-models-in-icefall"]], "6. Modify the exported encoder for sherpa-ncnn": [[28, "modify-the-exported-encoder-for-sherpa-ncnn"], [29, "modify-the-exported-encoder-for-sherpa-ncnn"], [30, "modify-the-exported-encoder-for-sherpa-ncnn"]], "7. 
(Optional) int8 quantization with sherpa-ncnn": [[28, "optional-int8-quantization-with-sherpa-ncnn"], [29, "optional-int8-quantization-with-sherpa-ncnn"]], "Export LSTM transducer models to ncnn": [[29, "export-lstm-transducer-models-to-ncnn"]], "Export streaming Zipformer transducer models to ncnn": [[30, "export-streaming-zipformer-transducer-models-to-ncnn"]], "Export to ONNX": [[31, "export-to-onnx"]], "sherpa-onnx": [[31, "sherpa-onnx"]], "Example": [[31, "example"]], "Download the pre-trained model": [[31, "download-the-pre-trained-model"], [38, "download-the-pre-trained-model"], [40, "download-the-pre-trained-model"], [41, "download-the-pre-trained-model"], [43, "download-the-pre-trained-model"], [47, "download-the-pre-trained-model"], [51, "download-the-pre-trained-model"], [52, "download-the-pre-trained-model"], [54, "download-the-pre-trained-model"]], "Export the model to ONNX": [[31, "export-the-model-to-onnx"]], "Decode sound files with exported ONNX models": [[31, "decode-sound-files-with-exported-onnx-models"]], "Export model with torch.jit.script()": [[32, "export-model-with-torch-jit-script"]], "Export model with torch.jit.trace()": [[33, "export-model-with-torch-jit-trace"]], "How to use the exported models": [[33, "how-to-use-the-exported-models"]], "Model export": [[34, "model-export"]], "Finetune from a pre-trained Zipformer model with adapters": [[35, "finetune-from-a-pre-trained-zipformer-model-with-adapters"]], "Model preparation": [[35, "model-preparation"], [36, "model-preparation"]], "Fine-tune with adapter": [[35, "fine-tune-with-adapter"]], "Export the model": [[35, "export-the-model"]], "Finetune from a supervised pre-trained Zipformer model": [[36, "finetune-from-a-supervised-pre-trained-zipformer-model"]], "Fine-tune": [[36, "fine-tune"]], "Fine-tune a pre-trained model": [[37, "fine-tune-a-pre-trained-model"]], "Table of Contents": [[37, null], [66, null]], "Conformer CTC": [[38, "conformer-ctc"], [43, "conformer-ctc"]], "Configurable options": [[38, "configurable-options"], [41, "configurable-options"], [43, "configurable-options"], [46, "configurable-options"], [48, "configurable-options"], [49, "configurable-options"], [60, "configurable-options"], [61, "configurable-options"], [62, "configurable-options"]], "Pre-configured options": [[38, "pre-configured-options"], [41, "pre-configured-options"], [43, "pre-configured-options"], [46, "pre-configured-options"], [48, "pre-configured-options"], [49, "pre-configured-options"], [60, "pre-configured-options"], [61, "pre-configured-options"], [62, "pre-configured-options"]], "Training logs": [[38, "training-logs"], [40, "training-logs"], [41, "training-logs"], [43, "training-logs"], [46, "training-logs"], [48, "training-logs"], [49, "training-logs"], [60, "training-logs"], [61, "training-logs"], [62, "training-logs"]], "Usage examples": [[38, "usage-examples"], [40, "usage-examples"], [41, "usage-examples"], [43, "usage-examples"]], "Case 1": [[38, "case-1"], [40, "case-1"], [41, "case-1"], [43, "case-1"]], "Case 2": [[38, "case-2"], [40, "case-2"], [41, "case-2"], [43, "case-2"]], "Case 3": [[38, "case-3"], [40, "case-3"], [43, "case-3"]], "Pre-trained Model": [[38, "pre-trained-model"], [40, "pre-trained-model"], [41, "pre-trained-model"], [43, "pre-trained-model"], [47, "pre-trained-model"], [51, "pre-trained-model"], [52, "pre-trained-model"], [54, "pre-trained-model"]], "Install kaldifeat": [[38, "install-kaldifeat"], [40, "install-kaldifeat"], [41, "install-kaldifeat"], [43, "install-kaldifeat"], [47, 
"install-kaldifeat"], [51, "install-kaldifeat"], [52, "install-kaldifeat"]], "Usage": [[38, "usage"], [40, "usage"], [41, "usage"], [43, "usage"]], "CTC decoding": [[38, "ctc-decoding"], [43, "ctc-decoding"], [43, "id2"]], "HLG decoding": [[38, "hlg-decoding"], [38, "id2"], [41, "hlg-decoding"], [43, "hlg-decoding"], [43, "id3"]], "HLG decoding + attention decoder rescoring": [[38, "hlg-decoding-attention-decoder-rescoring"]], "Colab notebook": [[38, "colab-notebook"], [40, "colab-notebook"], [41, "colab-notebook"], [43, "colab-notebook"], [47, "colab-notebook"], [51, "colab-notebook"], [52, "colab-notebook"], [54, "colab-notebook"]], "Deployment with C++": [[38, "deployment-with-c"], [43, "deployment-with-c"]], "aishell": [[39, "aishell"]], "Stateless Transducer": [[40, "stateless-transducer"]], "The Model": [[40, "the-model"]], "The Loss": [[40, "the-loss"]], "Todo": [[40, "id1"]], "Greedy search": [[40, "greedy-search"]], "Beam search": [[40, "beam-search"]], "Modified Beam search": [[40, "modified-beam-search"]], "TDNN-LSTM CTC": [[41, "tdnn-lstm-ctc"]], "Non Streaming ASR": [[42, "non-streaming-asr"]], "HLG decoding + LM rescoring": [[43, "hlg-decoding-lm-rescoring"]], "HLG decoding + LM rescoring + attention decoder rescoring": [[43, "hlg-decoding-lm-rescoring-attention-decoder-rescoring"]], "Compute WER with the pre-trained model": [[43, "compute-wer-with-the-pre-trained-model"]], "HLG decoding + n-gram LM rescoring": [[43, "hlg-decoding-n-gram-lm-rescoring"]], "HLG decoding + n-gram LM rescoring + attention decoder rescoring": [[43, "hlg-decoding-n-gram-lm-rescoring-attention-decoder-rescoring"]], "Distillation with HuBERT": [[44, "distillation-with-hubert"]], "Codebook index preparation": [[44, "codebook-index-preparation"]], "LibriSpeech": [[45, "librispeech"], [59, "librispeech"]], "Pruned transducer statelessX": [[46, "pruned-transducer-statelessx"], [61, "pruned-transducer-statelessx"]], "Usage example": [[46, "usage-example"], [48, "usage-example"], [49, "usage-example"], [60, "usage-example"], [61, "usage-example"], [62, "usage-example"]], "Export Model": [[46, "export-model"], [61, "export-model"], [62, "export-model"]], "Export model using torch.jit.script()": [[46, "export-model-using-torch-jit-script"], [48, "export-model-using-torch-jit-script"], [49, "export-model-using-torch-jit-script"], [61, "export-model-using-torch-jit-script"], [62, "export-model-using-torch-jit-script"]], "Download pretrained models": [[46, "download-pretrained-models"], [48, "download-pretrained-models"], [49, "download-pretrained-models"], [60, "download-pretrained-models"], [61, "download-pretrained-models"], [62, "download-pretrained-models"], [64, "download-pretrained-models"], [65, "download-pretrained-models"]], "Deploy with Sherpa": [[46, "deploy-with-sherpa"], [61, "deploy-with-sherpa"], [62, "deploy-with-sherpa"]], "TDNN-LSTM-CTC": [[47, "tdnn-lstm-ctc"], [52, "tdnn-lstm-ctc"]], "Inference with a pre-trained model": [[47, "inference-with-a-pre-trained-model"], [51, "inference-with-a-pre-trained-model"], [52, "inference-with-a-pre-trained-model"], [54, "inference-with-a-pre-trained-model"]], "Zipformer CTC Blank Skip": [[48, "zipformer-ctc-blank-skip"]], "Export models": [[48, "export-models"], [49, "export-models"], [60, "export-models"], [64, "export-models"], [65, "export-models"]], "Zipformer MMI": [[49, "zipformer-mmi"]], "TIMIT": [[50, "timit"]], "TDNN-LiGRU-CTC": [[51, "tdnn-ligru-ctc"]], "YesNo": [[53, "yesno"]], "TDNN-CTC": [[54, "tdnn-ctc"]], "Download kaldifeat": [[54, 
"download-kaldifeat"]], "RNN-LM": [[55, "rnn-lm"]], "Train an RNN language model": [[56, "train-an-rnn-language-model"]], "Streaming ASR": [[57, "streaming-asr"]], "Streaming Conformer": [[58, "streaming-conformer"]], "Streaming Emformer": [[58, "streaming-emformer"]], "LSTM Transducer": [[60, "lstm-transducer"]], "Which model to use": [[60, "which-model-to-use"]], "Export model using torch.jit.trace()": [[60, "export-model-using-torch-jit-trace"], [62, "export-model-using-torch-jit-trace"]], "Simulate streaming decoding": [[61, "simulate-streaming-decoding"], [62, "simulate-streaming-decoding"]], "Real streaming decoding": [[61, "real-streaming-decoding"], [62, "real-streaming-decoding"]], "Zipformer Transducer": [[62, "zipformer-transducer"]], "TTS": [[63, "tts"]], "VITS-LJSpeech": [[64, "vits-ljspeech"]], "Install extra dependencies": [[64, "install-extra-dependencies"]], "Build Monotonic Alignment Search": [[64, "build-monotonic-alignment-search"], [65, "build-monotonic-alignment-search"]], "Inference": [[64, "inference"], [65, "inference"]], "Usage in sherpa-onnx": [[64, "usage-in-sherpa-onnx"]], "Install sherpa-onnx": [[64, "install-sherpa-onnx"]], "Download lexicon files": [[64, "download-lexicon-files"]], "Run sherpa-onnx": [[64, "run-sherpa-onnx"]], "VITS-VCTK": [[65, "vits-vctk"]], "Recipes": [[66, "recipes"]]}, "indexentries": {}})
\ No newline at end of file