diff --git a/docs/source/conf.py b/docs/source/conf.py
index 599df8b3e..88522ff27 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -33,6 +33,7 @@ release = "0.1"
 # ones.
 extensions = [
     "sphinx_rtd_theme",
+    "sphinx.ext.todo",
 ]
 
 # Add any paths that contain templates here, relative to this directory.
@@ -74,3 +75,5 @@ html_context = {
     "github_version": "master",
     "conf_py_path": "/icefall/docs/source/",
 }
+
+todo_include_todos = True
diff --git a/docs/source/installation/images/README.md b/docs/source/installation/images/README.md
new file mode 100644
index 000000000..97c1e993c
--- /dev/null
+++ b/docs/source/installation/images/README.md
@@ -0,0 +1,4 @@
+
+# Introduction
+
+<https://shields.io/> is used to generate files in this directory.
diff --git a/docs/source/installation/images/k2-gt-v1.9-blueviolet.svg b/docs/source/installation/images/k2-gt-v1.9-blueviolet.svg
new file mode 100644
index 000000000..534b2e534
--- /dev/null
+++ b/docs/source/installation/images/k2-gt-v1.9-blueviolet.svg
@@ -0,0 +1 @@
+k2: >= v1.9k2>= v1.9
\ No newline at end of file
diff --git a/docs/source/installation/images/k2-v1.9-blueviolet.svg b/docs/source/installation/images/k2-v1.9-blueviolet.svg
deleted file mode 100644
index 5a207b370..000000000
--- a/docs/source/installation/images/k2-v1.9-blueviolet.svg
+++ /dev/null
@@ -1 +0,0 @@
-k2: v1.9k2v1.9
\ No newline at end of file
diff --git a/docs/source/installation/images/python-3.6_3.7_3.8_3.9-blue.svg b/docs/source/installation/images/python-3.6_3.7_3.8_3.9-blue.svg
deleted file mode 100644
index befc1e19e..000000000
--- a/docs/source/installation/images/python-3.6_3.7_3.8_3.9-blue.svg
+++ /dev/null
@@ -1 +0,0 @@
-python: 3.6 | 3.7 | 3.8 | 3.9python3.6 | 3.7 | 3.8 | 3.9
diff --git a/docs/source/installation/images/python-gt-v3.6-blue.svg b/docs/source/installation/images/python-gt-v3.6-blue.svg
new file mode 100644
index 000000000..4254dc58a
--- /dev/null
+++ b/docs/source/installation/images/python-gt-v3.6-blue.svg
@@ -0,0 +1 @@
+python: >= 3.6python>= 3.6
\ No newline at end of file
diff --git a/docs/source/installation/images/torch-1.6.0_1.7.0_1.7.1_1.8.0_1.8.1_1.9.0-green.svg b/docs/source/installation/images/torch-1.6.0_1.7.0_1.7.1_1.8.0_1.8.1_1.9.0-green.svg
deleted file mode 100644
index 496e5a9ef..000000000
--- a/docs/source/installation/images/torch-1.6.0_1.7.0_1.7.1_1.8.0_1.8.1_1.9.0-green.svg
+++ /dev/null
@@ -1 +0,0 @@
-torch: 1.6.0 | 1.7.0 | 1.7.1 | 1.8.0 | 1.8.1 | 1.9.0torch1.6.0 | 1.7.0 | 1.7.1 | 1.8.0 | 1.8.1 | 1.9.0
diff --git a/docs/source/installation/images/torch-gt-v1.6.0-green.svg b/docs/source/installation/images/torch-gt-v1.6.0-green.svg
new file mode 100644
index 000000000..d3ece9a17
--- /dev/null
+++ b/docs/source/installation/images/torch-gt-v1.6.0-green.svg
@@ -0,0 +1 @@
+torch: >= 1.6.0torch>= 1.6.0
\ No newline at end of file
diff --git a/docs/source/installation/index.rst b/docs/source/installation/index.rst
index 0f846c77c..a8c3b6865 100644
--- a/docs/source/installation/index.rst
+++ b/docs/source/installation/index.rst
@@ -15,13 +15,13 @@ Installation
 .. |device| image:: ./images/device-CPU_CUDA-orange.svg
    :alt: Supported devices
 
-.. |python_versions| image:: ./images/python-3.6_3.7_3.8_3.9-blue.svg
+.. |python_versions| image:: ./images/python-gt-v3.6-blue.svg
    :alt: Supported python versions
 
-.. |torch_versions| image:: ./images/torch-1.6.0_1.7.0_1.7.1_1.8.0_1.8.1_1.9.0-green.svg
+.. |torch_versions| image:: ./images/torch-gt-v1.6.0-green.svg
    :alt: Supported PyTorch versions
 
-.. |k2_versions| image:: ./images/k2-v1.9-blueviolet.svg
+.. |k2_versions| image:: ./images/k2-gt-v1.9-blueviolet.svg
    :alt: Supported k2 versions
 
 ``icefall`` depends on `k2 <https://github.com/k2-fsa/k2>`_ and
diff --git a/docs/source/recipes/aishell.rst b/docs/source/recipes/aishell.rst
deleted file mode 100644
index 71ccaa1fc..000000000
--- a/docs/source/recipes/aishell.rst
+++ /dev/null
@@ -1,10 +0,0 @@
-Aishell
-=======
-
-We provide the following models for the Aishell dataset:
-
-.. toctree::
-   :maxdepth: 2
-
-   aishell/conformer_ctc
-   aishell/tdnn_lstm_ctc
diff --git a/docs/source/recipes/aishell/images/aishell-transducer_stateless_modified-tensorboard-log.png b/docs/source/recipes/aishell/images/aishell-transducer_stateless_modified-tensorboard-log.png
new file mode 100644
index 000000000..6c84b28f2
Binary files /dev/null and b/docs/source/recipes/aishell/images/aishell-transducer_stateless_modified-tensorboard-log.png differ
diff --git a/docs/source/recipes/aishell/index.rst b/docs/source/recipes/aishell/index.rst
new file mode 100644
index 000000000..d072d6e9c
--- /dev/null
+++ b/docs/source/recipes/aishell/index.rst
@@ -0,0 +1,22 @@
+aishell
+=======
+
+Aishell is an open-source Chinese Mandarin speech corpus published by Beijing
+Shell Shell Technology Co., Ltd.
+
+400 speakers from different accent areas in China were invited to participate in
+the recording, which was conducted in a quiet indoor environment using a
+high-fidelity microphone, and the audio was downsampled to 16 kHz. The manual
+transcription accuracy is above 95%, thanks to professional speech annotation
+and strict quality inspection. The data is free for academic use. We hope to
+provide a moderate amount of data for new researchers in the field of speech
+recognition.
+
+It can be downloaded from `<https://www.openslr.org/33/>`_
+
+.. toctree::
+   :maxdepth: 1
+
+   tdnn_lstm_ctc
+   conformer_ctc
+   stateless_transducer
+
diff --git a/docs/source/recipes/aishell/stateless_transducer.rst b/docs/source/recipes/aishell/stateless_transducer.rst
new file mode 100644
index 000000000..e8137b8c1
--- /dev/null
+++ b/docs/source/recipes/aishell/stateless_transducer.rst
@@ -0,0 +1,714 @@
+Stateless Transducer
+====================
+
+This tutorial shows you how to do transducer training in ``icefall``.
+
+.. HINT::
+
+   We say transducer here, instead of RNN-T or RNN transducer, because,
+   as you will see, there are no RNNs in the model.
+
+.. HINT::
+
+   We assume you have read the page :ref:`install icefall` and have set up
+   the environment for ``icefall``.
+
+.. HINT::
+
+   We recommend that you use one or more GPUs to run this recipe.
+
+In this tutorial, you will learn:
+
+  - (1) What the transducer model looks like
+  - (2) How to prepare data for training and decoding
+  - (3) How to start the training, either with a single GPU or with multiple GPUs
+  - (4) How to do decoding after training, with greedy search, beam search, and **modified beam search**
+  - (5) How to use a pre-trained model provided by us to transcribe sound files
+
+
+The Model
+---------
+
+The transducer model consists of 3 parts:
+
+- **Encoder**: a conformer encoder with the following parameters:
+
+  - Number of heads: 8
+  - Attention dim: 512
+  - Number of layers: 12
+  - Feedforward dim: 2048
+
+- **Decoder**: a stateless model consisting of:
+
+  - An embedding layer with embedding dim 512
+  - A Conv1d layer with a default kernel size of 2 (i.e., it sees 2
+    symbols of left-context by default)
+
+- **Joiner**: It consists of a ``nn.Tanh()`` and a ``nn.Linear()``.
+
+.. CAUTION::
+
+   The decoder is stateless and very simple. It is borrowed from the paper
+   *Rnn-Transducer with Stateless Prediction Network*.
+
+   We make one modification to it: we place a Conv1d layer right after
+   the embedding layer.
+
+When using Chinese characters as modelling units, whose vocabulary size
+is 4336 for this specific dataset,
+the number of parameters of the model is ``87939824``, i.e., about ``88 M``.
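+
+To make the description above concrete, here is a minimal PyTorch sketch of
+such a stateless decoder. The class and argument names are ours and only
+illustrate the idea; they do not necessarily mirror the actual code in
+``transducer_stateless_modified``:
+
+.. code-block:: python
+
+   import torch
+   import torch.nn as nn
+
+   class StatelessDecoder(nn.Module):
+       """A stateless "decoder": an embedding followed by a Conv1d.
+
+       With kernel_size=2 it sees only the current symbol and one symbol
+       of left-context, so no recurrent state is needed.
+       """
+
+       def __init__(self, vocab_size: int, embedding_dim: int = 512,
+                    context_size: int = 2):
+           super().__init__()
+           self.embedding = nn.Embedding(vocab_size, embedding_dim)
+           self.conv = nn.Conv1d(embedding_dim, embedding_dim,
+                                 kernel_size=context_size)
+           self.context_size = context_size
+
+       def forward(self, y: torch.Tensor) -> torch.Tensor:
+           # y: (N, U), symbol IDs
+           emb = self.embedding(y).permute(0, 2, 1)  # (N, C, U)
+           # Pad on the left so the output length is still U.
+           emb = nn.functional.pad(emb, (self.context_size - 1, 0))
+           return self.conv(emb).permute(0, 2, 1)    # (N, U, C)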
+
+The Loss
+--------
+
+We are using `optimized_transducer <https://github.com/csukuangfj/optimized_transducer>`_
+to compute the transducer loss, which removes extra paddings
+in the loss computation to save memory.
+
+.. Hint::
+
+   ``optimized_transducer`` implements the techniques proposed
+   in `Improving RNN Transducer Modeling for End-to-End Speech Recognition <https://arxiv.org/abs/1909.12415>`_ to save memory.
+
+   Furthermore, it supports ``modified transducer``, which limits the maximum
+   number of symbols that can be emitted per frame to 1 and thus simplifies
+   the decoding process significantly. Experimental results also
+   show that it does not degrade the performance.
+
+   See the ``optimized_transducer`` documentation for what exactly
+   modified transducer is.
+
+   Benchmarks there also show that in the unpruned case
+   ``optimized_transducer`` has the advantage of minimizing memory usage.
+
+.. todo::
+
+   Add a tutorial about ``pruned_transducer_stateless``, which uses the k2
+   pruned transducer loss.
+
+.. hint::
+
+   You can use::
+
+     pip install optimized_transducer
+
+   to install ``optimized_transducer``. Refer to its
+   `project page <https://github.com/csukuangfj/optimized_transducer>`_ for
+   other alternatives.
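+
+To make the shapes involved concrete, the sketch below builds the
+``(N, T, U+1, V)`` logits that an unpruned transducer loss consumes and feeds
+them to ``optimized_transducer``. The call follows the torchaudio-compatible
+interface that, as far as we recall, is described in the project's README;
+treat the exact argument names as an assumption and check the README before use:
+
+.. code-block:: python
+
+   import torch
+   import optimized_transducer
+
+   N, T, U, V = 2, 50, 10, 4336  # batch, frames, symbols, vocab size
+
+   encoder_out = torch.randn(N, T, 512)      # from the conformer encoder
+   decoder_out = torch.randn(N, U + 1, 512)  # from the stateless decoder
+
+   # Joiner: broadcast-add, tanh, then project to the vocabulary.
+   joint = torch.tanh(encoder_out.unsqueeze(2) + decoder_out.unsqueeze(1))
+   logits = torch.nn.Linear(512, V)(joint)   # (N, T, U+1, V)
+
+   targets = torch.randint(1, V, (N, U), dtype=torch.int32)
+   logit_lengths = torch.full((N,), T, dtype=torch.int32)
+   target_lengths = torch.full((N,), U, dtype=torch.int32)
+
+   loss = optimized_transducer.transducer_loss(
+       logits=logits,
+       targets=targets,
+       logit_lengths=logit_lengths,
+       target_lengths=target_lengths,
+       blank=0,
+       reduction="mean",
+       one_sym_per_frame=False,  # True would select modified transducer
+   )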
+
+Data Preparation
+----------------
+
+To prepare the data for training, please use the following commands:
+
+.. code-block:: bash
+
+   cd egs/aishell/ASR
+   ./prepare.sh --stop-stage 4
+   ./prepare.sh --stage 6 --stop-stage 6
+
+.. note::
+
+   You can use ``./prepare.sh``, though it will generate FSTs that
+   are not used in transducer training.
+
+When you finish running the script, you will get the following two folders:
+
+  - ``data/fbank``: It saves the pre-computed features
+  - ``data/lang_char``: It contains tokens that will be used in the training
+
+Training
+--------
+
+.. code-block:: bash
+
+   cd egs/aishell/ASR
+   ./transducer_stateless_modified/train.py --help
+
+shows you the training options that can be passed from the commandline.
+The following options are used quite often:
+
+  - ``--exp-dir``
+
+    The experiment folder to save logs and model checkpoints;
+    it defaults to ``./transducer_stateless_modified/exp``.
+
+  - ``--num-epochs``
+
+    It is the number of epochs to train. For instance,
+    ``./transducer_stateless_modified/train.py --num-epochs 30`` trains for 30
+    epochs and generates ``epoch-0.pt``, ``epoch-1.pt``, ..., ``epoch-29.pt``
+    in the folder set by ``--exp-dir``.
+
+  - ``--start-epoch``
+
+    It's used to resume training.
+    ``./transducer_stateless_modified/train.py --start-epoch 10`` loads the
+    checkpoint from ``exp_dir/epoch-9.pt`` and starts
+    training from epoch 10, based on the state from epoch 9.
+
+  - ``--world-size``
+
+    It is used for single-machine multi-GPU DDP training.
+
+      - (a) If it is 1, then no DDP training is used.
+
+      - (b) If it is 2, then GPU 0 and GPU 1 are used for DDP training.
+
+    The following shows some use cases with it.
+
+    **Use case 1**: You have 4 GPUs, but you only want to use GPU 0 and
+    GPU 2 for training. You can do the following:
+
+      .. code-block:: bash
+
+        $ cd egs/aishell/ASR
+        $ export CUDA_VISIBLE_DEVICES="0,2"
+        $ ./transducer_stateless_modified/train.py --world-size 2
+
+    **Use case 2**: You have 4 GPUs and you want to use all of them
+    for training. You can do the following:
+
+      .. code-block:: bash
+
+        $ cd egs/aishell/ASR
+        $ ./transducer_stateless_modified/train.py --world-size 4
+
+    **Use case 3**: You have 4 GPUs but you only want to use GPU 3
+    for training. You can do the following:
+
+      .. code-block:: bash
+
+        $ cd egs/aishell/ASR
+        $ export CUDA_VISIBLE_DEVICES="3"
+        $ ./transducer_stateless_modified/train.py --world-size 1
+
+    .. CAUTION::
+
+      Only single-machine multi-GPU DDP training is implemented at present.
+      There is an on-going PR that adds support for multi-machine multi-GPU
+      DDP training.
+
+  - ``--max-duration``
+
+    It specifies the total number of seconds over all utterances in a
+    batch **before padding**.
+    If you encounter CUDA OOM, please reduce its value. For instance, if
+    you are using a V100 NVIDIA GPU with 32 GB RAM, we recommend
+    setting it to ``300`` when the vocabulary size is 500.
+
+    .. HINT::
+
+      Due to padding, the number of seconds of all utterances in a
+      batch will usually be larger than ``--max-duration``.
+
+      A larger value for ``--max-duration`` may cause OOM during training,
+      while a smaller value may increase the training time. You have to
+      tune it.
+
+  - ``--lr-factor``
+
+    It controls the learning rate. If you use a single GPU for training, you
+    may want to use a small value for it. If you use multiple GPUs for training,
+    you may increase it.
+
+  - ``--context-size``
+
+    It specifies the kernel size of the Conv1d layer in the decoder. The
+    default value 2 means the decoder functions like a tri-gram LM.
+
+  - ``--modified-transducer-prob``
+
+    It specifies the probability of using the modified transducer loss
+    for a given batch.
+    If it is 0, no modified transducer is used; if it is 1, the modified
+    transducer loss is used for all batches. If it is ``p``, modified
+    transducer is applied with probability ``p``; see the sketch after
+    this list for the idea.
+
+There are some training options, e.g.,
+the number of warmup steps,
+that are not passed from the commandline.
+They are pre-configured by the function ``get_params()`` in
+`transducer_stateless_modified/train.py <https://github.com/k2-fsa/icefall/blob/master/egs/aishell/ASR/transducer_stateless_modified/train.py>`_.
+
+If you need to change them, please modify ``./transducer_stateless_modified/train.py`` directly.
+
+.. CAUTION::
+
+   The training set is speed-perturbed with two factors, 0.9 and 1.1, so
+   each epoch actually processes ``3x150 == 450`` hours of data.
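+
+The following is a rough sketch of how such a probability could gate the
+loss mode on a per-batch basis. It is our illustration of the idea, not a
+copy of the actual training code, and the names are hypothetical:
+
+.. code-block:: python
+
+   import random
+
+   def use_modified_transducer(modified_transducer_prob: float) -> bool:
+       """Decide for one batch whether to use modified transducer.
+
+       Returns True, i.e., limit emissions to one symbol per frame,
+       with probability ``modified_transducer_prob``.
+       """
+       return random.random() < modified_transducer_prob
+
+   # Example: with p=0.25, roughly a quarter of the batches use the
+   # modified transducer loss; the rest use the standard one.
+   one_sym_per_frame = use_modified_transducer(0.25)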
+
+Training logs
+~~~~~~~~~~~~~
+
+Training logs and checkpoints are saved in the folder set by ``--exp-dir``
+(it defaults to ``transducer_stateless_modified/exp``). You will find the
+following files in that directory:
+
+  - ``epoch-0.pt``, ``epoch-1.pt``, ...
+
+    These are checkpoint files, containing the model ``state_dict`` and the
+    optimizer ``state_dict``.
+    To resume training from some checkpoint, say ``epoch-10.pt``, you can use:
+
+    .. code-block:: bash
+
+      $ ./transducer_stateless_modified/train.py --start-epoch 11
+
+  - ``tensorboard/``
+
+    This folder contains TensorBoard logs. Training loss, validation loss,
+    learning rate, etc., are recorded in these logs. You can visualize them by:
+
+    .. code-block:: bash
+
+      $ cd transducer_stateless_modified/exp/tensorboard
+      $ tensorboard dev upload --logdir . --name "Aishell transducer training with icefall" --description "Training modified transducer, see https://github.com/k2-fsa/icefall/pull/219"
+
+    It will print something like below:
+
+    .. code-block::
+
+      TensorFlow installation not found - running with reduced feature set.
+      Upload started and will continue reading any new data as it's added to the logdir.
+
+      To stop uploading, press Ctrl-C.
+
+      New experiment created. View your TensorBoard at: https://tensorboard.dev/experiment/laGZ6HrcQxOigbFD5E0Y3Q/
+
+      [2022-03-03T14:29:45] Started scanning logdir.
+      [2022-03-03T14:29:48] Total uploaded: 8477 scalars, 0 tensors, 0 binary objects
+      Listening for new data in logdir...
+
+    Note there is a `URL <https://tensorboard.dev/experiment/laGZ6HrcQxOigbFD5E0Y3Q/>`_ in the
+    above output. Click it and you will see the following screenshot:
+
+    .. figure:: images/aishell-transducer_stateless_modified-tensorboard-log.png
+       :width: 600
+       :alt: TensorBoard screenshot
+       :align: center
+       :target: https://tensorboard.dev/experiment/laGZ6HrcQxOigbFD5E0Y3Q
+
+       TensorBoard screenshot.
+
+  - ``log/log-train-xxxx``
+
+    It is the detailed training log in text format, the same as the one
+    printed to the console during training.
+
+Usage examples
+~~~~~~~~~~~~~~
+
+The following shows typical use cases:
+
+**Case 1**
+^^^^^^^^^^
+
+.. code-block:: bash
+
+   $ cd egs/aishell/ASR
+   $ ./transducer_stateless_modified/train.py --max-duration 250
+
+It uses a ``--max-duration`` of 250 to avoid OOM.
+
+
+**Case 2**
+^^^^^^^^^^
+
+.. code-block:: bash
+
+   $ cd egs/aishell/ASR
+   $ export CUDA_VISIBLE_DEVICES="0,3"
+   $ ./transducer_stateless_modified/train.py --world-size 2
+
+It uses GPU 0 and GPU 3 for DDP training.
+
+**Case 3**
+^^^^^^^^^^
+
+.. code-block:: bash
+
+   $ cd egs/aishell/ASR
+   $ ./transducer_stateless_modified/train.py --num-epochs 10 --start-epoch 3
+
+It loads the checkpoint ``./transducer_stateless_modified/exp/epoch-2.pt`` and
+starts training from epoch 3. Also, it trains for 10 epochs.
+
+Decoding
+--------
+
+The decoding part uses checkpoints saved by the training part, so you have
+to run the training part first.
+
+.. code-block:: bash
+
+   $ cd egs/aishell/ASR
+   $ ./transducer_stateless_modified/decode.py --help
+
+shows the options for decoding.
+
+The commonly used options are:
+
+  - ``--method``
+
+    This specifies the decoding method. Currently, it supports:
+
+    - **greedy_search**. You can provide the commandline option ``--max-sym-per-frame``
+      to limit the maximum number of symbols that can be emitted per frame.
+
+    - **beam_search**. You can provide the commandline option ``--beam-size``.
+
+    - **modified_beam_search**. You can also provide the commandline option ``--beam-size``.
+      To use this method, we assume that you have trained your model with modified transducer,
+      i.e., used the option ``--modified-transducer-prob`` in the training.
+
+    The following command uses greedy search for decoding:
+
+    .. code-block::
+
+      $ cd egs/aishell/ASR
+      $ ./transducer_stateless_modified/decode.py \
+              --epoch 64 \
+              --avg 33 \
+              --exp-dir ./transducer_stateless_modified/exp \
+              --max-duration 100 \
+              --decoding-method greedy_search \
+              --max-sym-per-frame 1
+
+    The following command uses beam search for decoding:
+
+    .. code-block::
+
+      $ cd egs/aishell/ASR
+      $ ./transducer_stateless_modified/decode.py \
+              --epoch 64 \
+              --avg 33 \
+              --exp-dir ./transducer_stateless_modified/exp \
+              --max-duration 100 \
+              --decoding-method beam_search \
+              --beam-size 4
+
+    The following command uses ``modified`` beam search for decoding:
+
+    .. code-block::
+
+      $ cd egs/aishell/ASR
+      $ ./transducer_stateless_modified/decode.py \
+              --epoch 64 \
+              --avg 33 \
+              --exp-dir ./transducer_stateless_modified/exp \
+              --max-duration 100 \
+              --decoding-method modified_beam_search \
+              --beam-size 4
+
+  - ``--max-duration``
+
+    It has the same meaning as the one used in training. A larger
+    value may cause OOM.
+
+  - ``--epoch``
+
+    It specifies the epoch whose checkpoint should be used for decoding.
+
+  - ``--avg``
+
+    It specifies the number of models to average. For instance, if it is 3 and if
+    ``--epoch=10``, then it averages the checkpoints ``epoch-8.pt``, ``epoch-9.pt``,
+    and ``epoch-10.pt``, and the averaged checkpoint is used for decoding;
+    a sketch of what averaging does is shown below.
+
+After decoding, you can find the decoding logs and results in ``exp_dir/log``, e.g.,
+``exp_dir/log/greedy_search``.
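+
+Conceptually, averaging checkpoints means taking the element-wise mean of the
+model parameters over several epochs. ``icefall`` has its own helper for this;
+the following is only a minimal sketch of the idea, assuming the parameters
+are stored under a ``model`` key in each checkpoint:
+
+.. code-block:: python
+
+   import torch
+
+   def average_checkpoints(filenames):
+       """Average the ``model`` state_dicts stored in the given files."""
+       avg = torch.load(filenames[0], map_location="cpu")["model"]
+       for f in filenames[1:]:
+           state = torch.load(f, map_location="cpu")["model"]
+           for k in avg:
+               avg[k] += state[k]
+       for k in avg:
+           if avg[k].is_floating_point():
+               avg[k] /= len(filenames)
+           else:
+               avg[k] //= len(filenames)
+       return avg
+
+   # --epoch 10 --avg 3 corresponds roughly to:
+   avg = average_checkpoints([f"exp/epoch-{i}.pt" for i in (8, 9, 10)])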
+
+Pre-trained Model
+-----------------
+
+We have uploaded a pre-trained model to
+`<https://huggingface.co/csukuangfj/icefall-aishell-transducer-stateless-modified-2022-03-01>`_
+
+We describe below how to use the pre-trained model to transcribe a sound file
+or multiple sound files.
+
+Install kaldifeat
+~~~~~~~~~~~~~~~~~
+
+`kaldifeat <https://github.com/csukuangfj/kaldifeat>`_ is used to
+extract features for a single sound file or multiple sound files
+at the same time.
+
+Please refer to `<https://github.com/csukuangfj/kaldifeat>`_ for installation.
+
+Download the pre-trained model
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The following commands describe how to download the pre-trained model:
+
+.. code-block::
+
+   $ cd egs/aishell/ASR
+   $ mkdir tmp
+   $ cd tmp
+   $ git lfs install
+   $ git clone https://huggingface.co/csukuangfj/icefall-aishell-transducer-stateless-modified-2022-03-01
+
+
+.. CAUTION::
+
+   You have to use ``git lfs`` to download the pre-trained model.
+
+After downloading, you will have the following files:
+
+.. code-block:: bash
+
+   $ cd egs/aishell/ASR
+   $ tree tmp/icefall-aishell-transducer-stateless-modified-2022-03-01
+
+
+.. code-block:: bash
+
+   tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/
+   |-- README.md
+   |-- data
+   |   `-- lang_char
+   |       |-- L.pt
+   |       |-- lexicon.txt
+   |       |-- tokens.txt
+   |       `-- words.txt
+   |-- exp
+   |   `-- pretrained.pt
+   |-- log
+   |   |-- errs-test-beam_4-epoch-64-avg-33-beam-4.txt
+   |   |-- errs-test-greedy_search-epoch-64-avg-33-context-2-max-sym-per-frame-1.txt
+   |   |-- log-decode-epoch-64-avg-33-beam-4-2022-03-02-12-05-03
+   |   |-- log-decode-epoch-64-avg-33-context-2-max-sym-per-frame-1-2022-02-28-18-13-07
+   |   |-- recogs-test-beam_4-epoch-64-avg-33-beam-4.txt
+   |   `-- recogs-test-greedy_search-epoch-64-avg-33-context-2-max-sym-per-frame-1.txt
+   `-- test_wavs
+       |-- BAC009S0764W0121.wav
+       |-- BAC009S0764W0122.wav
+       |-- BAC009S0764W0123.wav
+       `-- transcript.txt
+
+   5 directories, 16 files
+
+
+**File descriptions**:
+
+  - ``data/lang_char``
+
+    It contains language-related files. You can find the vocabulary size in ``tokens.txt``.
+
+  - ``exp/pretrained.pt``
+
+    It contains the pre-trained model parameters, obtained by averaging
+    checkpoints from ``epoch-32.pt`` to ``epoch-64.pt``
+    (see the loading sketch below).
+    Note: We have removed the optimizer ``state_dict`` to reduce the file size.
+
+  - ``log``
+
+    It contains decoding logs and decoded results.
+
+  - ``test_wavs``
+
+    It contains some test sound files from the Aishell ``test`` dataset.
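+
+If you want to inspect the checkpoint yourself, the following sketch shows
+how it could be loaded. We assume here that the parameters are stored under
+the ``model`` key, as is done for the checkpoints saved during training:
+
+.. code-block:: python
+
+   import torch
+
+   ckpt = torch.load(
+       "tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/"
+       "exp/pretrained.pt",
+       map_location="cpu",
+   )
+   state_dict = ckpt["model"]  # assumed key; the optimizer state was removed
+   print(sum(p.numel() for p in state_dict.values()))  # about 88 M parameters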
+
+The information of the test sound files is listed below:
+
+.. code-block:: bash
+
+   $ soxi tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/*.wav
+
+   Input File     : 'tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0121.wav'
+   Channels       : 1
+   Sample Rate    : 16000
+   Precision      : 16-bit
+   Duration       : 00:00:04.20 = 67263 samples ~ 315.295 CDDA sectors
+   File Size      : 135k
+   Bit Rate       : 256k
+   Sample Encoding: 16-bit Signed Integer PCM
+
+
+   Input File     : 'tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0122.wav'
+   Channels       : 1
+   Sample Rate    : 16000
+   Precision      : 16-bit
+   Duration       : 00:00:04.12 = 65840 samples ~ 308.625 CDDA sectors
+   File Size      : 132k
+   Bit Rate       : 256k
+   Sample Encoding: 16-bit Signed Integer PCM
+
+
+   Input File     : 'tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0123.wav'
+   Channels       : 1
+   Sample Rate    : 16000
+   Precision      : 16-bit
+   Duration       : 00:00:04.00 = 64000 samples ~ 300 CDDA sectors
+   File Size      : 128k
+   Bit Rate       : 256k
+   Sample Encoding: 16-bit Signed Integer PCM
+
+   Total Duration of 3 files: 00:00:12.32
+
+Usage
+~~~~~
+
+.. code-block::
+
+   $ cd egs/aishell/ASR
+   $ ./transducer_stateless_modified/pretrained.py --help
+
+displays the help information.
+
+It supports three decoding methods:
+
+  - greedy search
+  - beam search
+  - modified beam search
+
+.. note::
+
+   Modified beam search limits the maximum number of symbols that can be
+   emitted per frame to 1. To use this method, you have to ensure that your
+   model has been trained with the option ``--modified-transducer-prob``.
+   Otherwise, it may give you poor results.
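+
+To see why greedy search is cheap, here is a sketch of its core loop for a
+single utterance. It is a simplified illustration of the algorithm; the
+``model.decoder`` and ``model.joiner`` call signatures are schematic and do
+not mirror the actual code in ``transducer_stateless_modified``:
+
+.. code-block:: python
+
+   import torch
+
+   def greedy_search(model, encoder_out, blank_id=0, max_sym_per_frame=1):
+       """encoder_out: (T, C), the encoder output for one utterance."""
+       context_size = 2
+       hyp = [blank_id] * context_size  # initial (blank) context
+       for t in range(encoder_out.size(0)):
+           for _ in range(max_sym_per_frame):
+               # The stateless decoder sees only the last context_size symbols.
+               decoder_out = model.decoder(torch.tensor([hyp[-context_size:]]))
+               logits = model.joiner(encoder_out[t], decoder_out)
+               y = logits.argmax().item()
+               if y == blank_id:
+                   break        # blank: advance to the next frame
+               hyp.append(y)    # non-blank: emit and stay on this frame
+       return hyp[context_size:]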
+
+Greedy search
+^^^^^^^^^^^^^
+
+The command to run greedy search is given below:
+
+.. code-block:: bash
+
+   $ cd egs/aishell/ASR
+   $ ./transducer_stateless_modified/pretrained.py \
+       --checkpoint ./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/exp/pretrained.pt \
+       --lang-dir ./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/data/lang_char \
+       --method greedy_search \
+       ./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0121.wav \
+       ./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0122.wav \
+       ./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0123.wav
+
+The output is as follows:
+
+.. code-block::
+
+   2022-03-03 15:35:26,531 INFO [pretrained.py:239] device: cuda:0
+   2022-03-03 15:35:26,994 INFO [lexicon.py:176] Loading pre-compiled tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/data/lang_char/Linv.pt
+   2022-03-03 15:35:27,027 INFO [pretrained.py:246] {'feature_dim': 80, 'encoder_out_dim': 512, 'subsampling_factor': 4, 'attention_dim': 512, 'nhead': 8, 'dim_feedforward': 2048, 'num_encoder_layers': 12, 'vgg_frontend': False, 'env_info': {'k2-version': '1.13', 'k2-build-type': 'Release', 'k2-with-cuda': True, 'k2-git-sha1': 'f4fefe4882bc0ae59af951da3f47335d5495ef71', 'k2-git-date': 'Thu Feb 10 15:16:02 2022', 'lhotse-version': '1.0.0.dev+missing.version.file', 'torch-cuda-available': True, 'torch-cuda-version': '10.2', 'python-version': '3.8', 'icefall-git-branch': 'master', 'icefall-git-sha1': '50d2281-clean', 'icefall-git-date': 'Wed Mar 2 16:02:38 2022', 'icefall-path': '/ceph-fj/fangjun/open-source-2/icefall-aishell', 'k2-path': '/ceph-fj/fangjun/open-source-2/k2-multi-datasets/k2/python/k2/__init__.py', 'lhotse-path': '/ceph-fj/fangjun/open-source-2/lhotse-aishell/lhotse/__init__.py', 'hostname': 'de-74279-k2-train-2-0815224919-75d558775b-mmnv8', 'IP address': '10.177.72.138'}, 'sample_rate': 16000, 'checkpoint': './tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/exp/pretrained.pt', 'lang_dir': PosixPath('tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/data/lang_char'), 'method': 'greedy_search', 'sound_files': ['./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0121.wav', './tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0122.wav', './tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0123.wav'], 'beam_size': 4, 'context_size': 2, 'max_sym_per_frame': 3, 'blank_id': 0, 'vocab_size': 4336}
+   2022-03-03 15:35:27,027 INFO [pretrained.py:248] About to create model
+   2022-03-03 15:35:36,878 INFO [pretrained.py:257] Constructing Fbank computer
+   2022-03-03 15:35:36,880 INFO [pretrained.py:267] Reading sound files: ['./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0121.wav', './tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0122.wav', './tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0123.wav']
+   2022-03-03 15:35:36,891 INFO [pretrained.py:273] Decoding started
+   /ceph-fj/fangjun/open-source-2/icefall-aishell/egs/aishell/ASR/transducer_stateless_modified/conformer.py:113: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').
+     lengths = ((x_lens - 1) // 2 - 1) // 2
+   2022-03-03 15:35:37,163 INFO [pretrained.py:320]
+   ./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0121.wav:
+   甚 至 出 现 交 易 几 乎 停 滞 的 情 况
+
+   ./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0122.wav:
+   一 二 线 城 市 虽 然 也 处 于 调 整 中
+
+   ./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0123.wav:
+   但 因 为 聚 集 了 过 多 公 共 资 源
+
+   2022-03-03 15:35:37,163 INFO [pretrained.py:322] Decoding Done
+
+Beam search
+^^^^^^^^^^^
+
+The command to run beam search is given below:
+
+.. code-block:: bash
+
+   $ cd egs/aishell/ASR
+
+   $ ./transducer_stateless_modified/pretrained.py \
+       --checkpoint ./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/exp/pretrained.pt \
+       --lang-dir ./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/data/lang_char \
+       --method beam_search \
+       --beam-size 4 \
+       ./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0121.wav \
+       ./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0122.wav \
+       ./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0123.wav
+
+The output is as follows:
+
+.. code-block::
+
+   2022-03-03 15:39:09,285 INFO [pretrained.py:239] device: cuda:0
+   2022-03-03 15:39:09,708 INFO [lexicon.py:176] Loading pre-compiled tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/data/lang_char/Linv.pt
+   2022-03-03 15:39:09,759 INFO [pretrained.py:246] {'feature_dim': 80, 'encoder_out_dim': 512, 'subsampling_factor': 4, 'attention_dim': 512, 'nhead': 8, 'dim_feedforward': 2048, 'num_encoder_layers': 12, 'vgg_frontend': False, 'env_info': {'k2-version': '1.13', 'k2-build-type': 'Release', 'k2-with-cuda': True, 'k2-git-sha1': 'f4fefe4882bc0ae59af951da3f47335d5495ef71', 'k2-git-date': 'Thu Feb 10 15:16:02 2022', 'lhotse-version': '1.0.0.dev+missing.version.file', 'torch-cuda-available': True, 'torch-cuda-version': '10.2', 'python-version': '3.8', 'icefall-git-branch': 'master', 'icefall-git-sha1': '50d2281-clean', 'icefall-git-date': 'Wed Mar 2 16:02:38 2022', 'icefall-path': '/ceph-fj/fangjun/open-source-2/icefall-aishell', 'k2-path': '/ceph-fj/fangjun/open-source-2/k2-multi-datasets/k2/python/k2/__init__.py', 'lhotse-path': '/ceph-fj/fangjun/open-source-2/lhotse-aishell/lhotse/__init__.py', 'hostname': 'de-74279-k2-train-2-0815224919-75d558775b-mmnv8', 'IP address': '10.177.72.138'}, 'sample_rate': 16000, 'checkpoint': './tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/exp/pretrained.pt', 'lang_dir': PosixPath('tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/data/lang_char'), 'method': 'beam_search', 'sound_files': ['./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0121.wav', './tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0122.wav', './tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0123.wav'], 'beam_size': 4, 'context_size': 2, 'max_sym_per_frame': 3, 'blank_id': 0, 'vocab_size': 4336}
+   2022-03-03 15:39:09,760 INFO [pretrained.py:248] About to create model
+   2022-03-03 15:39:18,919 INFO [pretrained.py:257] Constructing Fbank computer
+   2022-03-03 15:39:18,922 INFO [pretrained.py:267] Reading sound files: ['./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0121.wav', './tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0122.wav', './tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0123.wav']
+   2022-03-03 15:39:18,929 INFO [pretrained.py:273] Decoding started
+   /ceph-fj/fangjun/open-source-2/icefall-aishell/egs/aishell/ASR/transducer_stateless_modified/conformer.py:113: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').
+     lengths = ((x_lens - 1) // 2 - 1) // 2
+   2022-03-03 15:39:21,046 INFO [pretrained.py:320]
+   ./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0121.wav:
+   甚 至 出 现 交 易 几 乎 停 滞 的 情 况
+
+   ./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0122.wav:
+   一 二 线 城 市 虽 然 也 处 于 调 整 中
+
+   ./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0123.wav:
+   但 因 为 聚 集 了 过 多 公 共 资 源
+
+   2022-03-03 15:39:21,047 INFO [pretrained.py:322] Decoding Done
+
+Modified Beam search
+^^^^^^^^^^^^^^^^^^^^
+
+The command to run modified beam search is given below:
+
+.. code-block:: bash
+
+   $ cd egs/aishell/ASR
+
+   $ ./transducer_stateless_modified/pretrained.py \
+       --checkpoint ./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/exp/pretrained.pt \
+       --lang-dir ./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/data/lang_char \
+       --method modified_beam_search \
+       --beam-size 4 \
+       ./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0121.wav \
+       ./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0122.wav \
+       ./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0123.wav
+
+The output is as follows:
+
+.. code-block::
+
+   2022-03-03 15:41:23,319 INFO [pretrained.py:239] device: cuda:0
+   2022-03-03 15:41:23,798 INFO [lexicon.py:176] Loading pre-compiled tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/data/lang_char/Linv.pt
+   2022-03-03 15:41:23,831 INFO [pretrained.py:246] {'feature_dim': 80, 'encoder_out_dim': 512, 'subsampling_factor': 4, 'attention_dim': 512, 'nhead': 8, 'dim_feedforward': 2048, 'num_encoder_layers': 12, 'vgg_frontend': False, 'env_info': {'k2-version': '1.13', 'k2-build-type': 'Release', 'k2-with-cuda': True, 'k2-git-sha1': 'f4fefe4882bc0ae59af951da3f47335d5495ef71', 'k2-git-date': 'Thu Feb 10 15:16:02 2022', 'lhotse-version': '1.0.0.dev+missing.version.file', 'torch-cuda-available': True, 'torch-cuda-version': '10.2', 'python-version': '3.8', 'icefall-git-branch': 'master', 'icefall-git-sha1': '50d2281-clean', 'icefall-git-date': 'Wed Mar 2 16:02:38 2022', 'icefall-path': '/ceph-fj/fangjun/open-source-2/icefall-aishell', 'k2-path': '/ceph-fj/fangjun/open-source-2/k2-multi-datasets/k2/python/k2/__init__.py', 'lhotse-path': '/ceph-fj/fangjun/open-source-2/lhotse-aishell/lhotse/__init__.py', 'hostname': 'de-74279-k2-train-2-0815224919-75d558775b-mmnv8', 'IP address': '10.177.72.138'}, 'sample_rate': 16000, 'checkpoint': './tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/exp/pretrained.pt', 'lang_dir': PosixPath('tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/data/lang_char'), 'method': 'modified_beam_search', 'sound_files': ['./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0121.wav', './tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0122.wav', './tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0123.wav'], 'beam_size': 4, 'context_size': 2, 'max_sym_per_frame': 3, 'blank_id': 0, 'vocab_size': 4336}
+   2022-03-03 15:41:23,831 INFO [pretrained.py:248] About to create model
+   2022-03-03 15:41:32,214 INFO [pretrained.py:257] Constructing Fbank computer
+   2022-03-03 15:41:32,215 INFO [pretrained.py:267] Reading sound files: ['./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0121.wav', './tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0122.wav', './tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0123.wav']
+   2022-03-03 15:41:32,220 INFO [pretrained.py:273] Decoding started
+   /ceph-fj/fangjun/open-source-2/icefall-aishell/egs/aishell/ASR/transducer_stateless_modified/conformer.py:113: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').
+     lengths = ((x_lens - 1) // 2 - 1) // 2
+   /ceph-fj/fangjun/open-source-2/icefall-aishell/egs/aishell/ASR/transducer_stateless_modified/beam_search.py:402: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').
+     topk_hyp_indexes = topk_indexes // logits.size(-1)
+   2022-03-03 15:41:32,583 INFO [pretrained.py:320]
+   ./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0121.wav:
+   甚 至 出 现 交 易 几 乎 停 滞 的 情 况
+
+   ./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0122.wav:
+   一 二 线 城 市 虽 然 也 处 于 调 整 中
+
+   ./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0123.wav:
+   但 因 为 聚 集 了 过 多 公 共 资 源
+
+   2022-03-03 15:41:32,583 INFO [pretrained.py:322] Decoding Done
+
+Colab notebook
+--------------
+
+We provide a colab notebook for this recipe, showing how to use a pre-trained
+model to transcribe sound files.
+
+|aishell asr stateless modified transducer colab notebook|
+
+.. |aishell asr stateless modified transducer colab notebook| image:: https://colab.research.google.com/assets/colab-badge.svg
+   :target: https://colab.research.google.com/drive/12jpTxJB44vzwtcmJl2DTdznW0OawPb9H?usp=sharing
diff --git a/docs/source/recipes/index.rst b/docs/source/recipes/index.rst
index 78e9ea569..9d1d83d29 100644
--- a/docs/source/recipes/index.rst
+++ b/docs/source/recipes/index.rst
@@ -10,12 +10,10 @@ We may add recipes for other tasks as well in the future.
 .. Other recipes are listed in alphabetical order.
 
 .. toctree::
-   :maxdepth: 3
+   :maxdepth: 2
+   :caption: Table of Contents
 
-   yesno
-
-   librispeech
-
-   aishell
-
-   timit
+   aishell/index
+   librispeech/index
+   timit/index
+   yesno/index
diff --git a/docs/source/recipes/librispeech.rst b/docs/source/recipes/librispeech.rst
deleted file mode 100644
index 946b23407..000000000
--- a/docs/source/recipes/librispeech.rst
+++ /dev/null
@@ -1,10 +0,0 @@
-LibriSpeech
-===========
-
-We provide the following models for the LibriSpeech dataset:
-
-.. toctree::
-   :maxdepth: 2
-
-   librispeech/tdnn_lstm_ctc
-   librispeech/conformer_ctc
diff --git a/docs/source/recipes/librispeech/index.rst b/docs/source/recipes/librispeech/index.rst
new file mode 100644
index 000000000..5fa08ab6b
--- /dev/null
+++ b/docs/source/recipes/librispeech/index.rst
@@ -0,0 +1,8 @@
+LibriSpeech
+===========
+
+.. toctree::
+   :maxdepth: 1
+
+   tdnn_lstm_ctc
+   conformer_ctc
diff --git a/docs/source/recipes/timit.rst b/docs/source/recipes/timit.rst
deleted file mode 100644
index b630e2ce4..000000000
--- a/docs/source/recipes/timit.rst
+++ /dev/null
@@ -1,10 +0,0 @@
-TIMIT
-===========
-
-We provide the following models for the TIMIT dataset:
-
-.. toctree::
-   :maxdepth: 2
-
-   timit/tdnn_lstm_ctc
-   timit/tdnn_ligru_ctc
\ No newline at end of file
diff --git a/docs/source/recipes/timit/index.rst b/docs/source/recipes/timit/index.rst
new file mode 100644
index 000000000..17f40cdb7
--- /dev/null
+++ b/docs/source/recipes/timit/index.rst
@@ -0,0 +1,9 @@
+TIMIT
+=====
+
+.. toctree::
+   :maxdepth: 1
+
+   tdnn_ligru_ctc
+   tdnn_lstm_ctc
+
diff --git a/docs/source/recipes/timit/tdnn_ligru_ctc.rst b/docs/source/recipes/timit/tdnn_ligru_ctc.rst
index 30877505f..186420ee7 100644
--- a/docs/source/recipes/timit/tdnn_ligru_ctc.rst
+++ b/docs/source/recipes/timit/tdnn_ligru_ctc.rst
@@ -1,5 +1,5 @@
 TDNN-LiGRU-CTC
-=============
+==============
 
 This tutorial shows you how to run a TDNN-LiGRU-CTC model with the TIMIT dataset.
diff --git a/docs/source/recipes/images/yesno-tdnn-tensorboard-log.png b/docs/source/recipes/yesno/images/tdnn-tensorboard-log.png
similarity index 100%
rename from docs/source/recipes/images/yesno-tdnn-tensorboard-log.png
rename to docs/source/recipes/yesno/images/tdnn-tensorboard-log.png
diff --git a/docs/source/recipes/yesno/index.rst b/docs/source/recipes/yesno/index.rst
new file mode 100644
index 000000000..d68523a97
--- /dev/null
+++ b/docs/source/recipes/yesno/index.rst
@@ -0,0 +1,7 @@
+YesNo
+=====
+
+.. toctree::
+   :maxdepth: 1
+
+   tdnn
diff --git a/docs/source/recipes/yesno.rst b/docs/source/recipes/yesno/tdnn.rst
similarity index 99%
rename from docs/source/recipes/yesno.rst
rename to docs/source/recipes/yesno/tdnn.rst
index cb425ad1d..e8b748e6b 100644
--- a/docs/source/recipes/yesno.rst
+++ b/docs/source/recipes/yesno/tdnn.rst
@@ -1,5 +1,5 @@
-yesno
-=====
+TDNN-CTC
+========
 
 This page shows you how to run the `yesno <https://www.openslr.org/1>`_ recipe. It contains:
@@ -145,7 +145,7 @@ In ``tdnn/exp``, you will find the following files:
    Note there is a URL in the above output, click it and you will
    see the following screenshot:
 
-  .. figure:: images/yesno-tdnn-tensorboard-log.png
+  .. figure:: images/tdnn-tensorboard-log.png
     :width: 600
     :alt: TensorBoard screenshot
    :align: center
diff --git a/egs/aishell/ASR/README.md b/egs/aishell/ASR/README.md
index 1b3c5a2e3..d0a0c1829 100644
--- a/egs/aishell/ASR/README.md
+++ b/egs/aishell/ASR/README.md
@@ -1,7 +1,7 @@
 # Introduction
 
-Please refer to <https://icefall.readthedocs.io/en/latest/recipes/aishell.html>
+Please refer to <https://icefall.readthedocs.io/en/latest/recipes/aishell/index.html>
 for how to run models in this recipe.
 
 # Transducers
diff --git a/egs/librispeech/ASR/README.md b/egs/librispeech/ASR/README.md
index 211a7d120..30b5c5c6f 100644
--- a/egs/librispeech/ASR/README.md
+++ b/egs/librispeech/ASR/README.md
@@ -1,7 +1,7 @@
 # Introduction
 
-Please refer to <https://icefall.readthedocs.io/en/latest/recipes/librispeech.html>
+Please refer to <https://icefall.readthedocs.io/en/latest/recipes/librispeech/index.html>
 for how to run models in this recipe.
 
 # Transducers
diff --git a/egs/timit/ASR/README.md b/egs/timit/ASR/README.md
index 47103bc45..f10bfccfd 100644
--- a/egs/timit/ASR/README.md
+++ b/egs/timit/ASR/README.md
@@ -1,3 +1,3 @@
-Please refer to <https://icefall.readthedocs.io/en/latest/recipes/timit.html>
-for how to run models in this recipe.
\ No newline at end of file
+Please refer to <https://icefall.readthedocs.io/en/latest/recipes/timit/index.html>
+for how to run models in this recipe.
diff --git a/egs/yesno/ASR/README.md b/egs/yesno/ASR/README.md
index 6f57412c0..7257bad9a 100644
--- a/egs/yesno/ASR/README.md
+++ b/egs/yesno/ASR/README.md
@@ -10,5 +10,5 @@ get the following WER:
 ```
 
 Please refer to
-<https://icefall.readthedocs.io/en/latest/recipes/yesno.html>
+<https://icefall.readthedocs.io/en/latest/recipes/yesno/tdnn.html>
 for detailed instructions.