From 334f8bb906e92c1fb94bf697ca6c041e39f44e28 Mon Sep 17 00:00:00 2001
From: Fangjun Kuang
Date: Wed, 2 Mar 2022 18:01:10 +0800
Subject: [PATCH] WIP: Add stateless transducer tutorial.

---
 docs/source/conf.py                           |   3 +
 docs/source/recipes/aishell.rst               |  10 -
 docs/source/recipes/aishell/index.rst         |  22 ++
 .../recipes/aishell/stateless_transducer.rst  | 221 ++++++++++++++++++
 docs/source/recipes/index.rst                 |  14 +-
 docs/source/recipes/librispeech.rst           |  10 -
 docs/source/recipes/librispeech/index.rst     |   8 +
 docs/source/recipes/timit.rst                 |  10 -
 docs/source/recipes/timit/index.rst           |   9 +
 docs/source/recipes/timit/tdnn_ligru_ctc.rst  |   2 +-
 .../images/tdnn-tensorboard-log.png}          | Bin
 docs/source/recipes/yesno/index.rst           |   7 +
 .../recipes/{yesno.rst => yesno/tdnn.rst}     |   6 +-
 13 files changed, 280 insertions(+), 42 deletions(-)
 delete mode 100644 docs/source/recipes/aishell.rst
 create mode 100644 docs/source/recipes/aishell/index.rst
 create mode 100644 docs/source/recipes/aishell/stateless_transducer.rst
 delete mode 100644 docs/source/recipes/librispeech.rst
 create mode 100644 docs/source/recipes/librispeech/index.rst
 delete mode 100644 docs/source/recipes/timit.rst
 create mode 100644 docs/source/recipes/timit/index.rst
 rename docs/source/recipes/{images/yesno-tdnn-tensorboard-log.png => yesno/images/tdnn-tensorboard-log.png} (100%)
 create mode 100644 docs/source/recipes/yesno/index.rst
 rename docs/source/recipes/{yesno.rst => yesno/tdnn.rst} (99%)

diff --git a/docs/source/conf.py b/docs/source/conf.py
index 599df8b3e..88522ff27 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -33,6 +33,7 @@ release = "0.1"
 # ones.
 extensions = [
     "sphinx_rtd_theme",
+    "sphinx.ext.todo",
 ]

 # Add any paths that contain templates here, relative to this directory.
@@ -74,3 +75,5 @@ html_context = {
     "github_version": "master",
     "conf_py_path": "/icefall/docs/source/",
 }
+
+todo_include_todos = True
diff --git a/docs/source/recipes/aishell.rst b/docs/source/recipes/aishell.rst
deleted file mode 100644
index 71ccaa1fc..000000000
--- a/docs/source/recipes/aishell.rst
+++ /dev/null
@@ -1,10 +0,0 @@
-Aishell
-=======
-
-We provide the following models for the Aishell dataset:
-
-.. toctree::
-   :maxdepth: 2
-
-   aishell/conformer_ctc
-   aishell/tdnn_lstm_ctc
diff --git a/docs/source/recipes/aishell/index.rst b/docs/source/recipes/aishell/index.rst
new file mode 100644
index 000000000..d072d6e9c
--- /dev/null
+++ b/docs/source/recipes/aishell/index.rst
@@ -0,0 +1,22 @@
+aishell
+=======
+
+Aishell is an open-source Chinese Mandarin speech corpus published by Beijing
+Shell Shell Technology Co., Ltd.
+
+400 people from different accent areas in China were invited to participate
+in the recording, which was conducted in a quiet indoor environment using a
+high-fidelity microphone; the audio is downsampled to 16 kHz. Through
+professional speech annotation and strict quality inspection, the manual
+transcription accuracy is above 95%. The data is free for academic use. We
+hope to provide a moderate amount of data for new researchers in the field
+of speech recognition.
+
+It can be downloaded from `<https://www.openslr.org/33/>`_.
+
+.. toctree::
+   :maxdepth: 1
+
+   tdnn_lstm_ctc
+   conformer_ctc
+   stateless_transducer
diff --git a/docs/source/recipes/aishell/stateless_transducer.rst b/docs/source/recipes/aishell/stateless_transducer.rst
new file mode 100644
index 000000000..46b52b257
--- /dev/null
+++ b/docs/source/recipes/aishell/stateless_transducer.rst
@@ -0,0 +1,221 @@
+Stateless Transducer
+====================
+
+This tutorial shows you how to do transducer training in ``icefall``.
+
+.. HINT::
+
+   Instead of the terms RNN-T or RNN transducer, we simply say
+   transducer here. As you will see, there are no RNNs in the model.
+
+The Model
+---------
+
+The transducer model consists of 3 parts:
+
+- **Encoder**: a Conformer encoder with the following parameters:
+
+  - Number of heads: 8
+  - Attention dim: 512
+  - Number of layers: 12
+  - Feedforward dim: 2048
+
+- **Decoder**: a stateless network consisting of:
+
+  - an embedding layer with embedding dim 512
+  - a Conv1d layer with a default kernel size of 2
+
+- **Joiner**: it consists of an ``nn.Tanh()`` and an ``nn.Linear()``.
+
+.. Caution::
+
+   The decoder is stateless and very simple. It is borrowed from
+   `RNN-Transducer with Stateless Prediction Network
+   <https://ieeexplore.ieee.org/document/9054419>`_.
+
+   We make one modification to it: we place a Conv1d layer right after
+   the embedding layer.
+
+With Chinese characters as the modeling units, whose vocabulary size is
+4335 for this specific dataset, the model has ``87939824`` parameters,
+i.e., about ``88 M``.
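+
+The following is a minimal PyTorch sketch of the decoder and joiner
+described above. It is for illustration only: the class and parameter
+names are made up for this tutorial, and it is not the actual icefall
+code (see ``egs/aishell/ASR/transducer_stateless_modified`` for the
+real implementation).
+
+.. code-block:: python
+
+   import torch
+   import torch.nn as nn
+
+   class StatelessDecoder(nn.Module):
+       """An embedding layer followed by a Conv1d layer.
+
+       There is no recurrent state: the output depends only on the
+       last ``context_size`` symbols.
+       """
+
+       def __init__(self, vocab_size: int, embedding_dim: int = 512,
+                    context_size: int = 2):
+           super().__init__()
+           self.context_size = context_size
+           self.embedding = nn.Embedding(vocab_size, embedding_dim)
+           self.conv = nn.Conv1d(embedding_dim, embedding_dim,
+                                 kernel_size=context_size)
+
+       def forward(self, y: torch.Tensor) -> torch.Tensor:
+           # y: (batch, num_symbols), a tensor of symbol IDs
+           embed = self.embedding(y).permute(0, 2, 1)  # (B, dim, U)
+           # Pad on the left so the output length equals the input length
+           embed = nn.functional.pad(embed, pad=(self.context_size - 1, 0))
+           return self.conv(embed).permute(0, 2, 1)    # (B, U, dim)
+
+   class Joiner(nn.Module):
+       """Combine encoder and decoder outputs with tanh + a linear layer."""
+
+       def __init__(self, input_dim: int, vocab_size: int):
+           super().__init__()
+           self.output_linear = nn.Linear(input_dim, vocab_size)
+
+       def forward(self, encoder_out: torch.Tensor,
+                   decoder_out: torch.Tensor) -> torch.Tensor:
+           # encoder_out: (B, T, dim); decoder_out: (B, U, dim)
+           logit = encoder_out.unsqueeze(2) + decoder_out.unsqueeze(1)
+           return self.output_linear(torch.tanh(logit))  # (B, T, U, vocab)
+
+Because there is no recurrent state, the model is called *stateless*:
+its prediction depends only on the last ``context_size`` symbols.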
+
+The Loss
+--------
+
+We use `optimized_transducer
+<https://github.com/csukuangfj/optimized_transducer>`_
+to compute the transducer loss. It removes extra padding
+in the loss computation to save memory.
+
+.. Hint::
+
+   ``optimized_transducer`` implements the techniques proposed in
+   `Improving RNN Transducer Modeling for End-to-End Speech Recognition
+   <https://arxiv.org/abs/1909.12415>`_ to save memory.
+
+   Furthermore, it supports ``modified transducer``, which limits the
+   maximum number of symbols that can be emitted per frame to 1 and thus
+   simplifies the decoding process significantly. Experimental results
+   also show that it does not degrade the performance.
+
+   See `<https://github.com/csukuangfj/optimized_transducer>`_
+   for what exactly modified transducer is.
+
+   `<https://github.com/csukuangfj/transducer-loss-benchmarking>`_ shows
+   that, in the unpruned case, ``optimized_transducer`` has an advantage
+   in minimizing memory usage.
+
+.. todo::
+
+   Add a tutorial about ``pruned_transducer_stateless``, which uses the
+   pruned transducer loss from k2.
+
+.. hint::
+
+   You can use::
+
+      pip install optimized_transducer
+
+   to install ``optimized_transducer``. Refer to
+   `<https://github.com/csukuangfj/optimized_transducer>`_ for other
+   installation methods.
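+
+Below is a sketch of how the loss might be computed with
+``optimized_transducer``, whose interface is modeled after
+``torchaudio.functional.rnnt_loss()``. The argument names, the
+``one_sym_per_frame`` flag, and the unpadded logit layout are
+assumptions based on the project's README; verify them against the
+README of the version you install.
+
+.. code-block:: python
+
+   import torch
+   import optimized_transducer
+
+   # Two utterances with T = [3, 2] encoder frames and U = [2, 1] symbols.
+   logit_lengths = torch.tensor([3, 2], dtype=torch.int32)
+   target_lengths = torch.tensor([2, 1], dtype=torch.int32)
+   targets = torch.tensor([[1, 2], [3, 0]], dtype=torch.int32)  # row 2 padded
+
+   vocab_size = 5
+   # To avoid padding, the joiner output is flattened: for utterance i,
+   # a block of shape (T_i, U_i + 1, vocab_size); the blocks are then
+   # concatenated along dim 0.
+   sum_all_TU = 3 * (2 + 1) + 2 * (1 + 1)  # == 13
+   logits = torch.randn(sum_all_TU, vocab_size, requires_grad=True)
+
+   loss = optimized_transducer.transducer_loss(
+       logits=logits,
+       targets=targets,
+       logit_lengths=logit_lengths,
+       target_lengths=target_lengths,
+       blank=0,
+       reduction="mean",
+       one_sym_per_frame=False,  # True selects the modified transducer loss
+       from_log_softmax=False,   # pass raw logits; softmax happens inside
+   )
+   loss.backward()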
+
+Data Preparation
+----------------
+
+To prepare the data for training, run the following commands:
+
+.. code-block:: bash
+
+   cd egs/aishell/ASR
+   ./prepare.sh --stop-stage 4
+   ./prepare.sh --stage 6 --stop-stage 6
+
+.. note::
+
+   You can also run ``./prepare.sh`` without arguments, though it will
+   generate FSTs that are not used in transducer training.
+
+When the script finishes, you will get the following two folders:
+
+- ``data/fbank``: it contains the pre-computed features
+- ``data/lang_char``: it contains tokens that will be used in training
+
+Training
+--------
+
+.. code-block:: bash
+
+   cd egs/aishell/ASR
+   ./transducer_stateless_modified/train.py --help
+
+shows the training options that can be passed from the command line.
+The following options are used quite often:
+
+- ``--exp-dir``
+
+  The experiment directory in which to save logs and model checkpoints.
+  It defaults to ``./transducer_stateless_modified/exp``.
+
+- ``--num-epochs``
+
+  The number of epochs to train. For instance,
+  ``./transducer_stateless_modified/train.py --num-epochs 30`` trains for
+  30 epochs and generates ``epoch-0.pt``, ``epoch-1.pt``, ...,
+  ``epoch-29.pt`` in the folder set by ``--exp-dir``.
+
+- ``--start-epoch``
+
+  It is used to resume training.
+  ``./transducer_stateless_modified/train.py --start-epoch 10`` loads the
+  checkpoint from ``exp_dir/epoch-9.pt`` and starts training from
+  epoch 10, based on the state from epoch 9.
+
+- ``--world-size``
+
+  It is used for multi-GPU single-machine DDP training.
+
+  - (a) If it is 1, then no DDP training is used.
+
+  - (b) If it is 2, then GPU 0 and GPU 1 are used for DDP training.
+
+  The following shows some use cases with it.
+
+  **Use case 1**: You have 4 GPUs, but you only want to use GPU 0 and
+  GPU 2 for training. You can do the following:
+
+  .. code-block:: bash
+
+     $ cd egs/aishell/ASR
+     $ export CUDA_VISIBLE_DEVICES="0,2"
+     $ ./transducer_stateless_modified/train.py --world-size 2
+
+  **Use case 2**: You have 4 GPUs and you want to use all of them
+  for training. You can do the following:
+
+  .. code-block:: bash
+
+     $ cd egs/aishell/ASR
+     $ ./transducer_stateless_modified/train.py --world-size 4
+
+  **Use case 3**: You have 4 GPUs but you only want to use GPU 3
+  for training. You can do the following:
+
+  .. code-block:: bash
+
+     $ cd egs/aishell/ASR
+     $ export CUDA_VISIBLE_DEVICES="3"
+     $ ./transducer_stateless_modified/train.py --world-size 1
+
+  .. CAUTION::
+
+     Only multi-GPU single-machine DDP training is implemented at
+     present. There is an ongoing PR that adds support for multi-GPU
+     multi-machine DDP training.
+
+- ``--max-duration``
+
+  It specifies the total number of seconds over all utterances in a
+  batch, **before padding**. If you encounter CUDA OOM, reduce it. For
+  instance, if you are using a V100 NVIDIA GPU with 32 GB RAM, we
+  recommend setting it to ``300``.
+
+  .. HINT::
+
+     Due to padding, the actual number of seconds of all utterances in
+     a batch will usually be larger than ``--max-duration``.
+
+     A larger value for ``--max-duration`` may cause OOM during
+     training, while a smaller value may increase the training time.
+     You have to tune it.
+
+- ``--lr-factor``
+
+  It controls the learning rate. If you train on a single GPU, you may
+  want to use a small value; if you train on multiple GPUs, you may
+  increase it.
+
+- ``--context-size``
+
+  It specifies the kernel size of the Conv1d layer in the decoder. The
+  default value 2 means the decoder conditions on the previous two
+  symbols, so it functions like a tri-gram LM; see the sketch after
+  this list.
+
+- ``--modified-transducer-prob``
+
+  It specifies the probability of using the modified transducer loss
+  for a batch. If it is 0, the modified transducer is never used; if it
+  is 1, it is used for every batch; if it is ``p``, each batch uses it
+  with probability ``p``.
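+
+To make ``--context-size`` concrete, here is a small, self-contained
+sketch (illustrative only, not icefall code). With kernel size 2, the
+Conv1d produces exactly one prediction state from two symbols of
+history, so the next-symbol distribution depends on a 2-symbol history
+only, like a tri-gram LM.
+
+.. code-block:: python
+
+   import torch
+   import torch.nn as nn
+
+   vocab_size, dim, context_size = 4335, 512, 2
+   embedding = nn.Embedding(vocab_size, dim)
+   conv = nn.Conv1d(dim, dim, kernel_size=context_size)
+
+   history = torch.tensor([[100, 200]])     # the last 2 decoded symbols
+   h = embedding(history).permute(0, 2, 1)  # (1, dim, 2)
+   decoder_out = conv(h)                    # (1, dim, 1)
+   print(decoder_out.shape)                 # torch.Size([1, 512, 1])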
+
+Some training options, e.g., the number of warmup steps, are not passed
+from the command line; they are pre-configured by the function
+``get_params()`` in ``transducer_stateless_modified/train.py``.
+If you need to change them, modify that file directly.
+
+.. CAUTION::
+
+   The training set is speed-perturbed with two extra factors: 0.9 and
+   1.1. Together with the original recordings, each epoch therefore
+   processes ``3 x 150 == 450`` hours of data.
diff --git a/docs/source/recipes/index.rst b/docs/source/recipes/index.rst
index 78e9ea569..9d1d83d29 100644
--- a/docs/source/recipes/index.rst
+++ b/docs/source/recipes/index.rst
@@ -10,12 +10,10 @@ We may add recipes for other tasks as well in the future.

 .. Other recipes are listed in alphabetical order.

 .. toctree::
-   :maxdepth: 3
+   :maxdepth: 2
+   :caption: Table of Contents

-   yesno
-
-   librispeech
-
-   aishell
-
-   timit
+   aishell/index
+   librispeech/index
+   timit/index
+   yesno/index
diff --git a/docs/source/recipes/librispeech.rst b/docs/source/recipes/librispeech.rst
deleted file mode 100644
index 946b23407..000000000
--- a/docs/source/recipes/librispeech.rst
+++ /dev/null
@@ -1,10 +0,0 @@
-LibriSpeech
-===========
-
-We provide the following models for the LibriSpeech dataset:
-
-.. toctree::
-   :maxdepth: 2
-
-   librispeech/tdnn_lstm_ctc
-   librispeech/conformer_ctc
diff --git a/docs/source/recipes/librispeech/index.rst b/docs/source/recipes/librispeech/index.rst
new file mode 100644
index 000000000..5fa08ab6b
--- /dev/null
+++ b/docs/source/recipes/librispeech/index.rst
@@ -0,0 +1,8 @@
+LibriSpeech
+===========
+
+.. toctree::
+   :maxdepth: 1
+
+   tdnn_lstm_ctc
+   conformer_ctc
diff --git a/docs/source/recipes/timit.rst b/docs/source/recipes/timit.rst
deleted file mode 100644
index b630e2ce4..000000000
--- a/docs/source/recipes/timit.rst
+++ /dev/null
@@ -1,10 +0,0 @@
-TIMIT
-===========
-
-We provide the following models for the TIMIT dataset:
-
-.. toctree::
-   :maxdepth: 2
-
-   timit/tdnn_lstm_ctc
-   timit/tdnn_ligru_ctc
\ No newline at end of file
diff --git a/docs/source/recipes/timit/index.rst b/docs/source/recipes/timit/index.rst
new file mode 100644
index 000000000..17f40cdb7
--- /dev/null
+++ b/docs/source/recipes/timit/index.rst
@@ -0,0 +1,9 @@
+TIMIT
+=====
+
+.. toctree::
+   :maxdepth: 1
+
+   tdnn_ligru_ctc
+   tdnn_lstm_ctc
+
diff --git a/docs/source/recipes/timit/tdnn_ligru_ctc.rst b/docs/source/recipes/timit/tdnn_ligru_ctc.rst
index 30877505f..186420ee7 100644
--- a/docs/source/recipes/timit/tdnn_ligru_ctc.rst
+++ b/docs/source/recipes/timit/tdnn_ligru_ctc.rst
@@ -1,5 +1,5 @@
 TDNN-LiGRU-CTC
-=============
+==============

 This tutorial shows you how to run a TDNN-LiGRU-CTC model with the
 `TIMIT <https://catalog.ldc.upenn.edu/LDC93S1>`_ dataset.
diff --git a/docs/source/recipes/images/yesno-tdnn-tensorboard-log.png b/docs/source/recipes/yesno/images/tdnn-tensorboard-log.png
similarity index 100%
rename from docs/source/recipes/images/yesno-tdnn-tensorboard-log.png
rename to docs/source/recipes/yesno/images/tdnn-tensorboard-log.png
diff --git a/docs/source/recipes/yesno/index.rst b/docs/source/recipes/yesno/index.rst
new file mode 100644
index 000000000..d68523a97
--- /dev/null
+++ b/docs/source/recipes/yesno/index.rst
@@ -0,0 +1,7 @@
+YesNo
+=====
+
+.. toctree::
+   :maxdepth: 1
+
+   tdnn
diff --git a/docs/source/recipes/yesno.rst b/docs/source/recipes/yesno/tdnn.rst
similarity index 99%
rename from docs/source/recipes/yesno.rst
rename to docs/source/recipes/yesno/tdnn.rst
index cb425ad1d..e8b748e6b 100644
--- a/docs/source/recipes/yesno.rst
+++ b/docs/source/recipes/yesno/tdnn.rst
@@ -1,5 +1,5 @@
-yesno
-=====
+TDNN-CTC
+========

 This page shows you how to run the `yesno <https://www.openslr.org/1>`_ recipe. It contains:
@@ -145,7 +145,7 @@ In ``tdnn/exp``, you will find the following files:

 Note there is a URL in the above output; click it and you will see the
 following screenshot:

-  .. figure:: images/yesno-tdnn-tensorboard-log.png
+  .. figure:: images/tdnn-tensorboard-log.png
      :width: 600
      :alt: TensorBoard screenshot
      :align: center