diff --git a/docs/source/conf.py b/docs/source/conf.py index bf231e3c1..5a534e126 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -95,4 +95,7 @@ rst_epilog = """ .. _k2: https://github.com/k2-fsa/k2 .. _lhotse: https://github.com/lhotse-speech/lhotse .. _yesno: https://www.openslr.org/1/ +.. _Next-gen Kaldi: https://github.com/k2-fsa +.. _Kaldi: https://github.com/kaldi-asr/kaldi +.. _lilcom: https://github.com/danpovey/lilcom """ diff --git a/docs/source/for-dummies/data-preparation.rst b/docs/source/for-dummies/data-preparation.rst new file mode 100644 index 000000000..844ed3986 --- /dev/null +++ b/docs/source/for-dummies/data-preparation.rst @@ -0,0 +1,167 @@ +Data Preparation +================ + +The first step is to prepare the data for training. We have already provided +`prepare.sh `_ +that would prepare everything required for training. + +.. code-block:: + + cd /tmp/icefall + export PYTHONPATH=/tmp/icefall:$PYTHONPATH + cd egs/yesno/ASR + + ./prepare.sh + +Note that in each recipe from `icefall`_, there exists a file ``prepare.sh``, +which you should run before you run anything else. + +That is all you need for data preparation. + +For the more curious +-------------------- + +If you are wondering how to prepare your own dataset, please refer to the following +URLs for more details: + + - ``_ + + It contains recipes for a variety of dataset. If you want to add your own + dataset, please read recipes in this folder first. + + - ``_ + + The `yesno`_ recipe in `lhotse`_. + +If you already have a `Kaldi`_ dataset directory, which contains files like +``wav.scp``, ``feats.scp``, then you can refer to ``_. + +A quick look to the generated files +----------------------------------- + +``./prepare.sh`` puts generated files into two directories: + + - ``download`` + - ``data`` + +download +^^^^^^^^ + +The ``download`` directory contains downloaded dataset files: + +.. code-block:: bas + + tree -L 1 ./download/ + + ./download/ + |-- waves_yesno + `-- waves_yesno.tar.gz + +.. hint:: + + Please refer to ``_ + for how the data is downloaded and extracted. + +data +^^^^ + +.. code-block:: bash + + tree ./data/ + + ./data/ + |-- fbank + | |-- yesno_cuts_test.jsonl.gz + | |-- yesno_cuts_train.jsonl.gz + | |-- yesno_feats_test.lca + | `-- yesno_feats_train.lca + |-- lang_phone + | |-- HLG.pt + | |-- L.pt + | |-- L_disambig.pt + | |-- Linv.pt + | |-- lexicon.txt + | |-- lexicon_disambig.txt + | |-- tokens.txt + | `-- words.txt + |-- lm + | |-- G.arpa + | `-- G.fst.txt + `-- manifests + |-- yesno_recordings_test.jsonl.gz + |-- yesno_recordings_train.jsonl.gz + |-- yesno_supervisions_test.jsonl.gz + `-- yesno_supervisions_train.jsonl.gz + + 4 directories, 18 files + +**data/manifests**: + + This directory contains manifests. There are used to generate files in + ``data/fbank``. + + To give you an idea of what it contains, we examine the first few lines of + the manifests related to the ``train`` dataset. + + .. code-block:: bash + + cd data/manifests + gunzip -c yesno_recordings_train.jsonl.gz | head -n 3 + + The output is given below: + + .. code-block:: bash + + {"id": "0_0_0_0_1_1_1_1", "sources": [{"type": "file", "channels": [0], "source": "/tmp/icefall/egs/yesno/ASR/download/waves_yesno/0_0_0_0_1_1_1_1.wav"}], "sampling_rate": 8000, "num_samples": 50800, "duration": 6.35, "channel_ids": [0]} + {"id": "0_0_0_1_0_1_1_0", "sources": [{"type": "file", "channels": [0], "source": "/tmp/icefall/egs/yesno/ASR/download/waves_yesno/0_0_0_1_0_1_1_0.wav"}], "sampling_rate": 8000, "num_samples": 48880, "duration": 6.11, "channel_ids": [0]} + {"id": "0_0_1_0_0_1_1_0", "sources": [{"type": "file", "channels": [0], "source": "/tmp/icefall/egs/yesno/ASR/download/waves_yesno/0_0_1_0_0_1_1_0.wav"}], "sampling_rate": 8000, "num_samples": 48160, "duration": 6.02, "channel_ids": [0]} + + Please refer to ``_ + for the meaning of each field per line. + + .. code-block:: bash + + gunzip -c yesno_supervisions_train.jsonl.gz | head -n 3 + + The output is given below: + + .. code-block:: bash + + {"id": "0_0_0_0_1_1_1_1", "recording_id": "0_0_0_0_1_1_1_1", "start": 0.0, "duration": 6.35, "channel": 0, "text": "NO NO NO NO YES YES YES YES", "language": "Hebrew"} + {"id": "0_0_0_1_0_1_1_0", "recording_id": "0_0_0_1_0_1_1_0", "start": 0.0, "duration": 6.11, "channel": 0, "text": "NO NO NO YES NO YES YES NO", "language": "Hebrew"} + {"id": "0_0_1_0_0_1_1_0", "recording_id": "0_0_1_0_0_1_1_0", "start": 0.0, "duration": 6.02, "channel": 0, "text": "NO NO YES NO NO YES YES NO", "language": "Hebrew"} + + Please refer to ``_ + for the meaning of each field per line. + +**data/fbank**: + + This directory contains everything from ``data/manifests``. Furthermore, it also contains features + for training. + + ``data/fbank/yesno_feats_train.lca`` contains the features for the train dataset. + Features are compressed using `lilcom`_. + + ``data/fbank/yesno_cuts_train.jsonl.gz`` stores the `CutSet `_, + which stores `RecordingSet `_, + `SupervisionSet `_, + and `FeatureSet `_. + + To give you an idea about what it looks like, we can run the following command: + + .. code-block:: bash + + cd data/fbank + + gunzip -c yesno_cuts_train.jsonl.gz | head -n 3 + + The output is given below: + + .. code-block:: bash + + {"id": "0_0_0_0_1_1_1_1-0", "start": 0, "duration": 6.35, "channel": 0, "supervisions": [{"id": "0_0_0_0_1_1_1_1", "recording_id": "0_0_0_0_1_1_1_1", "start": 0.0, "duration": 6.35, "channel": 0, "text": "NO NO NO NO YES YES YES YES", "language": "Hebrew"}], "features": {"type": "kaldi-fbank", "num_frames": 635, "num_features": 23, "frame_shift": 0.01, "sampling_rate": 8000, "start": 0, "duration": 6.35, "storage_type": "lilcom_chunky", "storage_path": "data/fbank/yesno_feats_train.lca", "storage_key": "0,13000,3570", "channels": 0}, "recording": {"id": "0_0_0_0_1_1_1_1", "sources": [{"type": "file", "channels": [0], "source": "/tmp/icefall/egs/yesno/ASR/download/waves_yesno/0_0_0_0_1_1_1_1.wav"}], "sampling_rate": 8000, "num_samples": 50800, "duration": 6.35, "channel_ids": [0]}, "type": "MonoCut"} + {"id": "0_0_0_1_0_1_1_0-1", "start": 0, "duration": 6.11, "channel": 0, "supervisions": [{"id": "0_0_0_1_0_1_1_0", "recording_id": "0_0_0_1_0_1_1_0", "start": 0.0, "duration": 6.11, "channel": 0, "text": "NO NO NO YES NO YES YES NO", "language": "Hebrew"}], "features": {"type": "kaldi-fbank", "num_frames": 611, "num_features": 23, "frame_shift": 0.01, "sampling_rate": 8000, "start": 0, "duration": 6.11, "storage_type": "lilcom_chunky", "storage_path": "data/fbank/yesno_feats_train.lca", "storage_key": "16570,12964,2929", "channels": 0}, "recording": {"id": "0_0_0_1_0_1_1_0", "sources": [{"type": "file", "channels": [0], "source": "/tmp/icefall/egs/yesno/ASR/download/waves_yesno/0_0_0_1_0_1_1_0.wav"}], "sampling_rate": 8000, "num_samples": 48880, "duration": 6.11, "channel_ids": [0]}, "type": "MonoCut"} + {"id": "0_0_1_0_0_1_1_0-2", "start": 0, "duration": 6.02, "channel": 0, "supervisions": [{"id": "0_0_1_0_0_1_1_0", "recording_id": "0_0_1_0_0_1_1_0", "start": 0.0, "duration": 6.02, "channel": 0, "text": "NO NO YES NO NO YES YES NO", "language": "Hebrew"}], "features": {"type": "kaldi-fbank", "num_frames": 602, "num_features": 23, "frame_shift": 0.01, "sampling_rate": 8000, "start": 0, "duration": 6.02, "storage_type": "lilcom_chunky", "storage_path": "data/fbank/yesno_feats_train.lca", "storage_key": "32463,12936,2696", "channels": 0}, "recording": {"id": "0_0_1_0_0_1_1_0", "sources": [{"type": "file", "channels": [0], "source": "/tmp/icefall/egs/yesno/ASR/download/waves_yesno/0_0_1_0_0_1_1_0.wav"}], "sampling_rate": 8000, "num_samples": 48160, "duration": 6.02, "channel_ids": [0]}, "type": "MonoCut"} + + Note that ``yesno_cuts_train.jsonl.gz`` only stores the information about how to read the features. + The actual features are stored separately in ``data/fbank/yesno_feats_train.lca``. diff --git a/docs/source/for-dummies/environment-setup.rst b/docs/source/for-dummies/environment-setup.rst new file mode 100644 index 000000000..b7fb97ec2 --- /dev/null +++ b/docs/source/for-dummies/environment-setup.rst @@ -0,0 +1,98 @@ +Environment setup +================= + +We will create an environment for `Next-gen Kaldi`_ that runs on ``CPU`` +in this tutorial. + +.. note:: + + Since the `yesno`_ dataset used in this tutorial is very tiny, training on + ``CPU`` works very well for it. + + If your dataset is very large, e.g., hundreds or thousands of hours of + training data, please follow :ref:`install icefall` to install `icefall`_ + that works with ``GPU``. + + +Create a virtual environment +---------------------------- + +.. code-block:: bash + + virtualenv -p python3 /tmp/icefall_env + +The above command creates a virtual environment in the directory ``/tmp/icefall_env``. +You can select any directory you want. + +The output of the above command is given below: + +.. code-block:: bash + + Already using interpreter /usr/bin/python3 + Using base prefix '/usr' + New python executable in /tmp/icefall_env/bin/python3 + Also creating executable in /tmp/icefall_env/bin/python + Installing setuptools, pkg_resources, pip, wheel...done. + +Now we can activate the environment using: + +.. code-block:: bash + + source /tmp/icefall_env/bin/activate + +Install dependencies +-------------------- + +.. warning:: + + Remeber to activate your virtual environment before you continue! + +After activating the virtual environment, we can use the following command +to install dependencies of `icefall`_: + +.. hint:: + + Remeber that we will run this tutorial on ``CPU``, so we install + dependencies required only by running on ``CPU``. + +.. code-block:: bash + + # Caution: Installation order matters! + + # We use torch 2.0.0 and torchaduio 2.0.0 in this tutorial. + # Other versions should also work. + + pip install torch==2.0.0+cpu torchaudio==2.0.0+cpu -f https://download.pytorch.org/whl/torch_stable.html + + # Now install k2 + # Please refer to https://k2-fsa.github.io/k2/installation/from_wheels.html#linux-cpu-example + + pip install k2==1.24.3.dev20230726+cpu.torch2.0.0 -f https://k2-fsa.github.io/k2/cpu.html + + # Install the latest version of lhotse + + pip install git+https://github.com/lhotse-speech/lhotse + + +Install icefall +--------------- + +We will put the source code of `icefall`_ into the directory ``/tmp`` +You can select any directory you want. + +.. code-block:: bash + + cd /tmp + git clone https://github.com/k2-fsa/icefall + cd icefall + pip install -r ./requirements.txt + +.. code-block:: bash + + # Anytime we want to use icefall, we have to set the following + # environment variable + + export PYTHONPATH=/tmp/icefall:$PYTHONPATH + + +Congratulations! You have installed `icefall`_ successfully. diff --git a/docs/source/for-dummies/index.rst b/docs/source/for-dummies/index.rst new file mode 100644 index 000000000..b921ced43 --- /dev/null +++ b/docs/source/for-dummies/index.rst @@ -0,0 +1,21 @@ +Icefall for dummies tutorial +============================ + +This tutorial walks you step by step about how to create a simple +ASR (`Automatic Speech Recognition `_) +system with `Next-gen Kaldi`_. + +It uses the `yesno`_ dataset for demonstration. The `yesno`_ dataset +is very tiny and the model training can be finished within 20 seconds on ``CPU``. + +That also means you don't need a ``GPU`` to finish this tutorial. + +Let's get started! + +Please follow items below **sequentially**. + +.. toctree:: + :maxdepth: 2 + + ./environment-setup.rst + ./data-preparation.rst diff --git a/docs/source/index.rst b/docs/source/index.rst index 0fa8fdd1c..fb539d3f2 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -20,6 +20,7 @@ speech recognition recipes using `k2 `_. :maxdepth: 2 :caption: Contents: + for-dummies/index.rst installation/index docker/index faqs