WIP: Add documentation. (#22)

* Begin to add documentation.

* WIP: Add documentation.

* Fix a typo.

* Add more doc for the recipe yesno.

* Add more doc for the yesno recipe.
Fangjun Kuang 2021-08-24 14:28:08 +08:00 committed by GitHub
parent 57cb611665
commit 01da00dca0
19 changed files with 1352 additions and 2 deletions

1
docs/.gitignore vendored Normal file

@@ -0,0 +1 @@
build/

20
docs/Makefile Normal file

@@ -0,0 +1,20 @@
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
SOURCEDIR = source
BUILDDIR = build
# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.PHONY: help Makefile
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

35
docs/make.bat Normal file

@@ -0,0 +1,35 @@
@ECHO OFF
pushd %~dp0
REM Command file for Sphinx documentation
if "%SPHINXBUILD%" == "" (
set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=source
set BUILDDIR=build
if "%1" == "" goto help
%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
echo.
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
echo.installed, then set the SPHINXBUILD environment variable to point
echo.to the full path of the 'sphinx-build' executable. Alternatively you
echo.may add the Sphinx directory to PATH.
echo.
echo.If you don't have Sphinx installed, grab it from
echo.http://sphinx-doc.org/
exit /b 1
)
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end
:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
:end
popd

1
docs/requirements.txt Normal file

@@ -0,0 +1 @@
sphinx_rtd_theme

Binary file not shown (image, 666 KiB).

77
docs/source/conf.py Normal file

@@ -0,0 +1,77 @@
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html
# -- Path setup --------------------------------------------------------------
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
# import os
# import sys
# sys.path.insert(0, os.path.abspath('.'))
import sphinx_rtd_theme
# -- Project information -----------------------------------------------------
project = "icefall"
copyright = "2021, icefall development team"
author = "icefall development team"
# The full version, including alpha/beta/rc tags
release = "0.1"
# -- General configuration ---------------------------------------------------
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
"sphinx_rtd_theme",
]
# Add any paths that contain templates here, relative to this directory.
templates_path = ["_templates"]
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = []
source_suffix = {
".rst": "restructuredtext",
}
master_doc = "index"
# -- Options for HTML output -------------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = "sphinx_rtd_theme"
html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
html_show_sourcelink = True
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ["_static", "installation/images"]
pygments_style = "sphinx"
numfig = True
html_context = {
"display_github": True,
"github_user": "k2-fsa",
"github_repo": "icefall",
"github_version": "master",
"conf_py_path": "/icefall/docs/source/",
}

24
docs/source/index.rst Normal file

@@ -0,0 +1,24 @@
.. icefall documentation master file, created by
sphinx-quickstart on Mon Aug 23 16:07:39 2021.
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
icefall
=======
.. image:: _static/logo.png
:alt: icefall logo
:width: 168px
:align: center
:target: https://github.com/k2-fsa/icefall
Documentation for `icefall <https://github.com/k2-fsa/icefall>`_, containing
speech recognition recipes using `k2 <https://github.com/k2-fsa/k2>`_.
.. toctree::
:maxdepth: 2
:caption: Contents:
installation/index
recipes/index


@@ -0,0 +1 @@
<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="122" height="20" role="img" aria-label="device: CPU | CUDA"><title>device: CPU | CUDA</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="122" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="45" height="20" fill="#555"/><rect x="45" width="77" height="20" fill="#fe7d37"/><rect width="122" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="235" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="350">device</text><text x="235" y="140" transform="scale(.1)" fill="#fff" textLength="350">device</text><text aria-hidden="true" x="825" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="670">CPU | CUDA</text><text x="825" y="140" transform="scale(.1)" fill="#fff" textLength="670">CPU | CUDA</text></g></svg>



@@ -0,0 +1 @@
<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="114" height="20" role="img" aria-label="os: Linux | macOS"><title>os: Linux | macOS</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="114" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="23" height="20" fill="#555"/><rect x="23" width="91" height="20" fill="#ff69b4"/><rect width="114" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="125" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="130">os</text><text x="125" y="140" transform="scale(.1)" fill="#fff" textLength="130">os</text><text aria-hidden="true" x="675" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="810">Linux | macOS</text><text x="675" y="140" transform="scale(.1)" fill="#fff" textLength="810">Linux | macOS</text></g></svg>



@@ -0,0 +1 @@
<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="170" height="20" role="img" aria-label="python: 3.6 | 3.7 | 3.8 | 3.9"><title>python: 3.6 | 3.7 | 3.8 | 3.9</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="170" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="49" height="20" fill="#555"/><rect x="49" width="121" height="20" fill="#007ec6"/><rect width="170" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="255" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="390">python</text><text x="255" y="140" transform="scale(.1)" fill="#fff" textLength="390">python</text><text aria-hidden="true" x="1085" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="1110">3.6 | 3.7 | 3.8 | 3.9</text><text x="1085" y="140" transform="scale(.1)" fill="#fff" textLength="1110">3.6 | 3.7 | 3.8 | 3.9</text></g></svg>



@@ -0,0 +1 @@
<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="286" height="20" role="img" aria-label="torch: 1.6.0 | 1.7.0 | 1.7.1 | 1.8.0 | 1.8.1 | 1.9.0"><title>torch: 1.6.0 | 1.7.0 | 1.7.1 | 1.8.0 | 1.8.1 | 1.9.0</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="286" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="39" height="20" fill="#555"/><rect x="39" width="247" height="20" fill="#97ca00"/><rect width="286" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="205" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="290">torch</text><text x="205" y="140" transform="scale(.1)" fill="#fff" textLength="290">torch</text><text aria-hidden="true" x="1615" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="2370">1.6.0 | 1.7.0 | 1.7.1 | 1.8.0 | 1.8.1 | 1.9.0</text><text x="1615" y="140" transform="scale(.1)" fill="#fff" textLength="2370">1.6.0 | 1.7.0 | 1.7.1 | 1.8.0 | 1.8.1 | 1.9.0</text></g></svg>



@@ -0,0 +1,469 @@
.. _install icefall:
Installation
============
- |os|
- |device|
- |python_versions|
- |torch_versions|
.. |os| image:: ./images/os-Linux_macOS-ff69b4.svg
:alt: Supported operating systems
.. |device| image:: ./images/device-CPU_CUDA-orange.svg
:alt: Supported devices
.. |python_versions| image:: ./images/python-3.6_3.7_3.8_3.9-blue.svg
:alt: Supported python versions
.. |torch_versions| image:: ./images/torch-1.6.0_1.7.0_1.7.1_1.8.0_1.8.1_1.9.0-green.svg
:alt: Supported PyTorch versions
icefall depends on `k2 <https://github.com/k2-fsa/k2>`_ and
`lhotse <https://github.com/lhotse-speech/lhotse>`_.
We recommend that you install ``k2`` first, as ``k2`` is bound to
a specific version of PyTorch after compilation. Installing ``k2``
also installs its dependency PyTorch, which can be reused by ``lhotse``.
(1) Install k2
--------------
Please refer to `<https://k2.readthedocs.io/en/latest/installation/index.html>`_
to install ``k2``.
.. HINT::
If you have already installed PyTorch and don't want to replace it,
please install a version of k2 that is compiled against the version
of PyTorch you are using.
(2) Install lhotse
------------------
Please refer to `<https://lhotse.readthedocs.io/en/latest/getting-started.html#installation>`_
to install ``lhotse``.
.. HINT::
Installing ``lhotse`` also installs its dependency `torchaudio <https://github.com/pytorch/audio>`_.
(3) Download icefall
--------------------
icefall is a collection of Python scripts, so you don't need to install it,
and we don't provide a ``setup.py`` for it.
All you need to do is download it and set the environment variable ``PYTHONPATH``
to point to it.
Assume you want to place ``icefall`` in the folder ``/tmp``. The
following commands show you how to set up ``icefall``:
.. code-block:: bash
cd /tmp
git clone https://github.com/k2-fsa/icefall
cd icefall
pip install -r requirements.txt
export PYTHONPATH=/tmp/icefall:$PYTHONPATH
.. HINT::
You can put several versions of ``icefall`` in the same virtual environment.
To switch among different versions of ``icefall``, just set ``PYTHONPATH``
to point to the version you want.
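For instance, you can check which copy of ``icefall`` Python resolves with
the current ``PYTHONPATH``. The snippet below is purely illustrative and is
not part of icefall:

.. code-block:: python

    # Print the location of the icefall package that Python would import,
    # which is determined by PYTHONPATH. Illustration only.
    import importlib.util

    spec = importlib.util.find_spec("icefall")
    print(spec.origin if spec else "icefall is not on PYTHONPATH")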
Installation example
--------------------
The following shows an example of setting up the environment.
(1) Create a virtual environment
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. code-block:: bash
$ virtualenv -p python3.8 test-icefall
created virtual environment CPython3.8.6.final.0-64 in 1540ms
creator CPython3Posix(dest=/ceph-fj/fangjun/test-icefall, clear=False, no_vcs_ignore=False, global=False)
seeder FromAppData(download=False, pip=bundle, setuptools=bundle, wheel=bundle, via=copy, app_data_dir=/root/fangjun/.local/share/v
irtualenv)
added seed packages: pip==21.1.3, setuptools==57.4.0, wheel==0.36.2
activators BashActivator,CShellActivator,FishActivator,PowerShellActivator,PythonActivator,XonshActivator
(2) Activate your virtual environment
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. code-block:: bash
$ source test-icefall/bin/activate
(3) Install k2
~~~~~~~~~~~~~~
.. code-block:: bash
$ pip install k2==1.4.dev20210822+cpu.torch1.9.0 -f https://k2-fsa.org/nightly/index.html
Looking in links: https://k2-fsa.org/nightly/index.html
Collecting k2==1.4.dev20210822+cpu.torch1.9.0
Downloading https://k2-fsa.org/nightly/whl/k2-1.4.dev20210822%2Bcpu.torch1.9.0-cp38-cp38-linux_x86_64.whl (1.6 MB)
|________________________________| 1.6 MB 185 kB/s
Collecting graphviz
Downloading graphviz-0.17-py3-none-any.whl (18 kB)
Collecting torch==1.9.0
Using cached torch-1.9.0-cp38-cp38-manylinux1_x86_64.whl (831.4 MB)
Collecting typing-extensions
Using cached typing_extensions-3.10.0.0-py3-none-any.whl (26 kB)
Installing collected packages: typing-extensions, torch, graphviz, k2
Successfully installed graphviz-0.17 k2-1.4.dev20210822+cpu.torch1.9.0 torch-1.9.0 typing-extensions-3.10.0.0
.. WARNING::
We choose to install a CPU version of k2 for testing. You would probably want to install
a CUDA version of k2.
(4) Install lhotse
~~~~~~~~~~~~~~~~~~
.. code-block::
$ pip install git+https://github.com/lhotse-speech/lhotse
Collecting git+https://github.com/lhotse-speech/lhotse
Cloning https://github.com/lhotse-speech/lhotse to /tmp/pip-req-build-7b1b76ge
Running command git clone -q https://github.com/lhotse-speech/lhotse /tmp/pip-req-build-7b1b76ge
Collecting audioread>=2.1.9
Using cached audioread-2.1.9-py3-none-any.whl
Collecting SoundFile>=0.10
Using cached SoundFile-0.10.3.post1-py2.py3-none-any.whl (21 kB)
Collecting click>=7.1.1
Using cached click-8.0.1-py3-none-any.whl (97 kB)
Collecting cytoolz>=0.10.1
Using cached cytoolz-0.11.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.9 MB)
Collecting dataclasses
Using cached dataclasses-0.6-py3-none-any.whl (14 kB)
Collecting h5py>=2.10.0
Downloading h5py-3.4.0-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (4.5 MB)
|________________________________| 4.5 MB 684 kB/s
Collecting intervaltree>=3.1.0
Using cached intervaltree-3.1.0-py2.py3-none-any.whl
Collecting lilcom>=1.1.0
Using cached lilcom-1.1.1-cp38-cp38-linux_x86_64.whl
Collecting numpy>=1.18.1
Using cached numpy-1.21.2-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (15.8 MB)
Collecting packaging
Using cached packaging-21.0-py3-none-any.whl (40 kB)
Collecting pyyaml>=5.3.1
Using cached PyYAML-5.4.1-cp38-cp38-manylinux1_x86_64.whl (662 kB)
Collecting tqdm
Downloading tqdm-4.62.1-py2.py3-none-any.whl (76 kB)
|________________________________| 76 kB 2.7 MB/s
Collecting torchaudio==0.9.0
Downloading torchaudio-0.9.0-cp38-cp38-manylinux1_x86_64.whl (1.9 MB)
|________________________________| 1.9 MB 73.1 MB/s
Requirement already satisfied: torch==1.9.0 in ./test-icefall/lib/python3.8/site-packages (from torchaudio==0.9.0->lhotse===0.8.0.dev
-2a1410b-clean) (1.9.0)
Requirement already satisfied: typing-extensions in ./test-icefall/lib/python3.8/site-packages (from torch==1.9.0->torchaudio==0.9.0-
>lhotse===0.8.0.dev-2a1410b-clean) (3.10.0.0)
Collecting toolz>=0.8.0
Using cached toolz-0.11.1-py3-none-any.whl (55 kB)
Collecting sortedcontainers<3.0,>=2.0
Using cached sortedcontainers-2.4.0-py2.py3-none-any.whl (29 kB)
Collecting cffi>=1.0
Using cached cffi-1.14.6-cp38-cp38-manylinux1_x86_64.whl (411 kB)
Collecting pycparser
Using cached pycparser-2.20-py2.py3-none-any.whl (112 kB)
Collecting pyparsing>=2.0.2
Using cached pyparsing-2.4.7-py2.py3-none-any.whl (67 kB)
Building wheels for collected packages: lhotse
Building wheel for lhotse (setup.py) ... done
Created wheel for lhotse: filename=lhotse-0.8.0.dev_2a1410b_clean-py3-none-any.whl size=342242 sha256=f683444afa4dc0881133206b4646a
9d0f774224cc84000f55d0a67f6e4a37997
Stored in directory: /tmp/pip-ephem-wheel-cache-ftu0qysz/wheels/7f/7a/8e/a0bf241336e2e3cb573e1e21e5600952d49f5162454f2e612f
WARNING: Built wheel for lhotse is invalid: Metadata 1.2 mandates PEP 440 version, but '0.8.0.dev-2a1410b-clean' is not
Failed to build lhotse
Installing collected packages: pycparser, toolz, sortedcontainers, pyparsing, numpy, cffi, tqdm, torchaudio, SoundFile, pyyaml, packa
ging, lilcom, intervaltree, h5py, dataclasses, cytoolz, click, audioread, lhotse
Running setup.py install for lhotse ... done
DEPRECATION: lhotse was installed using the legacy 'setup.py install' method, because a wheel could not be built for it. A possible
replacement is to fix the wheel build issue reported above. You can find discussion regarding this at https://github.com/pypa/pip/is
sues/8368.
Successfully installed SoundFile-0.10.3.post1 audioread-2.1.9 cffi-1.14.6 click-8.0.1 cytoolz-0.11.0 dataclasses-0.6 h5py-3.4.0 inter
valtree-3.1.0 lhotse-0.8.0.dev-2a1410b-clean lilcom-1.1.1 numpy-1.21.2 packaging-21.0 pycparser-2.20 pyparsing-2.4.7 pyyaml-5.4.1 sor
tedcontainers-2.4.0 toolz-0.11.1 torchaudio-0.9.0 tqdm-4.62.1
**NOTE**: After installing ``lhotse``, you may encounter the following error:
.. code-block::
$ lhotse download --help
-bash: /ceph-fj/fangjun/test-icefall/bin/lhotse: python: bad interpreter: No such file or directory
The fix is to prepend a ``#!/usr/bin/env python3`` shebang to the installed ``lhotse`` script:
.. code-block::
echo '#!/usr/bin/env python3' | cat - $(which lhotse) > /tmp/lhotse-bin
chmod +x /tmp/lhotse-bin
mv /tmp/lhotse-bin $(which lhotse)
(5) Download icefall
~~~~~~~~~~~~~~~~~~~~
.. code-block::
$ cd /tmp
$ git clone https://github.com/k2-fsa/icefall
Cloning into 'icefall'...
remote: Enumerating objects: 500, done.
remote: Counting objects: 100% (500/500), done.
remote: Compressing objects: 100% (308/308), done.
remote: Total 500 (delta 263), reused 307 (delta 102), pack-reused 0
Receiving objects: 100% (500/500), 172.49 KiB | 385.00 KiB/s, done.
Resolving deltas: 100% (263/263), done.
$ cd icefall
$ pip install -r requirements.txt
Collecting kaldilm
Downloading kaldilm-1.8.tar.gz (48 kB)
|________________________________| 48 kB 574 kB/s
Collecting kaldialign
Using cached kaldialign-0.2-cp38-cp38-linux_x86_64.whl
Collecting sentencepiece>=0.1.96
Using cached sentencepiece-0.1.96-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
Collecting tensorboard
Using cached tensorboard-2.6.0-py3-none-any.whl (5.6 MB)
Requirement already satisfied: setuptools>=41.0.0 in /ceph-fj/fangjun/test-icefall/lib/python3.8/site-packages (from tensorboard->-r
requirements.txt (line 4)) (57.4.0)
Collecting absl-py>=0.4
Using cached absl_py-0.13.0-py3-none-any.whl (132 kB)
Collecting google-auth-oauthlib<0.5,>=0.4.1
Using cached google_auth_oauthlib-0.4.5-py2.py3-none-any.whl (18 kB)
Collecting grpcio>=1.24.3
Using cached grpcio-1.39.0-cp38-cp38-manylinux2014_x86_64.whl (4.3 MB)
Requirement already satisfied: wheel>=0.26 in /ceph-fj/fangjun/test-icefall/lib/python3.8/site-packages (from tensorboard->-r require
ments.txt (line 4)) (0.36.2)
Requirement already satisfied: numpy>=1.12.0 in /ceph-fj/fangjun/test-icefall/lib/python3.8/site-packages (from tensorboard->-r requi
rements.txt (line 4)) (1.21.2)
Collecting protobuf>=3.6.0
Using cached protobuf-3.17.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.0 MB)
Collecting werkzeug>=0.11.15
Using cached Werkzeug-2.0.1-py3-none-any.whl (288 kB)
Collecting tensorboard-data-server<0.7.0,>=0.6.0
Using cached tensorboard_data_server-0.6.1-py3-none-manylinux2010_x86_64.whl (4.9 MB)
Collecting google-auth<2,>=1.6.3
Downloading google_auth-1.35.0-py2.py3-none-any.whl (152 kB)
|________________________________| 152 kB 1.4 MB/s
Collecting requests<3,>=2.21.0
Using cached requests-2.26.0-py2.py3-none-any.whl (62 kB)
Collecting tensorboard-plugin-wit>=1.6.0
Using cached tensorboard_plugin_wit-1.8.0-py3-none-any.whl (781 kB)
Collecting markdown>=2.6.8
Using cached Markdown-3.3.4-py3-none-any.whl (97 kB)
Collecting six
Using cached six-1.16.0-py2.py3-none-any.whl (11 kB)
Collecting cachetools<5.0,>=2.0.0
Using cached cachetools-4.2.2-py3-none-any.whl (11 kB)
Collecting rsa<5,>=3.1.4
Using cached rsa-4.7.2-py3-none-any.whl (34 kB)
Collecting pyasn1-modules>=0.2.1
Using cached pyasn1_modules-0.2.8-py2.py3-none-any.whl (155 kB)
Collecting requests-oauthlib>=0.7.0
Using cached requests_oauthlib-1.3.0-py2.py3-none-any.whl (23 kB)
Collecting pyasn1<0.5.0,>=0.4.6
Using cached pyasn1-0.4.8-py2.py3-none-any.whl (77 kB)
Collecting urllib3<1.27,>=1.21.1
Using cached urllib3-1.26.6-py2.py3-none-any.whl (138 kB)
Collecting certifi>=2017.4.17
Using cached certifi-2021.5.30-py2.py3-none-any.whl (145 kB)
Collecting charset-normalizer~=2.0.0
Using cached charset_normalizer-2.0.4-py3-none-any.whl (36 kB)
Collecting idna<4,>=2.5
Using cached idna-3.2-py3-none-any.whl (59 kB)
Collecting oauthlib>=3.0.0
Using cached oauthlib-3.1.1-py2.py3-none-any.whl (146 kB)
Building wheels for collected packages: kaldilm
Building wheel for kaldilm (setup.py) ... done
Created wheel for kaldilm: filename=kaldilm-1.8-cp38-cp38-linux_x86_64.whl size=897233 sha256=eccb906cafcd45bf9a7e1a1718e4534254bfb
f4c0d0cbc66eee6c88d68a63862
Stored in directory: /root/fangjun/.cache/pip/wheels/85/7d/63/f2dd586369b8797cb36d213bf3a84a789eeb92db93d2e723c9
Successfully built kaldilm
Installing collected packages: urllib3, pyasn1, idna, charset-normalizer, certifi, six, rsa, requests, pyasn1-modules, oauthlib, cach
etools, requests-oauthlib, google-auth, werkzeug, tensorboard-plugin-wit, tensorboard-data-server, protobuf, markdown, grpcio, google
-auth-oauthlib, absl-py, tensorboard, sentencepiece, kaldilm, kaldialign
Successfully installed absl-py-0.13.0 cachetools-4.2.2 certifi-2021.5.30 charset-normalizer-2.0.4 google-auth-1.35.0 google-auth-oaut
hlib-0.4.5 grpcio-1.39.0 idna-3.2 kaldialign-0.2 kaldilm-1.8 markdown-3.3.4 oauthlib-3.1.1 protobuf-3.17.3 pyasn1-0.4.8 pyasn1-module
s-0.2.8 requests-2.26.0 requests-oauthlib-1.3.0 rsa-4.7.2 sentencepiece-0.1.96 six-1.16.0 tensorboard-2.6.0 tensorboard-data-server-0
.6.1 tensorboard-plugin-wit-1.8.0 urllib3-1.26.6 werkzeug-2.0.1
Test Your Installation
----------------------
To test that your installation is successful, let us run
the `yesno recipe <https://github.com/k2-fsa/icefall/tree/master/egs/yesno/ASR>`_
on CPU.
Data preparation
~~~~~~~~~~~~~~~~
.. code-block:: bash
$ export PYTHONPATH=/tmp/icefall:$PYTHONPATH
$ cd /tmp/icefall
$ cd egs/yesno/ASR
$ ./prepare.sh
The log of running ``./prepare.sh`` is:
.. code-block::
2021-08-23 19:27:26 (prepare.sh:24:main) dl_dir: /tmp/icefall/egs/yesno/ASR/download
2021-08-23 19:27:26 (prepare.sh:27:main) stage 0: Download data
Downloading waves_yesno.tar.gz: 4.49MB [00:03, 1.39MB/s]
2021-08-23 19:27:30 (prepare.sh:36:main) Stage 1: Prepare yesno manifest
2021-08-23 19:27:31 (prepare.sh:42:main) Stage 2: Compute fbank for yesno
2021-08-23 19:27:32,803 INFO [compute_fbank_yesno.py:52] Processing train
Extracting and storing features: 100%|_______________________________________________________________| 90/90 [00:01<00:00, 80.57it/s]
2021-08-23 19:27:34,085 INFO [compute_fbank_yesno.py:52] Processing test
Extracting and storing features: 100%|______________________________________________________________| 30/30 [00:00<00:00, 248.21it/s]
2021-08-23 19:27:34 (prepare.sh:48:main) Stage 3: Prepare lang
2021-08-23 19:27:35 (prepare.sh:63:main) Stage 4: Prepare G
/tmp/pip-install-fcordre9/kaldilm_6899d26f2d684ad48f21025950cd2866/kaldilm/csrc/arpa_file_parser.cc:void kaldilm::ArpaFileParser::Rea
d(std::istream&):79
[I] Reading \data\ section.
/tmp/pip-install-fcordre9/kaldilm_6899d26f2d684ad48f21025950cd2866/kaldilm/csrc/arpa_file_parser.cc:void kaldilm::ArpaFileParser::Rea
d(std::istream&):140
[I] Reading \1-grams: section.
2021-08-23 19:27:35 (prepare.sh:89:main) Stage 5: Compile HLG
2021-08-23 19:27:35,928 INFO [compile_hlg.py:120] Processing data/lang_phone
2021-08-23 19:27:35,929 INFO [lexicon.py:116] Converting L.pt to Linv.pt
2021-08-23 19:27:35,931 INFO [compile_hlg.py:48] Building ctc_topo. max_token_id: 3
2021-08-23 19:27:35,932 INFO [compile_hlg.py:52] Loading G.fst.txt
2021-08-23 19:27:35,932 INFO [compile_hlg.py:62] Intersecting L and G
2021-08-23 19:27:35,933 INFO [compile_hlg.py:64] LG shape: (4, None)
2021-08-23 19:27:35,933 INFO [compile_hlg.py:66] Connecting LG
2021-08-23 19:27:35,933 INFO [compile_hlg.py:68] LG shape after k2.connect: (4, None)
2021-08-23 19:27:35,933 INFO [compile_hlg.py:70] <class 'torch.Tensor'>
2021-08-23 19:27:35,933 INFO [compile_hlg.py:71] Determinizing LG
2021-08-23 19:27:35,934 INFO [compile_hlg.py:74] <class '_k2.RaggedInt'>
2021-08-23 19:27:35,934 INFO [compile_hlg.py:76] Connecting LG after k2.determinize
2021-08-23 19:27:35,934 INFO [compile_hlg.py:79] Removing disambiguation symbols on LG
2021-08-23 19:27:35,934 INFO [compile_hlg.py:87] LG shape after k2.remove_epsilon: (6, None)
2021-08-23 19:27:35,935 INFO [compile_hlg.py:92] Arc sorting LG
2021-08-23 19:27:35,935 INFO [compile_hlg.py:95] Composing H and LG
2021-08-23 19:27:35,935 INFO [compile_hlg.py:102] Connecting LG
2021-08-23 19:27:35,935 INFO [compile_hlg.py:105] Arc sorting LG
2021-08-23 19:27:35,936 INFO [compile_hlg.py:107] HLG.shape: (8, None)
2021-08-23 19:27:35,936 INFO [compile_hlg.py:123] Saving HLG.pt to data/lang_phone
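Stage 5 composes the CTC topology ``H``, the lexicon ``L``, and the LM ``G``
into a single decoding graph ``HLG``. The following is only a rough sketch of
the steps shown in the log; the real ``compile_hlg.py`` additionally handles
disambiguation symbols and attribute bookkeeping, which are omitted here:

.. code-block:: python

    import k2

    def compile_hlg(L: k2.Fsa, G: k2.Fsa, max_token_id: int) -> k2.Fsa:
        # Sketch only; see compile_hlg.py for the real implementation.
        H = k2.ctc_topo(max_token_id)       # "Building ctc_topo"
        LG = k2.compose(L, k2.arc_sort(G))  # "Intersecting L and G"
        LG = k2.connect(LG)                 # "Connecting LG"
        LG = k2.determinize(LG)             # "Determinizing LG"
        LG = k2.remove_epsilon(LG)          # "LG shape after k2.remove_epsilon"
        LG = k2.arc_sort(LG)                # "Arc sorting LG"
        HLG = k2.compose(H, LG)             # "Composing H and LG"
        HLG = k2.connect(HLG)               # "Connecting LG"
        return k2.arc_sort(HLG)             # "Arc sorting LG"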
Training
~~~~~~~~
Now let us run the training part:
.. code-block::
$ export CUDA_VISIBLE_DEVICES=""
$ ./tdnn/train.py
.. CAUTION::
We use ``export CUDA_VISIBLE_DEVICES=""`` so that icefall uses CPU
even if there are GPUs available.
The training log is given below:
.. code-block::
2021-08-23 19:30:31,072 INFO [train.py:465] Training started
2021-08-23 19:30:31,072 INFO [train.py:466] {'exp_dir': PosixPath('tdnn/exp'), 'lang_dir': PosixPath('data/lang_phone'), 'lr': 0.01,
'feature_dim': 23, 'weight_decay': 1e-06, 'start_epoch': 0, 'best_train_loss': inf, 'best_valid_loss': inf, 'best_train_epoch': -1, '
best_valid_epoch': -1, 'batch_idx_train': 0, 'log_interval': 10, 'valid_interval': 10, 'beam_size': 10, 'reduction': 'sum', 'use_doub
le_scores': True, 'world_size': 1, 'master_port': 12354, 'tensorboard': True, 'num_epochs': 15, 'feature_dir': PosixPath('data/fbank'
), 'max_duration': 30.0, 'bucketing_sampler': False, 'num_buckets': 10, 'concatenate_cuts': False, 'duration_factor': 1.0, 'gap': 1.0
, 'on_the_fly_feats': False, 'shuffle': True, 'return_cuts': True, 'num_workers': 2}
2021-08-23 19:30:31,074 INFO [lexicon.py:113] Loading pre-compiled data/lang_phone/Linv.pt
2021-08-23 19:30:31,098 INFO [asr_datamodule.py:146] About to get train cuts
2021-08-23 19:30:31,098 INFO [asr_datamodule.py:240] About to get train cuts
2021-08-23 19:30:31,102 INFO [asr_datamodule.py:149] About to create train dataset
2021-08-23 19:30:31,102 INFO [asr_datamodule.py:200] Using SingleCutSampler.
2021-08-23 19:30:31,102 INFO [asr_datamodule.py:206] About to create train dataloader
2021-08-23 19:30:31,102 INFO [asr_datamodule.py:219] About to get test cuts
2021-08-23 19:30:31,102 INFO [asr_datamodule.py:246] About to get test cuts
2021-08-23 19:30:31,357 INFO [train.py:416] Epoch 0, batch 0, batch avg loss 1.0789, total avg loss: 1.0789, batch size: 4
2021-08-23 19:30:31,848 INFO [train.py:416] Epoch 0, batch 10, batch avg loss 0.5356, total avg loss: 0.7556, batch size: 4
2021-08-23 19:30:32,301 INFO [train.py:432] Epoch 0, valid loss 0.9972, best valid loss: 0.9972 best valid epoch: 0
2021-08-23 19:30:32,805 INFO [train.py:416] Epoch 0, batch 20, batch avg loss 0.2436, total avg loss: 0.5717, batch size: 3
2021-08-23 19:30:33,109 INFO [train.py:432] Epoch 0, valid loss 0.4167, best valid loss: 0.4167 best valid epoch: 0
2021-08-23 19:30:33,121 INFO [checkpoint.py:62] Saving checkpoint to tdnn/exp/epoch-0.pt
2021-08-23 19:30:33,325 INFO [train.py:416] Epoch 1, batch 0, batch avg loss 0.2214, total avg loss: 0.2214, batch size: 5
2021-08-23 19:30:33,798 INFO [train.py:416] Epoch 1, batch 10, batch avg loss 0.0781, total avg loss: 0.1343, batch size: 5
2021-08-23 19:30:34,065 INFO [train.py:432] Epoch 1, valid loss 0.0859, best valid loss: 0.0859 best valid epoch: 1
2021-08-23 19:30:34,556 INFO [train.py:416] Epoch 1, batch 20, batch avg loss 0.0421, total avg loss: 0.0975, batch size: 3
2021-08-23 19:30:34,810 INFO [train.py:432] Epoch 1, valid loss 0.0431, best valid loss: 0.0431 best valid epoch: 1
2021-08-23 19:30:34,824 INFO [checkpoint.py:62] Saving checkpoint to tdnn/exp/epoch-1.pt
... ...
2021-08-23 19:30:49,657 INFO [train.py:416] Epoch 13, batch 0, batch avg loss 0.0109, total avg loss: 0.0109, batch size: 5
2021-08-23 19:30:49,984 INFO [train.py:416] Epoch 13, batch 10, batch avg loss 0.0093, total avg loss: 0.0096, batch size: 4
2021-08-23 19:30:50,239 INFO [train.py:432] Epoch 13, valid loss 0.0104, best valid loss: 0.0101 best valid epoch: 12
2021-08-23 19:30:50,569 INFO [train.py:416] Epoch 13, batch 20, batch avg loss 0.0092, total avg loss: 0.0096, batch size: 2
2021-08-23 19:30:50,819 INFO [train.py:432] Epoch 13, valid loss 0.0101, best valid loss: 0.0101 best valid epoch: 13
2021-08-23 19:30:50,835 INFO [checkpoint.py:62] Saving checkpoint to tdnn/exp/epoch-13.pt
2021-08-23 19:30:51,024 INFO [train.py:416] Epoch 14, batch 0, batch avg loss 0.0105, total avg loss: 0.0105, batch size: 5
2021-08-23 19:30:51,317 INFO [train.py:416] Epoch 14, batch 10, batch avg loss 0.0099, total avg loss: 0.0097, batch size: 4
2021-08-23 19:30:51,552 INFO [train.py:432] Epoch 14, valid loss 0.0108, best valid loss: 0.0101 best valid epoch: 13
2021-08-23 19:30:51,869 INFO [train.py:416] Epoch 14, batch 20, batch avg loss 0.0096, total avg loss: 0.0097, batch size: 5
2021-08-23 19:30:52,107 INFO [train.py:432] Epoch 14, valid loss 0.0102, best valid loss: 0.0101 best valid epoch: 13
2021-08-23 19:30:52,126 INFO [checkpoint.py:62] Saving checkpoint to tdnn/exp/epoch-14.pt
2021-08-23 19:30:52,128 INFO [train.py:537] Done!
Decoding
~~~~~~~~
Let us use the trained model to decode the test set:
.. code-block::
$ ./tdnn/decode.py
The decoding log is:
.. code-block::
2021-08-23 19:35:30,192 INFO [decode.py:249] Decoding started
2021-08-23 19:35:30,192 INFO [decode.py:250] {'exp_dir': PosixPath('tdnn/exp'), 'lang_dir': PosixPath('data/lang_phone'), 'lm_dir': PosixPath('data/lm'), 'feature_dim': 23, 'search_beam': 20, 'output_beam': 8, 'min_active_states': 30, 'max_active_states': 10000, 'use_double_scores': True, 'epoch': 14, 'avg': 2, 'feature_dir': PosixPath('data/fbank'), 'max_duration': 30.0, 'bucketing_sampler': False, 'num_buckets': 10, 'concatenate_cuts': False, 'duration_factor': 1.0, 'gap': 1.0, 'on_the_fly_feats': False, 'shuffle': True, 'return_cuts': True, 'num_workers': 2}
2021-08-23 19:35:30,193 INFO [lexicon.py:113] Loading pre-compiled data/lang_phone/Linv.pt
2021-08-23 19:35:30,213 INFO [decode.py:259] device: cpu
2021-08-23 19:35:30,217 INFO [decode.py:279] averaging ['tdnn/exp/epoch-13.pt', 'tdnn/exp/epoch-14.pt']
/tmp/icefall/icefall/checkpoint.py:146: UserWarning: floor_divide is deprecated, and will be removed in a future version of pytorch.
It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values.
To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at /pytorch/aten/src/ATen/native/BinaryOps.cpp:450.)
avg[k] //= n
2021-08-23 19:35:30,220 INFO [asr_datamodule.py:219] About to get test cuts
2021-08-23 19:35:30,220 INFO [asr_datamodule.py:246] About to get test cuts
2021-08-23 19:35:30,409 INFO [decode.py:190] batch 0/8, cuts processed until now is 4
2021-08-23 19:35:30,571 INFO [decode.py:228] The transcripts are stored in tdnn/exp/recogs-test_set.txt
2021-08-23 19:35:30,572 INFO [utils.py:317] [test_set] %WER 0.42% [1 / 240, 0 ins, 1 del, 0 sub ]
2021-08-23 19:35:30,573 INFO [decode.py:236] Wrote detailed error stats to tdnn/exp/errs-test_set.txt
2021-08-23 19:35:30,573 INFO [decode.py:299] Done!
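For reference, ``%WER`` is the number of edit-distance errors divided by the
number of reference words. A quick sanity check of the figure above:

.. code-block:: python

    # WER = (insertions + deletions + substitutions) / reference words.
    # The numbers come from the decoding log above.
    ins, dels, subs, ref_words = 0, 1, 0, 240
    wer = 100.0 * (ins + dels + subs) / ref_words
    print(f"%WER {wer:.2f}%")  # -> %WER 0.42%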
**Congratulations!** You have successfully set up the environment and have run the first recipe in ``icefall``.
Have fun with ``icefall``!

Binary file not shown (image, 121 KiB).


@@ -0,0 +1,18 @@
Recipes
=======
This page contains various recipes in ``icefall``.
Currently, only speech recognition recipes are provided.
We may add recipes for other tasks as well in the future.
.. we put the yesno recipe as the first recipe since it is the simplest one.
.. Other recipes are listed in alphabetical order.
.. toctree::
:maxdepth: 2
yesno
librispeech


@@ -0,0 +1,2 @@
LibriSpeech
===========


@@ -0,0 +1,445 @@
yesno
=====
This page shows you how to run the ``yesno`` recipe. It contains:
- (1) Prepare data for training
- (2) Train a TDNN model
- (a) View text format logs and visualize TensorBoard logs
- (b) Select device type, i.e., CPU and GPU, for training
- (c) Change training options
- (d) Resume training from a checkpoint
- (3) Decode with a trained model
- (a) Select a checkpoint for decoding
- (b) Model averaging
- (4) Colab notebook
- (a) It shows you step by step how to set up the environment, how to do training,
and how to do decoding
- (b) How to use a pre-trained model
- (5) Inference with a pre-trained model
- (a) Download a pre-trained model, provided by us
- (b) Decode a single sound file with a pre-trained model
- (c) Decode multiple sound files at the same time
It does **NOT** show you:
- (1) How to train with multiple GPUs
The ``yesno`` dataset is so small that a CPU is more than enough
for training as well as for decoding.
- (2) How to use LM rescoring for decoding
The dataset does not have an LM for rescoring.
.. HINT::
We assume you have read the page :ref:`install icefall` and have set up
the environment for ``icefall``.
.. HINT::
You **don't** need a **GPU** to run this recipe. It can be run on a **CPU**.
The training part takes less than 30 **seconds** on a CPU and you will get
the following WER at the end::
[test_set] %WER 0.42% [1 / 240, 0 ins, 1 del, 0 sub ]
Data preparation
----------------
.. code-block:: bash
$ cd egs/yesno/ASR
$ ./prepare.sh
The script ``./prepare.sh`` handles the data preparation for you, **automagically**.
All you need to do is run it.
The data preparation contains several stages. You can use the following two
options:
- ``--stage``
- ``--stop-stage``
to control which stage(s) should be run. By default, all stages are executed.
For example,
.. code-block:: bash
$ cd egs/yesno/ASR
$ ./prepare.sh --stage 0 --stop-stage 0
means to run only stage 0.
To run stage 2 to stage 5, use:
.. code-block:: bash
$ ./prepare.sh --stage 2 --stop-stage 5
Training
--------
We provide only a TDNN model, contained in
the `tdnn <https://github.com/k2-fsa/icefall/tree/master/egs/yesno/ASR/tdnn>`_
folder, for ``yesno``.
The command to run the training part is:
.. code-block:: bash
$ cd egs/yesno/ASR
$ export CUDA_VISIBLE_DEVICES=""
$ ./tdnn/train.py
By default, it will run ``15`` epochs. Training logs and checkpoints are saved
in ``tdnn/exp``.
In ``tdnn/exp``, you will find the following files:
- ``epoch-0.pt``, ``epoch-1.pt``, ...
These are checkpoint files, containing model ``state_dict`` and optimizer ``state_dict``.
To resume training from some checkpoint, say ``epoch-10.pt``, you can use:
.. code-block:: bash
$ ./tdnn/train.py --start-epoch 11
- ``tensorboard/``
This folder contains TensorBoard logs. Training loss, validation loss, learning
rate, etc., are recorded in these logs. You can visualize them by:
.. code-block:: bash
$ cd tdnn/exp/tensorboard
$ tensorboard dev upload --logdir . --description "TDNN training for yesno with icefall"
It will print something like the following:
.. code-block::
TensorFlow installation not found - running with reduced feature set.
Upload started and will continue reading any new data as it's added to the logdir.
To stop uploading, press Ctrl-C.
New experiment created. View your TensorBoard at: https://tensorboard.dev/experiment/yKUbhb5wRmOSXYkId1z9eg/
[2021-08-23T23:49:41] Started scanning logdir.
[2021-08-23T23:49:42] Total uploaded: 135 scalars, 0 tensors, 0 binary objects
Listening for new data in logdir...
Note there is a URL in the above output. Click it and you will see
the following screenshot:
.. figure:: images/yesno-tdnn-tensorboard-log.png
:width: 600
:alt: TensorBoard screenshot
:align: center
:target: https://tensorboard.dev/experiment/yKUbhb5wRmOSXYkId1z9eg/
TensorBoard screenshot.
- ``log/log-train-xxxx``
This is the detailed training log in text format, the same as the one
printed to the console during training.
.. NOTE::
By default, ``./tdnn/train.py`` uses GPU 0 for training if GPUs are available.
If you have two GPUs, say, GPU 0 and GPU 1, and you want to use GPU 1 for
training, you can run:
.. code-block:: bash
$ export CUDA_VISIBLE_DEVICES="1"
$ ./tdnn/train.py
Since the ``yesno`` dataset is very small, containing only 30 sound files
for training, and the model in use is also very small, we use:
.. code-block:: bash
$ export CUDA_VISIBLE_DEVICES=""
so that ``./tdnn/train.py`` uses CPU during training.
If you don't have GPUs, then you don't need to
run ``export CUDA_VISIBLE_DEVICES=""``.
To see available training options, you can use:
.. code-block:: bash
$ ./tdnn/train.py --help
Other training options, e.g., the learning rate and the results dir, are
pre-configured in the function ``get_params()``
in `tdnn/train.py <https://github.com/k2-fsa/icefall/blob/master/egs/yesno/ASR/tdnn/train.py>`_.
Normally, you don't need to change them. If you do want to change them,
modify the code.
Decoding
--------
The decoding part uses checkpoints saved by the training part, so you have
to run the training part first.
The command for decoding is:
.. code-block:: bash
$ export CUDA_VISIBLE_DEVICES=""
$ ./tdnn/decode.py
You will see the WER in the output log.
Decoded results are saved in ``tdnn/exp``.
.. code-block:: bash
$ ./tdnn/decode.py --help
shows you the available decoding options.
Some commonly used options are:
- ``--epoch``
It selects which checkpoint to use for decoding.
For instance, ``./tdnn/decode.py --epoch 10`` means to use
``./tdnn/exp/epoch-10.pt`` for decoding.
- ``--avg``
It controls model averaging: it specifies the number of checkpoints
to average, and the averaged model is used for decoding (see the sketch
after this list).
For example, the following command:
.. code-block:: bash
$ ./tdnn/decode.py --epoch 10 --avg 3
uses the average of ``epoch-8.pt``, ``epoch-9.pt`` and ``epoch-10.pt``
for decoding.
- ``--export``
If it is ``True``, i.e., ``./tdnn/decode.py --export 1``, the code
will save the averaged model to ``tdnn/exp/pretrained.pt``.
See :ref:`yesno use a pre-trained model` for how to use it.
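Conceptually, model averaging is just an element-wise mean of the saved model
weights. A minimal sketch of the idea is given below; the actual implementation
lives in ``icefall.checkpoint.average_checkpoints`` and differs in details:

.. code-block:: python

    from typing import Dict, List

    import torch

    def average_checkpoints(filenames: List[str]) -> Dict[str, torch.Tensor]:
        # Sketch only: load each checkpoint's model state_dict and average
        # every parameter tensor across the checkpoints.
        n = len(filenames)
        avg = torch.load(filenames[0], map_location="cpu")["model"]
        for f in filenames[1:]:
            state = torch.load(f, map_location="cpu")["model"]
            for k in avg:
                avg[k] += state[k]
        for k in avg:
            avg[k] = avg[k] / n
        return avg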
.. _yesno use a pre-trained model:
Pre-trained Model
-----------------
We have uploaded the pre-trained model to
`<https://huggingface.co/csukuangfj/icefall_asr_yesno_tdnn>`_.
The following shows you how to use the pre-trained model.
Download the pre-trained model
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. code-block:: bash
$ cd egs/yesno/ASR
$ mkdir tmp
$ cd tmp
$ git lfs install
$ git clone https://huggingface.co/csukuangfj/icefall_asr_yesno_tdnn
.. CAUTION::
You have to use ``git lfs`` to download the pre-trained model.
After downloading, you will have the following files:
.. code-block:: bash
$ cd egs/yesno/ASR
$ tree tmp
.. code-block:: bash
tmp/
`-- icefall_asr_yesno_tdnn
|-- README.md
|-- lang_phone
| |-- HLG.pt
| |-- L.pt
| |-- L_disambig.pt
| |-- Linv.pt
| |-- lexicon.txt
| |-- lexicon_disambig.txt
| |-- tokens.txt
| `-- words.txt
|-- lm
| |-- G.arpa
| `-- G.fst.txt
|-- pretrained.pt
`-- test_waves
|-- 0_0_0_1_0_0_0_1.wav
|-- 0_0_1_0_0_0_1_0.wav
|-- 0_0_1_0_0_1_1_1.wav
|-- 0_0_1_0_1_0_0_1.wav
|-- 0_0_1_1_0_0_0_1.wav
|-- 0_0_1_1_0_1_1_0.wav
|-- 0_0_1_1_1_0_0_0.wav
|-- 0_0_1_1_1_1_0_0.wav
|-- 0_1_0_0_0_1_0_0.wav
|-- 0_1_0_0_1_0_1_0.wav
|-- 0_1_0_1_0_0_0_0.wav
|-- 0_1_0_1_1_1_0_0.wav
|-- 0_1_1_0_0_1_1_1.wav
|-- 0_1_1_1_0_0_1_0.wav
|-- 0_1_1_1_1_0_1_0.wav
|-- 1_0_0_0_0_0_0_0.wav
|-- 1_0_0_0_0_0_1_1.wav
|-- 1_0_0_1_0_1_1_1.wav
|-- 1_0_1_1_0_1_1_1.wav
|-- 1_0_1_1_1_1_0_1.wav
|-- 1_1_0_0_0_1_1_1.wav
|-- 1_1_0_0_1_0_1_1.wav
|-- 1_1_0_1_0_1_0_0.wav
|-- 1_1_0_1_1_0_0_1.wav
|-- 1_1_0_1_1_1_1_0.wav
|-- 1_1_1_0_0_1_0_1.wav
|-- 1_1_1_0_1_0_1_0.wav
|-- 1_1_1_1_0_0_1_0.wav
|-- 1_1_1_1_1_0_0_0.wav
`-- 1_1_1_1_1_1_1_1.wav
4 directories, 42 files
.. code-block:: bash
$ soxi tmp/icefall_asr_yesno_tdnn/test_waves/0_0_1_0_1_0_0_1.wav
Input File : 'tmp/icefall_asr_yesno_tdnn/test_waves/0_0_1_0_1_0_0_1.wav'
Channels : 1
Sample Rate : 8000
Precision : 16-bit
Duration : 00:00:06.76 = 54080 samples ~ 507 CDDA sectors
File Size : 108k
Bit Rate : 128k
Sample Encoding: 16-bit Signed Integer PCM
- ``0_0_1_0_1_0_0_1.wav``
0 means No; 1 means Yes. The words are spoken not in English
but in `Hebrew <https://en.wikipedia.org/wiki/Hebrew_language>`_.
So this file contains ``NO NO YES NO YES NO NO YES``.
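That is, the transcript is encoded directly in the file name. A throwaway
snippet illustrating the convention (``filename_to_transcript`` is
hypothetical and not part of icefall):

.. code-block:: python

    def filename_to_transcript(filename: str) -> str:
        # Strip the directory and the ".wav" suffix, then map each digit:
        # "1" -> YES, "0" -> NO.
        stem = filename.rsplit("/", 1)[-1].rsplit(".", 1)[0]
        return " ".join("YES" if d == "1" else "NO" for d in stem.split("_"))

    print(filename_to_transcript("0_0_1_0_1_0_0_1.wav"))
    # -> NO NO YES NO YES NO NO YES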
Download kaldifeat
~~~~~~~~~~~~~~~~~~
`kaldifeat <https://github.com/csukuangfj/kaldifeat>`_ is used for extracting
features from one or more sound files. Please refer to
`<https://github.com/csukuangfj/kaldifeat>`_ to install ``kaldifeat`` first.
Inference with a pre-trained model
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. code-block:: bash
$ cd egs/yesno/ASR
$ ./tdnn/pretrained.py --help
shows the usage information of ``./tdnn/pretrained.py``.
To decode a single file, we can use:
.. code-block:: bash
./tdnn/pretrained.py \
--checkpoint ./tmp/icefall_asr_yesno_tdnn/pretrained.pt \
--words-file ./tmp/icefall_asr_yesno_tdnn/lang_phone/words.txt \
--HLG ./tmp/icefall_asr_yesno_tdnn/lang_phone/HLG.pt \
./tmp/icefall_asr_yesno_tdnn/test_waves/0_0_1_0_1_0_0_1.wav
The output is:
.. code-block::
2021-08-24 12:22:51,621 INFO [pretrained.py:119] {'feature_dim': 23, 'num_classes': 4, 'sample_rate': 8000, 'search_beam': 20, 'output_beam': 8, 'min_active_states': 30, 'max_active_states': 10000, 'use_double_scores': True, 'checkpoint': './tmp/icefall_asr_yesno_tdnn/pretrained.pt', 'words_file': './tmp/icefall_asr_yesno_tdnn/lang_phone/words.txt', 'HLG': './tmp/icefall_asr_yesno_tdnn/lang_phone/HLG.pt', 'sound_files': ['./tmp/icefall_asr_yesno_tdnn/test_waves/0_0_1_0_1_0_0_1.wav']}
2021-08-24 12:22:51,645 INFO [pretrained.py:125] device: cpu
2021-08-24 12:22:51,645 INFO [pretrained.py:127] Creating model
2021-08-24 12:22:51,650 INFO [pretrained.py:139] Loading HLG from ./tmp/icefall_asr_yesno_tdnn/lang_phone/HLG.pt
2021-08-24 12:22:51,651 INFO [pretrained.py:143] Constructing Fbank computer
2021-08-24 12:22:51,652 INFO [pretrained.py:153] Reading sound files: ['./tmp/icefall_asr_yesno_tdnn/test_waves/0_0_1_0_1_0_0_1.wav']
2021-08-24 12:22:51,684 INFO [pretrained.py:159] Decoding started
2021-08-24 12:22:51,708 INFO [pretrained.py:198]
./tmp/icefall_asr_yesno_tdnn/test_waves/0_0_1_0_1_0_0_1.wav:
NO NO YES NO YES NO NO YES
2021-08-24 12:22:51,708 INFO [pretrained.py:200] Decoding Done
You can see that for the sound file ``0_0_1_0_1_0_0_1.wav``, the decoding result is
``NO NO YES NO YES NO NO YES``.
To decode **multiple** files at the same time, you can use:
.. code-block:: bash
./tdnn/pretrained.py \
--checkpoint ./tmp/icefall_asr_yesno_tdnn/pretrained.pt \
--words-file ./tmp/icefall_asr_yesno_tdnn/lang_phone/words.txt \
--HLG ./tmp/icefall_asr_yesno_tdnn/lang_phone/HLG.pt \
./tmp/icefall_asr_yesno_tdnn/test_waves/0_0_1_0_1_0_0_1.wav \
./tmp/icefall_asr_yesno_tdnn/test_waves/1_0_1_1_0_1_1_1.wav
The decoding output is:
.. code-block::
2021-08-24 12:25:20,159 INFO [pretrained.py:119] {'feature_dim': 23, 'num_classes': 4, 'sample_rate': 8000, 'search_beam': 20, 'output_beam': 8, 'min_active_states': 30, 'max_active_states': 10000, 'use_double_scores': True, 'checkpoint': './tmp/icefall_asr_yesno_tdnn/pretrained.pt', 'words_file': './tmp/icefall_asr_yesno_tdnn/lang_phone/words.txt', 'HLG': './tmp/icefall_asr_yesno_tdnn/lang_phone/HLG.pt', 'sound_files': ['./tmp/icefall_asr_yesno_tdnn/test_waves/0_0_1_0_1_0_0_1.wav', './tmp/icefall_asr_yesno_tdnn/test_waves/1_0_1_1_0_1_1_1.wav']}
2021-08-24 12:25:20,181 INFO [pretrained.py:125] device: cpu
2021-08-24 12:25:20,181 INFO [pretrained.py:127] Creating model
2021-08-24 12:25:20,185 INFO [pretrained.py:139] Loading HLG from ./tmp/icefall_asr_yesno_tdnn/lang_phone/HLG.pt
2021-08-24 12:25:20,186 INFO [pretrained.py:143] Constructing Fbank computer
2021-08-24 12:25:20,187 INFO [pretrained.py:153] Reading sound files: ['./tmp/icefall_asr_yesno_tdnn/test_waves/0_0_1_0_1_0_0_1.wav',
'./tmp/icefall_asr_yesno_tdnn/test_waves/1_0_1_1_0_1_1_1.wav']
2021-08-24 12:25:20,213 INFO [pretrained.py:159] Decoding started
2021-08-24 12:25:20,287 INFO [pretrained.py:198]
./tmp/icefall_asr_yesno_tdnn/test_waves/0_0_1_0_1_0_0_1.wav:
NO NO YES NO YES NO NO YES
./tmp/icefall_asr_yesno_tdnn/test_waves/1_0_1_1_0_1_1_1.wav:
YES NO YES YES NO YES YES YES
2021-08-24 12:25:20,287 INFO [pretrained.py:200] Decoding Done
You can see again that it decodes correctly.
Colab notebook
--------------
We provide a Colab notebook for this recipe.
|yesno colab notebook|
.. |yesno colab notebook| image:: https://colab.research.google.com/assets/colab-badge.svg
:target: https://colab.research.google.com/drive/1tIjjzaJc3IvGyKiMCDWO-TSnBgkcuN3B?usp=sharing
**Congratulations!** You have finished the simplest speech recognition recipe in ``icefall``.


@@ -20,6 +20,7 @@ from icefall.utils import (
get_texts,
setup_logger,
store_transcripts,
str2bool,
write_error_stats,
)
@@ -44,6 +45,17 @@ def get_parser():
"consecutive checkpoints before the checkpoint specified by "
"'--epoch'. ",
)
parser.add_argument(
"--export",
type=str2bool,
default=False,
help="""When enabled, the averaged model is saved to
tdnn/exp/pretrained.pt. Note: only model.state_dict() is saved.
pretrained.pt contains a dict {"model": model.state_dict()},
which can be loaded by `icefall.checkpoint.load_checkpoint()`.
""",
)
return parser
@@ -279,6 +291,12 @@ def main():
logging.info(f"averaging {filenames}")
model.load_state_dict(average_checkpoints(filenames))
if params.export:
logging.info(f"Export averaged model to {params.exp_dir}/pretrained.pt")
torch.save(
{"model": model.state_dict()}, f"{params.exp_dir}/pretrained.pt"
)
model.to(device)
model.eval()
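For reference, a minimal sketch of how the exported ``pretrained.pt`` can be
loaded, mirroring what ``egs/yesno/ASR/tdnn/pretrained.py`` below does
(``Tdnn`` is the model class from ``tdnn/model.py``):

.. code-block:: python

    # Sketch: load the exported averaged model for inference.
    import torch
    from model import Tdnn  # defined in egs/yesno/ASR/tdnn/model.py

    model = Tdnn(num_features=23, num_classes=4)
    checkpoint = torch.load("tdnn/exp/pretrained.pt", map_location="cpu")
    model.load_state_dict(checkpoint["model"])
    model.eval()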

209
egs/yesno/ASR/tdnn/pretrained.py Executable file

@@ -0,0 +1,209 @@
#!/usr/bin/env python3
# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import logging
import math
from typing import List
import k2
import kaldifeat
import torch
import torchaudio
from model import Tdnn
from torch.nn.utils.rnn import pad_sequence
from icefall.decode import get_lattice, one_best_decoding
from icefall.utils import AttributeDict, get_texts
def get_parser():
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument(
"--checkpoint",
type=str,
required=True,
help="Path to the checkpoint. "
"The checkpoint is assumed to be saved by "
"icefall.checkpoint.save_checkpoint().",
)
parser.add_argument(
"--words-file",
type=str,
required=True,
help="Path to words.txt",
)
parser.add_argument(
"--HLG", type=str, required=True, help="Path to HLG.pt."
)
parser.add_argument(
"sound_files",
type=str,
nargs="+",
help="The input sound file(s) to transcribe. "
"Supported formats are those supported by torchaudio.load(). "
"For example, wav and flac are supported. "
"The sample rate has to be 16kHz.",
)
return parser
def get_params() -> AttributeDict:
params = AttributeDict(
{
"feature_dim": 23,
"num_classes": 4, # [<blk>, N, SIL, Y]
"sample_rate": 8000,
"search_beam": 20,
"output_beam": 8,
"min_active_states": 30,
"max_active_states": 10000,
"use_double_scores": True,
}
)
return params
def read_sound_files(
filenames: List[str], expected_sample_rate: float
) -> List[torch.Tensor]:
"""Read a list of sound files into a list 1-D float32 torch tensors.
Args:
filenames:
A list of sound filenames.
expected_sample_rate:
The expected sample rate of the sound files.
Returns:
Return a list of 1-D float32 torch tensors.
"""
ans = []
for f in filenames:
wave, sample_rate = torchaudio.load(f)
assert sample_rate == expected_sample_rate, (
f"expected sample rate: {expected_sample_rate}. "
f"Given: {sample_rate}"
)
# We use only the first channel
ans.append(wave[0])
return ans
def main():
parser = get_parser()
args = parser.parse_args()
params = get_params()
params.update(vars(args))
logging.info(f"{params}")
device = torch.device("cpu")
if torch.cuda.is_available():
device = torch.device("cuda", 0)
logging.info(f"device: {device}")
logging.info("Creating model")
model = Tdnn(
num_features=params.feature_dim,
num_classes=params.num_classes,
)
checkpoint = torch.load(args.checkpoint, map_location="cpu")
model.load_state_dict(checkpoint["model"])
model.to(device)
model.eval()
logging.info(f"Loading HLG from {params.HLG}")
HLG = k2.Fsa.from_dict(torch.load(params.HLG, map_location="cpu"))
HLG = HLG.to(device)
logging.info("Constructing Fbank computer")
opts = kaldifeat.FbankOptions()
opts.device = device
opts.frame_opts.dither = 0
opts.frame_opts.snip_edges = False
opts.frame_opts.samp_freq = params.sample_rate
opts.mel_opts.num_bins = params.feature_dim
fbank = kaldifeat.Fbank(opts)
logging.info(f"Reading sound files: {params.sound_files}")
waves = read_sound_files(
filenames=params.sound_files, expected_sample_rate=params.sample_rate
)
waves = [w.to(device) for w in waves]
logging.info("Decoding started")
features = fbank(waves)
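    # Pad to the length of the longest utterance in the batch;
    # math.log(1e-10) corresponds to near-zero energy on the log-mel scale.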
features = pad_sequence(
features, batch_first=True, padding_value=math.log(1e-10)
)
    # Gradients are not needed for decoding.
with torch.no_grad():
nnet_output = model(features)
batch_size = nnet_output.shape[0]
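    # Each row of supervision_segments is
    # (sequence_index, start_frame, num_frames).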
supervision_segments = torch.tensor(
[[i, 0, nnet_output.shape[1]] for i in range(batch_size)],
dtype=torch.int32,
)
lattice = get_lattice(
nnet_output=nnet_output,
HLG=HLG,
supervision_segments=supervision_segments,
search_beam=params.search_beam,
output_beam=params.output_beam,
min_active_states=params.min_active_states,
max_active_states=params.max_active_states,
)
best_path = one_best_decoding(
lattice=lattice, use_double_scores=params.use_double_scores
)
hyps = get_texts(best_path)
word_sym_table = k2.SymbolTable.from_file(params.words_file)
hyps = [[word_sym_table[i] for i in ids] for ids in hyps]
s = "\n"
for filename, hyp in zip(params.sound_files, hyps):
words = " ".join(hyp)
s += f"{filename}:\n{words}\n\n"
logging.info(s)
logging.info("Decoding Done")
if __name__ == "__main__":
formatter = (
"%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
)
logging.basicConfig(format=formatter, level=logging.INFO)
main()


@@ -60,6 +60,16 @@ def get_parser():
help="Number of epochs to train.",
)
parser.add_argument(
"--start-epoch",
type=int,
default=0,
help="""Resume training from from this epoch.
If it is positive, it will load checkpoint from
tdnn/exp/epoch-{start_epoch-1}.pt
""",
)
return parser
@@ -92,8 +102,6 @@ def get_params() -> AttributeDict:
- start_epoch: If it is not zero, load checkpoint `start_epoch-1`
and continue training from that checkpoint.
- num_epochs: Number of epochs to train.
- best_train_loss: Best training loss so far. It is used to select
the model that has the lowest training loss. It is
updated during the training.
@@ -420,6 +428,19 @@
f"batch size: {batch_size}"
)
if tb_writer is not None:
tb_writer.add_scalar(
"train/current_loss",
loss_cpu / params.train_frames,
params.batch_idx_train,
)
tb_writer.add_scalar(
"train/tot_avg_loss",
tot_avg_loss,
params.batch_idx_train,
)
if batch_idx > 0 and batch_idx % params.valid_interval == 0:
compute_validation_loss(
params=params,
@@ -434,6 +455,12 @@
f" best valid loss: {params.best_valid_loss:.4f} "
f"best valid epoch: {params.best_valid_epoch}"
)
if tb_writer is not None:
tb_writer.add_scalar(
"train/valid_loss",
params.valid_loss,
params.batch_idx_train,
)
params.train_loss = tot_loss / tot_frames