From dbf5dcaed972d53f65a5dc4b84bcd3625ddc5378 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Fri, 16 Jul 2021 21:23:33 +0800 Subject: [PATCH] Add documentation. --- README.md | 38 ++++-- doc/Makefile | 20 ++++ doc/make.bat | 35 ++++++ doc/requirements.txt | 6 + doc/source/code/test_fbank.py | 72 ++++++++++++ doc/source/conf.py | 104 +++++++++++++++++ doc/source/index.rst | 24 ++++ doc/source/installation.rst | 54 +++++++++ doc/source/usage.rst | 212 ++++++++++++++++++++++++++++++++++ 9 files changed, 554 insertions(+), 11 deletions(-) create mode 100644 doc/Makefile create mode 100644 doc/make.bat create mode 100644 doc/requirements.txt create mode 100755 doc/source/code/test_fbank.py create mode 100644 doc/source/conf.py create mode 100644 doc/source/index.rst create mode 100644 doc/source/installation.rst create mode 100644 doc/source/usage.rst diff --git a/README.md b/README.md index 744bdf9..d675805 100644 --- a/README.md +++ b/README.md @@ -4,20 +4,36 @@ Wrap kaldi's feature computations to Python with PyTorch support. # Installation -`kaldifeat` can be installed by +## From PyPi with pip + +If you install `kaldifeat` using `pip`, it will also install +PyTorch 1.8.1. If this is not what you want, please install `kaldifeat` +from source (see below). ```bash pip install kaldifeat ``` -# TODOs +## From source -- [ ] Add Python interface -- [ ] Support torch.device so that it can switch between CUDA and CPU -- [ ] Add unit tests -- [ ] Set up GitHub actions -- [ ] Benchmark its speed and compare it with Kaldi -- [ ] Support batch processing of multiple waves -- [ ] Handle non-default parameters -- [ ] Support MFCC and other features available in Kaldi -- [ ] Publish it to PyPI +The following are the commands to compile `kaldifeat` from source. +We assume that you have installed `cmake` and PyTorch. +cmake 3.11 is known to work. Other cmake versions may also work. +PyTorch 1.8.1 is known to work. Other PyTorch versions may also work. 
+ +```bash +mkdir /some/path +git clone https://github.com/csukuangfj/kaldifeat.git +cd kaldifeat +python setup.py install +``` + +To test whether `kaldifeat` was installed successfully, you can run: +``` +python3 -c "import kaldifeat; print(kaldifeat.__version__)" +``` + +## Usage + +Please refer to +for how to use `kaldifeat`. diff --git a/doc/Makefile b/doc/Makefile new file mode 100644 index 0000000..d0c3cbf --- /dev/null +++ b/doc/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = source +BUILDDIR = build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/doc/make.bat b/doc/make.bat new file mode 100644 index 0000000..6247f7e --- /dev/null +++ b/doc/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. 
+ echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/doc/requirements.txt b/doc/requirements.txt new file mode 100644 index 0000000..9d9bf4b --- /dev/null +++ b/doc/requirements.txt @@ -0,0 +1,6 @@ +dataclasses +recommonmark +sphinx +sphinx-autodoc-typehints +sphinx_rtd_theme +sphinxcontrib-bibtex diff --git a/doc/source/code/test_fbank.py b/doc/source/code/test_fbank.py new file mode 100755 index 0000000..0f39a1c --- /dev/null +++ b/doc/source/code/test_fbank.py @@ -0,0 +1,72 @@ +#!/usr/bin/env python3 + +# Copyright 2021 Xiaomi Corporation (authors: Fangjun Kuang) + +import numpy as np +import soundfile as sf +import torch + +import kaldifeat + + +def read_wave(filename) -> torch.Tensor: + """Read a wave file and return it as a 1-D tensor. + + Note: + You don't need to scale it to [-32768, 32767]. + We use scaling here to follow the approach in Kaldi. + + Args: + filename: + Filename of a sound file. + Returns: + Return a 1-D tensor containing audio samples. 
+ """ + with sf.SoundFile(filename) as sf_desc: + sampling_rate = sf_desc.samplerate + assert sampling_rate == 16000 + data = sf_desc.read(dtype=np.float32, always_2d=False) + data *= 32768 + return torch.from_numpy(data) + + +def test_fbank(): + device = torch.device("cpu") + if torch.cuda.is_available(): + device = torch.device("cuda", 0) + + wave0 = read_wave("test_data/test.wav") + wave1 = read_wave("test_data/test2.wav") + + wave0 = wave0.to(device) + wave1 = wave1.to(device) + + opts = kaldifeat.FbankOptions() + opts.frame_opts.dither = 0 + opts.device = device + + fbank = kaldifeat.Fbank(opts) + + # We can compute fbank features in batches + features = fbank([wave0, wave1]) + assert isinstance(features, list), f"{type(features)}" + assert len(features) == 2 + + # We can also compute fbank features for a single wave + features0 = fbank(wave0) + features1 = fbank(wave1) + + assert torch.allclose(features[0], features0) + assert torch.allclose(features[1], features1) + + # To compute fbank features for only a specified frame + audio_frames = fbank.convert_samples_to_frames(wave0) + feature_frame_1 = fbank.compute(audio_frames[1]) + feature_frame_10 = fbank.compute(audio_frames[10]) + + assert torch.allclose(features0[1], feature_frame_1) + assert torch.allclose(features0[10], feature_frame_10) + + +if __name__ == "__main__": + test_fbank() diff --git a/doc/source/conf.py b/doc/source/conf.py new file mode 100644 index 0000000..2ec9ca6 --- /dev/null +++ b/doc/source/conf.py @@ -0,0 +1,104 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. 
If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +# import os +import re + +import sphinx_rtd_theme + +# import sys +# sys.path.insert(0, os.path.abspath('.')) + + +# -- Project information ----------------------------------------------------- + +project = "kaldifeat" +copyright = "2021, Fangjun Kuang" +author = "Fangjun Kuang" + + +def get_version(): + cmake_file = "../../CMakeLists.txt" + with open(cmake_file) as f: + content = f.read() + + version = re.search(r"set\(kaldifeat_VERSION (.*)\)", content).group(1) + return version.strip('"') + + +version = get_version() +release = version + + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + "recommonmark", + "sphinx.ext.autodoc", + "sphinx.ext.githubpages", + "sphinx.ext.napoleon", + "sphinx_autodoc_typehints", + "sphinx_rtd_theme", +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ["_templates"] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = [] + +source_suffix = { + ".rst": "restructuredtext", + ".md": "markdown", +} +master_doc = "index" + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = "sphinx_rtd_theme" +html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] +html_show_sourcelink = True + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. 
They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ["_static"] + +pygments_style = "sphinx" + +numfig = True + +html_context = { + "display_github": True, + "github_user": "csukuangfj", + "github_repo": "kaldifeat", + "github_version": "master", + "conf_py_path": "/kaldifeat/docs/source/", +} + +# refer to +# https://sphinx-rtd-theme.readthedocs.io/en/latest/configuring.html +html_theme_options = { + "logo_only": False, + "display_version": True, + "prev_next_buttons_location": "bottom", + "style_external_links": True, +} diff --git a/doc/source/index.rst b/doc/source/index.rst new file mode 100644 index 0000000..436e1b8 --- /dev/null +++ b/doc/source/index.rst @@ -0,0 +1,24 @@ +.. kaldifeat documentation master file, created by + sphinx-quickstart on Fri Jul 16 20:15:27 2021. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +kaldifeat +========= + +`kaldifeat `_ implements +feature extraction algorithms **compatible** with kaldi using PyTorch, supporting CUDA +as well as autograd. + +Currently, only fbank features are supported. +It can produce the same feature output as ``compute-fbank-feats`` (from kaldi) +when given the same options. + + + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + installation + usage diff --git a/doc/source/installation.rst b/doc/source/installation.rst new file mode 100644 index 0000000..9e4bfc8 --- /dev/null +++ b/doc/source/installation.rst @@ -0,0 +1,54 @@ +Installation +============ + +.. _from source: + +Install kaldifeat from source +----------------------------- + +You have to install ``cmake`` and ``PyTorch`` first. + + - ``cmake`` 3.11 is known to work. Other CMake versions may also work. + - ``PyTorch`` 1.8.1 is known to work. Other PyTorch versions may also work. + - Python >= 3.6 + + +The commands to install ``kaldifeat`` from source are: + +.. 
code-block:: bash + + git clone https://github.com/csukuangfj/kaldifeat + cd kaldifeat + python3 setup.py install + +To test that you have installed ``kaldifeat`` successfully, please run: + +.. code-block:: bash + + python3 -c "import kaldifeat; print(kaldifeat.__version__)" + +It should print the version, e.g., ``1.0``. + +Install kaldifeat from PyPI +--------------------------- + +The pre-built ``kaldifeat`` hosted on PyPI uses PyTorch 1.8.1. +If you install ``kaldifeat`` using pip, it will replace your locally +installed PyTorch automatically with PyTorch 1.8.1. + +If you don't want this to happen, please `Install kaldifeat from source`_. + +The command to install ``kaldifeat`` from PyPI is: + +.. code-block:: bash + + pip install kaldifeat + + +To test that you have installed ``kaldifeat`` successfully, please run: + +.. code-block:: bash + + python3 -c "import kaldifeat; print(kaldifeat.__version__)" + +It should print the version, e.g., ``1.0``. diff --git a/doc/source/usage.rst b/doc/source/usage.rst new file mode 100644 index 0000000..dd6a770 --- /dev/null +++ b/doc/source/usage.rst @@ -0,0 +1,212 @@ +Usage +===== + +Let us first see the help message of kaldi's ``compute-fbank-feats``: + +.. code-block:: bash + + $ compute-fbank-feats + + Create Mel-filter bank (FBANK) feature files. + Usage: compute-fbank-feats [options...] + + Options: + --allow-downsample : If true, allow the input waveform to have a higher frequency than the specified --sample-frequency (and we'll downsample). (bool, default = false) + --allow-upsample : If true, allow the input waveform to have a lower frequency than the specified --sample-frequency (and we'll upsample). (bool, default = false) + --blackman-coeff : Constant coefficient for generalized Blackman window. 
(float, default = 0.42) + --channel : Channel to extract (-1 -> expect mono, 0 -> left, 1 -> right) (int, default = -1) + --debug-mel : Print out debugging information for mel bin computation (bool, default = false) + --dither : Dithering constant (0.0 means no dither). If you turn this off, you should set the --energy-floor option, e.g. to 1.0 or 0.1 (float, default = 1) + --energy-floor : Floor on energy (absolute, not relative) in FBANK computation. Only makes a difference if --use-energy=true; only necessary if --dither=0.0. Suggested values: 0.1 or 1.0 (float, default = 0) + --frame-length : Frame length in milliseconds (float, default = 25) + --frame-shift : Frame shift in milliseconds (float, default = 10) + --high-freq : High cutoff frequency for mel bins (if <= 0, offset from Nyquist) (float, default = 0) + --htk-compat : If true, put energy last. Warning: not sufficient to get HTK compatible features (need to change other parameters). (bool, default = false) + --low-freq : Low cutoff frequency for mel bins (float, default = 20) + --max-feature-vectors : Memory optimization. If larger than 0, periodically remove feature vectors so that only this number of the latest feature vectors is retained. (int, default = -1) + --min-duration : Minimum duration of segments to process (in seconds). (float, default = 0) + --num-mel-bins : Number of triangular mel-frequency bins (int, default = 23) + --output-format : Format of the output files [kaldi, htk] (string, default = "kaldi") + --preemphasis-coefficient : Coefficient for use in signal preemphasis (float, default = 0.97) + --raw-energy : If true, compute energy before preemphasis and windowing (bool, default = true) + --remove-dc-offset : Subtract mean from waveform on each frame (bool, default = true) + --round-to-power-of-two : If true, round window size to power of two by zero-padding input to FFT. 
(bool, default = true) + --sample-frequency : Waveform data sample frequency (must match the waveform file, if specified there) (float, default = 16000) + --snip-edges : If true, end effects will be handled by outputting only frames that completely fit in the file, and the number of frames depends on the frame-length. If false, the number of frames depends only on the frame-shift, and we reflect the data at the ends. (bool, default = true) + --subtract-mean : Subtract mean of each feature file [CMS]; not recommended to do it this way. (bool, default = false) + --use-energy : Add an extra dimension with energy to the FBANK output. (bool, default = false) + --use-log-fbank : If true, produce log-filterbank, else produce linear. (bool, default = true) + --use-power : If true, use power, else use magnitude. (bool, default = true) + --utt2spk : Utterance to speaker-id map (if doing VTLN and you have warps per speaker) (string, default = "") + --vtln-high : High inflection point in piecewise linear VTLN warping function (if negative, offset from high-mel-freq (float, default = -500) + --vtln-low : Low inflection point in piecewise linear VTLN warping function (float, default = 100) + --vtln-map : Map from utterance or speaker-id to vtln warp factor (rspecifier) (string, default = "") + --vtln-warp : Vtln warp factor (only applicable if vtln-map not specified) (float, default = 1) + --window-type : Type of window ("hamming"|"hanning"|"povey"|"rectangular"|"sine"|"blackmann") (string, default = "povey") + --write-utt2dur : Wspecifier to write duration of each utterance in seconds, e.g. 'ark,t:utt2dur'. 
(string, default = "") + + Standard options: + --config : Configuration file to read (this option may be repeated) (string, default = "") + --help : Print out usage message (bool, default = false) + --print-args : Print the command line arguments (to stderr) (bool, default = true) + --verbose : Verbose level (higher->more logging) (int, default = 0) + +FbankOptions +------------ + +``kaldifeat`` reuses the same options from kaldi's ``compute-fbank-feats``. + +The following shows the default values of ``kaldifeat.FbankOptions``: + +.. code-block:: python + + >>> import kaldifeat + >>> fbank_opts = kaldifeat.FbankOptions() + >>> print(fbank_opts) + frame_opts: + samp_freq: 16000 + frame_shift_ms: 10 + frame_length_ms: 25 + dither: 1 + preemph_coeff: 0.97 + remove_dc_offset: 1 + window_type: povey + round_to_power_of_two: 1 + blackman_coeff: 0.42 + snip_edges: 1 + + + mel_opts: + num_bins: 23 + low_freq: 20 + high_freq: 0 + vtln_low: 100 + vtln_high: -500 + debug_mel: 0 + htk_mode: 0 + + use_energy: 0 + energy_floor: 0 + raw_energy: 1 + htk_compat: 0 + use_log_fbank: 1 + use_power: 1 + device: cpu + +It consists of three parts: + + - ``frame_opts`` + + Options in this part are accessed by ``frame_opts.xxx``. That is, to access + the sample rate, you use: + + .. code-block:: python + + >>> fbank_opts = kaldifeat.FbankOptions() + >>> print(fbank_opts.frame_opts.samp_freq) + 16000.0 + + - ``mel_opts`` + + Options in this part are accessed by ``mel_opts.xxx``. That is, to access + the number of mel bins, you use: + + .. code-block:: python + + >>> fbank_opts = kaldifeat.FbankOptions() + >>> print(fbank_opts.mel_opts.num_bins) + 23 + + - fbank related + + Options in this part are accessed directly. That is, to access the device + field, you use: + + .. 
code-block:: + + >>> print(fbank_opts.device) + cpu + >>> fbank_opts.device = 'cuda:0' + >>> print(fbank_opts.device) + cuda:0 + >>> import torch + >>> fbank_opts.device = torch.device('cuda', 0) + >>> print(fbank_opts.device) + cuda:0 + + + +To change the sample rate to 8000, you can use: + +.. code-block:: python + + >>> fbank_opts = kaldifeat.FbankOptions() + >>> print(fbank_opts.frame_opts.samp_freq) + 16000.0 + >>> fbank_opts.frame_opts.samp_freq = 8000 + >>> print(fbank_opts.frame_opts.samp_freq) + 8000.0 + +To change ``snip_edges`` to ``False``, you can use: + +.. code-block:: python + + >>> fbank_opts.frame_opts.snip_edges = False + >>> print(fbank_opts.frame_opts.snip_edges) + False + +To change the number of mel bins to 80, you can use: + +.. code-block:: python + + >>> print(fbank_opts.mel_opts.num_bins) + 23 + >>> fbank_opts.mel_opts.num_bins = 80 + >>> print(fbank_opts.mel_opts.num_bins) + 80 + +To change the device to ``cuda``, assign to ``fbank_opts.device`` as shown in the device example above (e.g., ``fbank_opts.device = 'cuda:0'``). + + +Fbank +----- + +The following shows how to use ``kaldifeat.Fbank`` to compute +the fbank features of sound files. + +First, let us generate two sound files using ``sox``: + +.. code-block:: bash + + # generate a wav of 1.2 seconds, containing a sine-wave + # swept from 300 Hz to 3300 Hz + sox -n -r 16000 -b 16 test.wav synth 1.2 sine 300-3300 + + # another sound file with 0.5 seconds + sox -n -r 16000 -b 16 test2.wav synth 0.5 sine 300-3300 + +.. hint:: + + You can find the above two files by visiting the following two links: + + - `test.wav `_ + - `test2.wav `_ + +The `following code `_ +shows the usage of ``kaldifeat.Fbank``. + +It shows: + + - How to read a sound file. Note that audio samples are scaled to the range [-32768, 32767]. + The intention is to produce the same output as kaldi. 
You don't need to scale it if + you don't care about the compatibility with kaldi. + + - ``kaldifeat.Fbank`` supports CUDA as well as CPU + + - ``kaldifeat.Fbank`` supports processing sound files in a batch as well as accepting + a single sound file + + +.. literalinclude:: ./code/test_fbank.py + :caption: Demo of ``kaldifeat.Fbank`` + :language: python