diff --git a/.github/workflows/build-doc.yml b/.github/workflows/build-doc.yml new file mode 100644 index 0000000..869b5c4 --- /dev/null +++ b/.github/workflows/build-doc.yml @@ -0,0 +1,62 @@ +# Copyright 2022 Xiaomi Corp. (author: Fangjun Kuang) + +# See ../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# refer to https://github.com/actions/starter-workflows/pull/47/files + +# You can access it at https://csukuangfj.github.io/kaldifeat +name: Generate doc +on: + push: + branches: + - master + - doc + +jobs: + build-doc: + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest] + python-version: [3.8] + steps: + # refer to https://github.com/actions/checkout + - uses: actions/checkout@v2 + with: + fetch-depth: 0 + + - name: Setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + + - name: Display Python version + run: python -c "import sys; print(sys.version)" + + - name: Build doc + shell: bash + run: | + cd doc + python3 -m pip install -r ./requirements.txt + make html + touch build/html/.nojekyll + + - name: Deploy + uses: peaceiris/actions-gh-pages@v3 + with: + github_token: ${{ secrets.GITHUB_TOKEN }} + publish_dir: ./doc/build/html + publish_branch: gh-pages diff --git a/README.md b/README.md index 9513b14..ab6c794 100644 --- a/README.md +++ b/README.md @@ -7,6 +7,10 @@ +[![Documentation 
Status](https://github.com/csukuangfj/kaldifeat/actions/workflows/build-doc.yml/badge.svg)](https://csukuangfj.github.io/kaldifeat/) + +**Documentation**: + @@ -277,98 +281,6 @@ See +for installation. diff --git a/doc/source/_static/.gitkeep b/doc/source/_static/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/doc/source/code/test_fbank.py b/doc/source/code/test_fbank.py deleted file mode 100755 index 0f39a1c..0000000 --- a/doc/source/code/test_fbank.py +++ /dev/null @@ -1,72 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2021 Xiaomi Corporation (authors: Fangjun Kuang) - -import numpy as np -import soundfile as sf -import torch - -import kaldifeat - - -def read_wave(filename) -> torch.Tensor: - """Read a wave file and return it as a 1-D tensor. - - Note: - You don't need to scale it to [-32768, 32767]. - We use scaling here to follow the approach in Kaldi. - - Args: - filename: - Filename of a sound file. - Returns: - Return a 1-D tensor containing audio samples. - """ - with sf.SoundFile(filename) as sf_desc: - sampling_rate = sf_desc.samplerate - assert sampling_rate == 16000 - data = sf_desc.read(dtype=np.float32, always_2d=False) - data *= 32768 - return torch.from_numpy(data) - - -def test_fbank(): - device = torch.device("cpu") - if torch.cuda.is_available(): - device = torch.device("cuda", 0) - - wave0 = read_wave("test_data/test.wav") - wave1 = read_wave("test_data/test2.wav") - - wave0 = wave0.to(device) - wave1 = wave1.to(device) - - opts = kaldifeat.FbankOptions() - opts.frame_opts.dither = 0 - opts.device = device - - fbank = kaldifeat.Fbank(opts) - - # We can compute fbank features in batches - features = fbank([wave0, wave1]) - assert isinstance(features, list), f"{type(features)}" - assert len(features) == 2 - - # We can also compute fbank features for a single wave - features0 = fbank(wave0) - features1 = fbank(wave1) - - assert torch.allclose(features[0], features0) - assert torch.allclose(features[1], features1) - - # To compute 
fbank features for only a specified frame - audio_frames = fbank.convert_samples_to_frames(wave0) - feature_frame_1 = fbank.compute(audio_frames[1]) - feature_frame_10 = fbank.compute(audio_frames[10]) - - assert torch.allclose(features0[1], feature_frame_1) - assert torch.allclose(features0[10], feature_frame_10) - - -if __name__ == "__main__": - test_fbank() diff --git a/doc/source/conf.py b/doc/source/conf.py index 2ec9ca6..fef6d6f 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -59,7 +59,7 @@ templates_path = ["_templates"] # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. -exclude_patterns = [] +exclude_patterns = ["images/*.md"] source_suffix = { ".rst": "restructuredtext", @@ -102,3 +102,35 @@ html_theme_options = { "prev_next_buttons_location": "bottom", "style_external_links": True, } + +rst_epilog = """ +.. _kaldifeat: https://github.com/csukuangfj/kaldifeat +.. _Kaldi: https://github.com/kaldi-asr/kaldi +.. _PyTorch: https://pytorch.org/ +.. _kaldifeat.Fbank: https://github.com/csukuangfj/kaldifeat/blob/master/kaldifeat/python/kaldifeat/fbank.py#L10 +.. _kaldifeat.Mfcc: https://github.com/csukuangfj/kaldifeat/blob/master/kaldifeat/python/kaldifeat/mfcc.py#L10 +.. _kaldifeat.Plp: https://github.com/csukuangfj/kaldifeat/blob/master/kaldifeat/python/kaldifeat/plp.py#L10 +.. _kaldifeat.Spectrogram: https://github.com/csukuangfj/kaldifeat/blob/master/kaldifeat/python/kaldifeat/spectrogram.py#L9 +.. _kaldifeat.OnlineFbank: https://github.com/csukuangfj/kaldifeat/blob/master/kaldifeat/python/kaldifeat/fbank.py#L16 +.. _kaldifeat.OnlineMfcc: https://github.com/csukuangfj/kaldifeat/blob/master/kaldifeat/python/kaldifeat/mfcc.py#L16 +.. _kaldifeat.OnlinePlp: https://github.com/csukuangfj/kaldifeat/blob/master/kaldifeat/python/kaldifeat/plp.py#L16 +.. 
_compute-fbank-feats: https://github.com/kaldi-asr/kaldi/blob/master/src/featbin/compute-fbank-feats.cc +.. _compute-mfcc-feats: https://github.com/kaldi-asr/kaldi/blob/master/src/featbin/compute-mfcc-feats.cc +.. _compute-plp-feats: https://github.com/kaldi-asr/kaldi/blob/master/src/featbin/compute-plp-feats.cc +.. _compute-spectrogram-feats: https://github.com/kaldi-asr/kaldi/blob/master/src/featbin/compute-spectrogram-feats.cc +.. _kaldi::OnlineFbank: https://github.com/kaldi-asr/kaldi/blob/master/src/feat/online-feature.h#L160 +.. _kaldi::OnlineMfcc: https://github.com/kaldi-asr/kaldi/blob/master/src/feat/online-feature.h#L158 +.. _kaldi::OnlinePlp: https://github.com/kaldi-asr/kaldi/blob/master/src/feat/online-feature.h#L159 +.. _kaldifeat.FbankOptions: https://github.com/csukuangfj/kaldifeat/blob/master/kaldifeat/csrc/feature-fbank.h#L19 +.. _kaldi::FbankOptions: https://github.com/kaldi-asr/kaldi/blob/master/src/feat/feature-fbank.h#L41 +.. _kaldifeat.MfccOptions: https://github.com/csukuangfj/kaldifeat/blob/master/kaldifeat/csrc/feature-mfcc.h#L22 +.. _kaldi::MfccOptions: https://github.com/kaldi-asr/kaldi/blob/master/src/feat/feature-mfcc.h#L38 +.. _kaldifeat.PlpOptions: https://github.com/csukuangfj/kaldifeat/blob/master/kaldifeat/csrc/feature-plp.h#L24 +.. _kaldi::PlpOptions: https://github.com/kaldi-asr/kaldi/blob/master/src/feat/feature-plp.h#L42 +.. _kaldifeat.SpectrogramOptions: https://github.com/csukuangfj/kaldifeat/blob/master/kaldifeat/csrc/feature-spectrogram.h#L18 +.. _kaldi::SpectrogramOptions: https://github.com/kaldi-asr/kaldi/blob/master/src/feat/feature-spectrogram.h#L38 +.. _kaldifeat.FrameExtractionOptions: https://github.com/csukuangfj/kaldifeat/blob/master/kaldifeat/csrc/feature-window.h#L30 +.. _kaldi::FrameExtractionOptions: https://github.com/kaldi-asr/kaldi/blob/master/src/feat/feature-window.h#L35 +.. _kaldifeat.MelBanksOptions: https://github.com/csukuangfj/kaldifeat/blob/master/kaldifeat/csrc/mel-computations.h#L17 +.. 
_kaldi::MelBanksOptions: https://github.com/kaldi-asr/kaldi/blob/master/src/feat/mel-computations.h#L43 +""" diff --git a/doc/source/index.rst b/doc/source/index.rst index 436e1b8..caa50b5 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -6,19 +6,11 @@ kaldifeat ========= -`kaldifeat `_ implements -feature extraction algorithms **compatible** with kaldi using PyTorch, supporting CUDA -as well as autograd. - -Currently, only fbank features are supported. -It can produce the same feature output as ``compute-fbank-feats`` (from kaldi) -when given the same options. - - .. toctree:: :maxdepth: 2 - :caption: Contents: + :caption: Contents + intro installation - usage + usage/index diff --git a/doc/source/installation.rst b/doc/source/installation.rst index 9e4bfc8..5baa217 100644 --- a/doc/source/installation.rst +++ b/doc/source/installation.rst @@ -1,19 +1,38 @@ Installation ============ + - |os_types| + - |python_versions| + - |pytorch_versions| + - |cuda_versions| + +.. caution:: + + `kaldifeat`_ depends on `PyTorch`_. `PyTorch`_ >= 1.5.0 is known to work. + + Please first install `PyTorch`_ before you install `kaldifeat`_. + +.. hint:: + + To install a CPU version of `kaldifeat`_, please install a CPU version + of `PyTorch`_. + + To install a CUDA version of `kaldifeat`_, please install a CUDA version + of `PyTorch`_. CUDA >= 10.1 is known to work. + .. _from source: Install kaldifeat from source ----------------------------- -You have to install ``cmake`` and ``PyTorch`` first. +You have to install ``cmake`` and `PyTorch`_ first. - ``cmake`` 3.11 is known to work. Other CMake versions may also work. - - ``PyTorch`` 1.8.1 is known to work. Other PyTorch versions may also work. + - `PyTorch`_ >= 1.5.0 is known to work. Other PyTorch versions may also work. - Python >= 3.6 -The commands to install ``kaldifeat`` from source are: +The commands to install `kaldifeat`_ from source are: .. 
code-block:: bash @@ -21,7 +40,7 @@ The commands to install ``kaldifeat`` from source are: cd kaldifeat python3 setup.py install -To test that you have installed ``kaldifeat`` successfully, please run: +To test that you have installed `kaldifeat`_ successfully, please run: .. code-block:: bash @@ -29,26 +48,120 @@ To test that you have installed ``kaldifeat`` successfully, please run: It should print the version, e.g., ``1.0``. +.. _from PyPI: + Install kaldifeat from PyPI --------------------------- -The pre-built ``kaldifeat`` hosted on PyPI uses PyTorch 1.8.1. -If you install ``kaldifeat`` using pip, it will replace your locally -installed PyTorch automatically with PyTorch 1.8.1. - -If you don't want this happen, please `Install kaldifeat from source`_. - -The command to install ``kaldifeat`` from PyPI is: +The command to install `kaldifeat`_ from PyPI is: .. code-block:: bash - pip install kaldifeat + pip install --verbose kaldifeat - -To test that you have installed ``kaldifeat`` successfully, please run: +To test that you have installed `kaldifeat`_ successfully, please run: .. code-block:: bash python3 -c "import kaldifeat; print(kaldifeat.__version__)" It should print the version, e.g., ``1.0``. + +Install kaldifeat from conda (Only for Linux) +--------------------------------------------- + +.. hint:: + + Installation using ``conda`` supports only Linux. For macOS and Windows, + please use either :ref:`from source` or :ref:`from PyPI`. + +The command to install `kaldifeat`_ using ``conda`` is + +.. code-block:: bash + + conda install -c kaldifeat -c pytorch -c conda-forge kaldifeat python=3.8 cudatoolkit=11.1 pytorch=1.8.1 + +You can select the supported Python version, CUDA toolkit version and `PyTorch`_ version as you wish. + +To install a CPU version of `kaldifeat`_, use: + +.. code-block:: bash + + conda install -c kaldifeat -c pytorch cpuonly kaldifeat python=3.8 pytorch=1.8.1 + +.. 
caution:: + + If you encounter issues about missing GLIBC after installing `kaldifeat`_ + with ``conda``, please consider :ref:`from source` or :ref:`from PyPI`. + The reason is that the package was built using Ubuntu 18.04 and your system's + GLIBC is older. + + +.. |os_types| image:: ./images/os-green.svg + :alt: Supported operating systems + +.. |python_versions| image:: ./images/python_ge_3.6-blue.svg + :alt: Supported python versions + +.. |cuda_versions| image:: ./images/cuda_ge_10.1-orange.svg + :alt: Supported cuda versions + +.. |pytorch_versions| image:: ./images/pytorch_ge_1.5.0-green.svg + :alt: Supported pytorch versions + +To test that you have installed `kaldifeat`_ successfully, please run: + +.. code-block:: bash + + python3 -c "import kaldifeat; print(kaldifeat.__version__)" + +It should print the version, e.g., ``1.0``. + +FAQs +---- + +How to install a CUDA version of kaldifeat +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +You need to first install a CUDA version of `PyTorch`_ and then install `kaldifeat`_. + +.. note:: + + You can use a CUDA version of `kaldifeat`_ on machines with no GPUs. + +How to install a CPU version of kaldifeat +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +You need to first install a CPU version of `PyTorch`_ and then install `kaldifeat`_. + +How to fix `Caffe2: Cannot find cuDNN library` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: + + Your installed Caffe2 version uses cuDNN but I cannot find the cuDNN + libraries. Please set the proper cuDNN prefixes and / or install cuDNN. + +You will have such an error when you want to install a CUDA version of `kaldifeat`_ +by ``pip install kaldifeat`` or from source. + +You need to first install cuDNN. Assume you have installed cuDNN to the +path ``/path/to/cudnn``. You can fix the error by using ``one`` of the following +commands. + +(1) Fix for installation using ``pip install`` + +.. 
code-block:: bash + + export KALDIFEAT_CMAKE_ARGS="-DCMAKE_BUILD_TYPE=Release -DCUDNN_LIBRARY_PATH=/path/to/cudnn/lib/libcudnn.so -DCUDNN_INCLUDE_PATH=/path/to/cudnn/include" + pip install --verbose kaldifeat + +(2) Fix for installation from source + +.. code-block:: bash + + mkdir /some/path + git clone https://github.com/csukuangfj/kaldifeat.git + cd kaldifeat + export KALDIFEAT_CMAKE_ARGS="-DCMAKE_BUILD_TYPE=Release -DCUDNN_LIBRARY_PATH=/path/to/cudnn/lib/libcudnn.so -DCUDNN_INCLUDE_PATH=/path/to/cudnn/include" + python setup.py install diff --git a/doc/source/intro.rst b/doc/source/intro.rst new file mode 100644 index 0000000..6e66c36 --- /dev/null +++ b/doc/source/intro.rst @@ -0,0 +1,103 @@ +Introduction +============ + +`kaldifeat`_ implements +speech feature extraction algorithms **compatible** with `Kaldi`_ using `PyTorch`_, +supporting CUDA as well as autograd. + +`kaldifeat`_ has the following features: + + - Fully compatible with `Kaldi`_ + + .. note:: + + The underlying C++ code is copied & modified from `Kaldi`_ directly. + It is rewritten with `PyTorch`_ C++ APIs. + + - Provide not only ``C++ APIs`` but also ``Python APIs`` + + .. note:: + + You can access `kaldifeat`_ from ``Python``. + + - Support autograd + - Support ``CUDA`` and ``CPU`` + + .. note:: + + You can use CUDA for feature extraction. + + - Support ``online`` (i.e., ``streaming``) and ``offline`` (i.e., ``non-streaming``) + feature extraction + - Support chunk-based processing + + .. note:: + + This is especially useful if you want to process audios of several + hours long, which may cause OOM if you send them for computation at once. + With chunk-based processing, you can process audios of arbitrary length. + + - Support batch processing + + .. note:: + + With `kaldifeat`_ you can extract features for a batch of audios + + +.. 
see https://sublime-and-sphinx-guide.readthedocs.io/en/latest/tables.html + +Currently implemented speech features and their counterparts in `Kaldi`_ are +listed in the following table. + +.. list-table:: Supported speech features + :widths: 50 50 + :header-rows: 1 + + * - Supported speech features + - Counterpart in `Kaldi`_ + * - `kaldifeat.Fbank`_ + - `compute-fbank-feats`_ + * - `kaldifeat.Mfcc`_ + - `compute-mfcc-feats`_ + * - `kaldifeat.Plp`_ + - `compute-plp-feats`_ + * - `kaldifeat.Spectrogram`_ + - `compute-spectrogram-feats`_ + * - `kaldifeat.OnlineFbank`_ + - `kaldi::OnlineFbank`_ + * - `kaldifeat.OnlineMfcc`_ + - `kaldi::OnlineMfcc`_ + * - `kaldifeat.OnlinePlp`_ + - `kaldi::OnlinePlp`_ + +Each feature computer needs an option. The following table lists the options +for each computer and the corresponding options in `Kaldi`_. + +.. hint:: + + Note that we reuse the parameter names from `Kaldi`_. + + Also, both online feature computers and offline feature computers share the + same option. + +.. list-table:: Feature computer options + :widths: 50 50 + :header-rows: 1 + + * - Options in `kaldifeat`_ + - Corresponding options in `Kaldi`_ + * - `kaldifeat.FbankOptions`_ + - `kaldi::FbankOptions`_ + * - `kaldifeat.MfccOptions`_ + - `kaldi::MfccOptions`_ + * - `kaldifeat.PlpOptions`_ + - `kaldi::PlpOptions`_ + * - `kaldifeat.SpectrogramOptions`_ + - `kaldi::SpectrogramOptions`_ + * - `kaldifeat.FrameExtractionOptions`_ + - `kaldi::FrameExtractionOptions`_ + * - `kaldifeat.MelBanksOptions`_ + - `kaldi::MelBanksOptions`_ + +Read more to learn how to install `kaldifeat`_ and how to use each feature +computer. diff --git a/doc/source/usage.rst b/doc/source/usage.rst deleted file mode 100644 index dd6a770..0000000 --- a/doc/source/usage.rst +++ /dev/null @@ -1,212 +0,0 @@ -Usage -===== - -Let us first see the help message of kaldi's ``compute-fbank-feats``: - -.. code-block:: bash - - $ compute-fbank-feats - - Create Mel-filter bank (FBANK) feature files. 
- Usage: compute-fbank-feats [options...] - - Options: - --allow-downsample : If true, allow the input waveform to have a higher frequency than the specified --sample-frequency (and we'll downsample). (bool, default = false) - --allow-upsample : If true, allow the input waveform to have a lower frequency than the specified --sample-frequency (and we'll upsample). (bool, default = false) - --blackman-coeff : Constant coefficient for generalized Blackman window. (float, default = 0.42) - --channel : Channel to extract (-1 -> expect mono, 0 -> left, 1 -> right) (int, default = -1) - --debug-mel : Print out debugging information for mel bin computation (bool, default = false) - --dither : Dithering constant (0.0 means no dither). If you turn this off, you should set the --energy-floor option, e.g. to 1.0 or 0.1 (float, default = 1) - --energy-floor : Floor on energy (absolute, not relative) in FBANK computation. Only makes a difference if --use-energy=true; only necessary if --dither=0.0. Suggested values: 0.1 or 1.0 (float, default = 0) - --frame-length : Frame length in milliseconds (float, default = 25) - --frame-shift : Frame shift in milliseconds (float, default = 10) - --high-freq : High cutoff frequency for mel bins (if <= 0, offset from Nyquist) (float, default = 0) - --htk-compat : If true, put energy last. Warning: not sufficient to get HTK compatible features (need to change other parameters). (bool, default = false) - --low-freq : Low cutoff frequency for mel bins (float, default = 20) - --max-feature-vectors : Memory optimization. If larger than 0, periodically remove feature vectors so that only this number of the latest feature vectors is retained. (int, default = -1) - --min-duration : Minimum duration of segments to process (in seconds). 
(float, default = 0) - --num-mel-bins : Number of triangular mel-frequency bins (int, default = 23) - --output-format : Format of the output files [kaldi, htk] (string, default = "kaldi") - --preemphasis-coefficient : Coefficient for use in signal preemphasis (float, default = 0.97) - --raw-energy : If true, compute energy before preemphasis and windowing (bool, default = true) - --remove-dc-offset : Subtract mean from waveform on each frame (bool, default = true) - --round-to-power-of-two : If true, round window size to power of two by zero-padding input to FFT. (bool, default = true) - --sample-frequency : Waveform data sample frequency (must match the waveform file, if specified there) (float, default = 16000) - --snip-edges : If true, end effects will be handled by outputting only frames that completely fit in the file, and the number of frames depends on the frame-length. If false, the number of frames depends only on the frame-shift, and we reflect the data at the ends. (bool, default = true) - --subtract-mean : Subtract mean of each feature file [CMS]; not recommended to do it this way. (bool, default = false) - --use-energy : Add an extra dimension with energy to the FBANK output. (bool, default = false) - --use-log-fbank : If true, produce log-filterbank, else produce linear. (bool, default = true) - --use-power : If true, use power, else use magnitude. 
(bool, default = true) - --utt2spk : Utterance to speaker-id map (if doing VTLN and you have warps per speaker) (string, default = "") - --vtln-high : High inflection point in piecewise linear VTLN warping function (if negative, offset from high-mel-freq (float, default = -500) - --vtln-low : Low inflection point in piecewise linear VTLN warping function (float, default = 100) - --vtln-map : Map from utterance or speaker-id to vtln warp factor (rspecifier) (string, default = "") - --vtln-warp : Vtln warp factor (only applicable if vtln-map not specified) (float, default = 1) - --window-type : Type of window ("hamming"|"hanning"|"povey"|"rectangular"|"sine"|"blackmann") (string, default = "povey") - --write-utt2dur : Wspecifier to write duration of each utterance in seconds, e.g. 'ark,t:utt2dur'. (string, default = "") - - Standard options: - --config : Configuration file to read (this option may be repeated) (string, default = "") - --help : Print out usage message (bool, default = false) - --print-args : Print the command line arguments (to stderr) (bool, default = true) - --verbose : Verbose level (higher->more logging) (int, default = 0) - -FbankOptions ------------- - -``kaldifeat`` reuses the same options from kaldi's ``compute-fbank-feats``. - -The following shows the default values of ``kaldifeat.FbankOptions``: - -.. 
code-block:: python - - >>> import kaldifeat - >>> fbank_opts = kaldifeat.FbankOptions() - >>> print(fbank_opts) - frame_opts: - samp_freq: 16000 - frame_shift_ms: 10 - frame_length_ms: 25 - dither: 1 - preemph_coeff: 0.97 - remove_dc_offset: 1 - window_type: povey - round_to_power_of_two: 1 - blackman_coeff: 0.42 - snip_edges: 1 - - - mel_opts: - num_bins: 23 - low_freq: 20 - high_freq: 0 - vtln_low: 100 - vtln_high: -500 - debug_mel: 0 - htk_mode: 0 - - use_energy: 0 - energy_floor: 0 - raw_energy: 1 - htk_compat: 0 - use_log_fbank: 1 - use_power: 1 - device: cpu - -It consists of three parts: - - - ``frame_opts`` - - Options in this part are accessed by ``frame_opts.xxx``. That is, to access - the sample rate, you use: - - .. code-block:: python - - >>> fbank_opts = kaldifeat.FbankOptions() - >>> print(fbank_opts.frame_opts.samp_freq) - 16000.0 - - - ``mel_opts`` - - Options in this part are accessed by ``mel_opts.xxx``. That is, to access - the number of mel bins, you use: - - .. code-block:: python - - >>> fbank_opts = kaldifeat.FbankOptions() - >>> print(fbank_opts.mel_opts.num_bins) - 23 - - - fbank related - - Options in this part are accessed directly. That is, to access the device - field, you use: - - .. code-block:: - - >>> print(fbank_opts.device) - cpu - >>> fbank_opts.device = 'cuda:0' - >>> print(fbank_opts.device) - cuda:0 - >>> import torch - >>> fbank_opts.device = torch.device('cuda', 0) - >>> print(fbank_opts.device) - cuda:0 - - - -To change the sample rate to 8000, you can use: - -.. code-block:: python - - >>> fbank_opts = kaldifeat.FbankOptions() - >>> print(fbank_opts.frame_opts.samp_freq) - 16000.0 - >>> fbank_opts.frame_opts.samp_freq = 8000 - >>> print(fbank_opts.frame_opts.samp_freq) - 8000.0 - -To change ``snip_edges`` to ``False``, you can use: - -.. code-block:: python - - >>> fbank_opts.frame_opts.snip_edges = False - >>> print(fbank_opts.frame_opts.snip_edges) - False - -To change number of mel bins to 80, you can use: - -.. 
code-block:: python - - >>> print(fbank_opts.mel_opts.num_bins) - 23 - >>> fbank_opts.mel_opts.num_bins = 80 - >>> print(fbank_opts.mel_opts.num_bins) - 80 - -To change the device to ``cuda``, you can use: - - -Fbank ------ - -The following shows how to use ``kaldifeat.Fbank`` to compute -the fbank features of sound files. - -First, let us generate two sound files using ``sox``: - -.. code-block:: bash - - # generate a wav of two seconds, containing a sine-wave - # swept from 300 Hz to 3300 Hz - sox -n -r 16000 -b 16 test.wav synth 1.2 sine 300-3300 - - # another sound file with 0.5 seconds - sox -n -r 16000 -b 16 test2.wav synth 0.5 sine 300-3300 - -.. hint:: - - You can find the above two files by visiting the following two links: - - - `test.wav `_ - - `test2.wav `_ - -The `following code `_ -shows the usage of ``kaldifeat.Fbank``. - -It shows: - - - How to read a sound file. Note that audio samples are scaled to the range [-32768, 32768]. - The intention is to produce the same output as kaldi. You don't need to scale it if - you don't care about the compatibility with kaldi - - - ``kaldifeat.Fbank`` supports CUDA as well as CPU - - - ``kaldifeat.Fbank`` supports processing sound file in a batch as well as accepting - a single sound file - - -.. literalinclude:: ./code/test_fbank.py - :caption: Demo of ``kaldifeat.Fbank`` - :language: python diff --git a/doc/source/usage/code/compute-fbank-feats-help.txt b/doc/source/usage/code/compute-fbank-feats-help.txt new file mode 100644 index 0000000..3922636 --- /dev/null +++ b/doc/source/usage/code/compute-fbank-feats-help.txt @@ -0,0 +1,46 @@ +compute-fbank-feats + +Create Mel-filter bank (FBANK) feature files. +Usage: compute-fbank-feats [options...] + +Options: + --allow-downsample : If true, allow the input waveform to have a higher frequency than the specified --sample-frequency (and we'll downsample). 
(bool, default = false) + --allow-upsample : If true, allow the input waveform to have a lower frequency than the specified --sample-frequency (and we'll upsample). (bool, default = false) + --blackman-coeff : Constant coefficient for generalized Blackman window. (float, default = 0.42) + --channel : Channel to extract (-1 -> expect mono, 0 -> left, 1 -> right) (int, default = -1) + --debug-mel : Print out debugging information for mel bin computation (bool, default = false) + --dither : Dithering constant (0.0 means no dither). If you turn this off, you should set the --energy-floor option, e.g. to 1.0 or 0.1 (float, default = 1) + --energy-floor : Floor on energy (absolute, not relative) in FBANK computation. Only makes a difference if --use-energy=true; only necessary if --dither=0.0. Suggested values: 0.1 or 1.0 (float, default = 0) + --frame-length : Frame length in milliseconds (float, default = 25) + --frame-shift : Frame shift in milliseconds (float, default = 10) + --high-freq : High cutoff frequency for mel bins (if <= 0, offset from Nyquist) (float, default = 0) + --htk-compat : If true, put energy last. Warning: not sufficient to get HTK compatible features (need to change other parameters). (bool, default = false) + --low-freq : Low cutoff frequency for mel bins (float, default = 20) + --max-feature-vectors : Memory optimization. If larger than 0, periodically remove feature vectors so that only this number of the latest feature vectors is retained. (int, default = -1) + --min-duration : Minimum duration of segments to process (in seconds). 
(float, default = 0) + --num-mel-bins : Number of triangular mel-frequency bins (int, default = 23) + --output-format : Format of the output files [kaldi, htk] (string, default = "kaldi") + --preemphasis-coefficient : Coefficient for use in signal preemphasis (float, default = 0.97) + --raw-energy : If true, compute energy before preemphasis and windowing (bool, default = true) + --remove-dc-offset : Subtract mean from waveform on each frame (bool, default = true) + --round-to-power-of-two : If true, round window size to power of two by zero-padding input to FFT. (bool, default = true) + --sample-frequency : Waveform data sample frequency (must match the waveform file, if specified there) (float, default = 16000) + --snip-edges : If true, end effects will be handled by outputting only frames that completely fit in the file, and the number of frames depends on the frame-length. If false, the number of frames depends only on the frame-shift, and we reflect the data at the ends. (bool, default = true) + --subtract-mean : Subtract mean of each feature file [CMS]; not recommended to do it this way. (bool, default = false) + --use-energy : Add an extra dimension with energy to the FBANK output. (bool, default = false) + --use-log-fbank : If true, produce log-filterbank, else produce linear. (bool, default = true) + --use-power : If true, use power, else use magnitude. 
(bool, default = true) + --utt2spk : Utterance to speaker-id map (if doing VTLN and you have warps per speaker) (string, default = "") + --vtln-high : High inflection point in piecewise linear VTLN warping function (if negative, offset from high-mel-freq (float, default = -500) + --vtln-low : Low inflection point in piecewise linear VTLN warping function (float, default = 100) + --vtln-map : Map from utterance or speaker-id to vtln warp factor (rspecifier) (string, default = "") + --vtln-warp : Vtln warp factor (only applicable if vtln-map not specified) (float, default = 1) + --window-type : Type of window ("hamming"|"hanning"|"povey"|"rectangular"|"sine"|"blackmann") (string, default = "povey") + --write-utt2dur : Wspecifier to write duration of each utterance in seconds, e.g. 'ark,t:utt2dur'. (string, default = "") + +Standard options: + --config : Configuration file to read (this option may be repeated) (string, default = "") + --help : Print out usage message (bool, default = false) + --print-args : Print the command line arguments (to stderr) (bool, default = true) + --verbose : Verbose level (higher->more logging) (int, default = 0) + diff --git a/doc/source/usage/code/fbank_options-1.txt b/doc/source/usage/code/fbank_options-1.txt new file mode 100644 index 0000000..7e0470a --- /dev/null +++ b/doc/source/usage/code/fbank_options-1.txt @@ -0,0 +1,65 @@ +$ python3 +Python 3.8.0 (default, Oct 28 2019, 16:14:01) +[GCC 8.3.0] on linux +Type "help", "copyright", "credits" or "license" for more information. 
+>>> import kaldifeat +>>> opts = kaldifeat.FbankOptions() +>>> print(opts) +frame_opts: +samp_freq: 16000 +frame_shift_ms: 10 +frame_length_ms: 25 +dither: 1 +preemph_coeff: 0.97 +remove_dc_offset: 1 +window_type: povey +round_to_power_of_two: 1 +blackman_coeff: 0.42 +snip_edges: 1 +max_feature_vectors: -1 + + +mel_opts: +num_bins: 23 +low_freq: 20 +high_freq: 0 +vtln_low: 100 +vtln_high: -500 +debug_mel: 0 +htk_mode: 0 + +use_energy: 0 +energy_floor: 0 +raw_energy: 1 +htk_compat: 0 +use_log_fbank: 1 +use_power: 1 +device: cpu + +>>> print(opts.dither) +Traceback (most recent call last): + File "", line 1, in +AttributeError: '_kaldifeat.FbankOptions' object has no attribute 'dither' +>>> +>>> print(opts.frame_opts.dither) +1.0 +>>> opts.frame_opts.dither = 0 # disable dither +>>> print(opts.frame_opts.dither) +0.0 +>>> import torch +>>> print(opts.device) +cpu +>>> opts.device = 'cuda:0' +>>> print(opts.device) +cuda:0 +>>> opts.device = torch.device('cuda', 1) +>>> print(opts.device) +cuda:1 +>>> opts.device = 'cpu' +>>> print(opts.device) +cpu +>>> print(opts.mel_opts.num_bins) +23 +>>> opts.mel_opts.num_bins = 80 +>>> print(opts.mel_opts.num_bins) +80 diff --git a/doc/source/usage/code/test_fbank_options.py b/doc/source/usage/code/test_fbank_options.py new file mode 120000 index 0000000..3bfe0fa --- /dev/null +++ b/doc/source/usage/code/test_fbank_options.py @@ -0,0 +1 @@ +../../../../kaldifeat/python/tests/test_fbank_options.py \ No newline at end of file diff --git a/doc/source/usage/fbank.rst b/doc/source/usage/fbank.rst new file mode 100644 index 0000000..e3f1351 --- /dev/null +++ b/doc/source/usage/fbank.rst @@ -0,0 +1,3 @@ +kaldifeat.Fbank +=============== + diff --git a/doc/source/usage/fbank_options.rst b/doc/source/usage/fbank_options.rst new file mode 100644 index 0000000..d9adc2d --- /dev/null +++ b/doc/source/usage/fbank_options.rst @@ -0,0 +1,52 @@ +kaldifeat.FbankOptions +====================== + +If you want to construct an instance of 
`kaldifeat.Fbank`_ or +`kaldifeat.OnlineFbank`_, you have to provide an instance of +`kaldifeat.FbankOptions`_. + +The following code shows how to construct an instance of `kaldifeat.FbankOptions`_. + +.. literalinclude:: ./code/fbank_options-1.txt + :caption: Usage of `kaldifeat.FbankOptions`_ + :emphasize-lines: 6,8,22,37 + :language: python + +Note that we reuse the same option names as `compute-fbank-feats`_ from `Kaldi`_: + +.. code-block:: bash + + $ compute-fbank-feats --help + + +.. literalinclude:: ./code/compute-fbank-feats-help.txt + :caption: Output of ``compute-fbank-feats --help`` + +Please refer to the output of ``compute-fbank-feats --help`` for the meaning +of each field of `kaldifeat.FbankOptions`_. + +One thing worth noting is that `kaldifeat.FbankOptions`_ has a field ``device``, +which is an instance of ``torch.device``. You can assign it either a string, e.g., +``"cpu"`` or ``"cuda:0"``, or an instance of ``torch.device``, e.g., ``torch.device("cpu")`` or +``torch.device("cuda", 1)``. + +.. hint:: + + You can use this field to control whether the feature computer + constructed from it performs computation on CPU or CUDA. + +.. caution:: + + If you use a CUDA device, make sure that you have installed a CUDA version + of `PyTorch`_. + +Example usage +------------- + +The following code from +`<https://github.com/csukuangfj/kaldifeat/blob/master/kaldifeat/python/tests/test_fbank_options.py>`_ +demonstrates the usage of `kaldifeat.FbankOptions`_: + +.. literalinclude:: ./code/test_fbank_options.py + :caption: Example usage of `kaldifeat.FbankOptions`_ + :language: python diff --git a/doc/source/usage/index.rst b/doc/source/usage/index.rst new file mode 100644 index 0000000..f40dcd5 --- /dev/null +++ b/doc/source/usage/index.rst @@ -0,0 +1,11 @@ +Usage +===== + +This section describes how to use feature computers in `kaldifeat`_. + +.. 
toctree:: + :maxdepth: 2 + + fbank_options + fbank + online_fbank diff --git a/doc/source/usage/online_fbank.rst b/doc/source/usage/online_fbank.rst new file mode 100644 index 0000000..557104d --- /dev/null +++ b/doc/source/usage/online_fbank.rst @@ -0,0 +1,3 @@ +kaldifeat.OnlineFbank +===================== +