diff --git a/.github/workflows/build-doc.yml b/.github/workflows/build-doc.yml
new file mode 100644
index 0000000..869b5c4
--- /dev/null
+++ b/.github/workflows/build-doc.yml
@@ -0,0 +1,62 @@
+# Copyright      2022  Xiaomi Corp.       (author: Fangjun Kuang)
+
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# refer to https://github.com/actions/starter-workflows/pull/47/files
+
+# The generated documentation can be accessed at https://csukuangfj.github.io/kaldifeat
+name: Generate doc
+on:
+  push:
+    branches:  # the workflow runs on pushes to either branch listed below
+    - master
+    - doc
+
+jobs:
+  build-doc:  # builds the HTML documentation under doc/ and publishes it to the gh-pages branch
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ubuntu-latest]
+        python-version: ["3.8"]  # quoted: an unquoted version is parsed as a YAML float
+    steps:
+      # fetch-depth: 0 fetches the full history; see https://github.com/actions/checkout
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 0
+
+      - name: Setup Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Display Python version
+        run: python -c "import sys; print(sys.version)"
+
+      - name: Build doc
+        shell: bash
+        run: |
+          cd doc
+          python3 -m pip install -r ./requirements.txt
+          make html
+          touch build/html/.nojekyll
+
+      - name: Deploy
+        uses: peaceiris/actions-gh-pages@v3
+        with:
+          github_token: ${{ secrets.GITHUB_TOKEN }}
+          publish_dir: ./doc/build/html  # the directory the "Build doc" step adds .nojekyll to
+          publish_branch: gh-pages
diff --git a/.github/workflows/build_conda_ubuntu_cpu.yml b/.github/workflows/build_conda_ubuntu_cpu.yml
new file mode 100644
index 0000000..3f0692c
--- /dev/null
+++ b/.github/workflows/build_conda_ubuntu_cpu.yml
@@ -0,0 +1,108 @@
+# Copyright      2021  Xiaomi Corp.       (author: Fangjun Kuang)
+
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# refer to https://github.com/actions/starter-workflows/pull/47/files
+
+name: build_conda_ubuntu_cpu
+
+on:
+  push:
+    tags:
+      - '*'  # run for every pushed tag
+
+env:
+  KALDIFEAT_BUILD_TYPE: Release  # re-exported by the "Build kaldifeat" step before it runs the build script
+
+jobs:
+  generate_build_matrix:  # computes the matrix that build_conda_ubuntu_cpu fans out over
+    # see https://github.com/pytorch/pytorch/pull/50633
+    runs-on: ubuntu-latest
+    outputs:
+      matrix: ${{ steps.set-matrix.outputs.matrix }}  # parsed with fromJson() by the dependent job
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 0
+      - name: Generating build matrix
+        id: set-matrix
+        run: |
+          # Run the generator once and echo the result so it appears in the job log.
+          MATRIX=$(python scripts/github_actions/generate_build_matrix.py)
+          echo "${MATRIX}"
+          echo "matrix=${MATRIX}" >> "$GITHUB_OUTPUT"  # replaces the deprecated ::set-output command
+
+  build_conda_ubuntu_cpu:  # runs scripts/build_conda_cpu.sh per matrix entry and uploads the *.tar.bz2 files
+    needs: generate_build_matrix
+    runs-on: ubuntu-18.04  # NOTE(review): confirm this runner label is still offered by GitHub
+    strategy:
+      fail-fast: false
+      matrix:
+        ${{ fromJson(needs.generate_build_matrix.outputs.matrix) }}
+
+    steps:
+      # refer to https://github.com/actions/checkout
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 0
+
+      - uses: conda-incubator/setup-miniconda@v2
+        with:
+          auto-update-conda: true
+          python-version: ${{ matrix.python-version }}
+          activate-environment: kaldifeat
+
+      - name: Display Python version
+        shell: bash -l {0}
+        run: |
+          python3 -c "import sys; print(sys.version)"
+          which python3
+
+      - name: Install conda dependencies
+        shell: bash -l {0}
+        run: |
+          conda install -y -q anaconda-client
+          conda install -y -q conda-build
+          conda install -y -q -c pytorch pytorch=${{ matrix.torch }} cpuonly
+
+      - name: Display conda info
+        shell: bash -l {0}
+        run: |
+          which conda
+          conda env list
+          conda info
+          nproc
+
+      - name: Build kaldifeat
+        shell: bash -l {0}
+        env:
+          KALDIFEAT_PYTHON_VERSION: ${{ matrix.python-version}}
+          KALDIFEAT_TORCH_VERSION: ${{ matrix.torch }}
+          KALDIFEAT_CONDA_TOKEN: ${{ secrets.KALDIFEAT_CONDA_TOKEN}}
+          KALDIFEAT_IS_GITHUB_ACTIONS: 1
+          KALDIFEAT_IS_FOR_CONDA: 1
+        run: |
+          export KALDIFEAT_BUILD_TYPE=$KALDIFEAT_BUILD_TYPE
+          ./scripts/build_conda_cpu.sh
+
+      - name: Display generated files
+        run: |
+          ls -lh /usr/share/miniconda/envs/kaldifeat/conda-bld/linux-64
+
+      - name: Upload generated files
+        uses: actions/upload-artifact@v4  # v2 is deprecated and no longer runs on github.com
+        with:
+          name: cpu-torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-${{ matrix.os }}  # NOTE(review): confirm the generated matrix defines `os`
+          path: /usr/share/miniconda/envs/kaldifeat/conda-bld/linux-64/*.tar.bz2
diff --git a/.github/workflows/build_conda.yml b/.github/workflows/build_conda_ubuntu_cuda.yml
similarity index 54%
rename from .github/workflows/build_conda.yml
rename to .github/workflows/build_conda_ubuntu_cuda.yml
index 124e2db..b604a62 100644
--- a/.github/workflows/build_conda.yml
+++ b/.github/workflows/build_conda_ubuntu_cuda.yml
@@ -16,7 +16,7 @@
 
 # refer to https://github.com/actions/starter-workflows/pull/47/files
 
-name: build_conda_cuda
+name: build_conda_ubuntu_cuda
 
 on:
   push:
@@ -27,70 +27,30 @@ env:
   KALDIFEAT_BUILD_TYPE: Release
 
 jobs:
-  build_conda_cuda:
-    runs-on: ${{ matrix.os }}
+  generate_build_matrix:  # computes the matrix that build_conda_ubuntu_cuda fans out over
+    # see https://github.com/pytorch/pytorch/pull/50633
+    runs-on: ubuntu-latest
+    outputs:
+      matrix: ${{ steps.set-matrix.outputs.matrix }}  # parsed with fromJson() by the dependent job
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 0
+      - name: Generating build matrix
+        id: set-matrix
+        run: |
+          # Run the generator once and echo the result so it appears in the job log.
+          MATRIX=$(python scripts/github_actions/generate_build_matrix.py --enable-cuda)
+          echo "${MATRIX}"
+          echo "matrix=${MATRIX}" >> "$GITHUB_OUTPUT"  # replaces the deprecated ::set-output command
+
+  build_conda_ubuntu_cuda:
+    needs: generate_build_matrix
+    runs-on: ubuntu-18.04
     strategy:
       fail-fast: false
       matrix:
-        os: [ubuntu-18.04]
-        # anaconda does not support 3.9 as of 2021.05.08
-        python-version: [3.6, 3.7, 3.8, 3.9]
-        # python-version: [3.6, 3.7, 3.8]
-        cuda: ["10.1", "10.2", "11.0", "11.1"]
-        # from https://download.pytorch.org/whl/torch_stable.html
-        #
-        # PyTorch 1.9.0 supports: 10.2 (default), 11.1
-        # PyTorch 1.8.1 supports: cuda 10.1, 10.2 (default), 11.1
-        # PyTorch 1.8.0 supports: cuda 10.1, 10.2 (default), 11.1
-        # PyTorch 1.7.x supports: cuda 10.1, 10.2 (default), 11.0, 9.2 (not included in this setup)
-        # PyTorch 1.6.0 supports: cuda 10.1, 10.2 (default), 9.2 (not included in this setup)
-        # PyTorch 1.5.x supports: cuda 10.1, 10.2 (default), 9.2 (not included in this setup)
-        #
-        # PyTorch 1.8.x and 1.7.1 support 3.6, 3.7, 3.8, 3.9
-        # PyTorch 1.7.0, 1.6.0, and 1.5.x support 3.6, 3.7, 3.8
-        #
-        # Other PyTorch versions are not tested
-        #
-        # torch: ["1.5.0", "1.5.1", "1.6.0", "1.7.0", "1.7.1", "1.8.0", "1.8.1"]
-        # 1.5.x is removed because there are compilation errors.
-        #  See
-        #  https://github.com/csukuangfj/k2/runs/2533830771?check_suite_focus=true
-        #  and
-        #  https://github.com/NVIDIA/apex/issues/805
-        torch: ["1.6.0", "1.7.0", "1.7.1", "1.8.0", "1.8.1", "1.9.0"]
-        exclude:
-          # - cuda: "11.0" # exclude 11.0 for [1.5.0, 1.5.1, 1.6.0, 1.8.0, 1.8.1, 1.9.0]
-          #   torch: "1.5.0"
-          # - cuda: "11.0"
-          #   torch: "1.5.1"
-          - cuda: "11.0"
-            torch: "1.6.0"
-          - cuda: "11.0"
-            torch: "1.8.0"
-          - cuda: "11.0"
-            torch: "1.8.1"
-          - cuda: "11.0"
-            torch: "1.9.0"
-          # - cuda: "11.1" # exclude 11.1 for [1.5.0, 1.5.1, 1.6.0, 1.7.0, 1.7.1]
-          #   torch: "1.5.0"
-          # - cuda: "11.1"
-          #   torch: "1.5.1"
-          - cuda: "11.1"
-            torch: "1.6.0"
-          - cuda: "11.1"
-            torch: "1.7.0"
-          - cuda: "11.1"
-            torch: "1.7.1"
-          - cuda: "10.1" # exclude 10.1 for [1.9.0]
-            torch: "1.9.0"
-          - python-version: 3.9 # exclude Python 3.9 for [1.5.0, 1.5.1, 1.6.0, 1.7.0]
-            torch: "1.5.0"
-          - python-version: 3.9
-            torch: "1.5.1"
-          - python-version: 3.9
-            torch: "1.6.0"
-          - python-version: 3.9
-            torch: "1.7.0"
+        ${{ fromJson(needs.generate_build_matrix.outputs.matrix) }}
 
     steps:
       # refer to https://github.com/actions/checkout
@@ -131,7 +91,6 @@ jobs:
         run: |
           conda install -y -q anaconda-client
           conda install -y -q conda-build
-          conda install -y -q bs4 requests tqdm
           conda install -y -q -c pytorch -c conda-forge pytorch=${{ matrix.torch }} cudatoolkit=${{ matrix.cuda }}
 
       - name: Display conda info
@@ -161,3 +120,13 @@ jobs:
         run: |
           export KALDIFEAT_BUILD_TYPE=$KALDIFEAT_BUILD_TYPE
           ./scripts/build_conda.sh
+
+      - name: Display generated files  # lists the directory the upload step below reads from
+        run: |
+          ls -lh /usr/share/miniconda/envs/kaldifeat/conda-bld/linux-64
+
+      - name: Upload generated files
+        uses: actions/upload-artifact@v4  # v2 is deprecated and no longer runs on github.com
+        with:
+          name: cuda-${{ matrix.cuda }}-torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-${{ matrix.os }}  # NOTE(review): confirm the generated matrix defines `os`
+          path: /usr/share/miniconda/envs/kaldifeat/conda-bld/linux-64/*.tar.bz2
diff --git a/.github/workflows/nightly-tests-macos-cpu.yml b/.github/workflows/nightly-tests-macos-cpu.yml
new file mode 100644
index 0000000..4814460
--- /dev/null
+++ b/.github/workflows/nightly-tests-macos-cpu.yml
@@ -0,0 +1,116 @@
+# Copyright      2021  Xiaomi Corp.       (author: Fangjun Kuang)
+
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+name: Nightly tests macos cpu
+
+on:
+  schedule:
+    # cron fields, left to right:
+    #   minute (0-59), hour (0-23),
+    #   day of the month (1-31),
+    #   month (1-12),
+    #   day of the week (0-6)
+    # "50 23 * * *" fires every day at 23:50 UTC
+    - cron: "50 23 * * *"
+
+jobs:
+  enable_nightly_build:  # publishes the result of run-nightly-build.py as the `enabled` output
+    runs-on: ubuntu-latest
+    outputs:
+      enabled: ${{ steps.set-enabled.outputs.enabled }}  # generate_build_matrix runs only when this is 'true'
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 0
+      - name: Set enabled
+        id: set-enabled
+        run: |
+          enabled=$(python scripts/github_actions/run-nightly-build.py)
+          echo "enabled: $enabled"
+          echo "enabled=${enabled}" >> "$GITHUB_OUTPUT"  # replaces the deprecated ::set-output command
+
+  generate_build_matrix:  # computes the matrix that run_nightly_tests_macos_cpu fans out over
+    needs: enable_nightly_build
+    if: needs.enable_nightly_build.outputs.enabled == 'true'
+    # see https://github.com/pytorch/pytorch/pull/50633
+    runs-on: ubuntu-latest
+    outputs:
+      matrix: ${{ steps.set-matrix.outputs.matrix }}  # parsed with fromJson() by the dependent job
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 0
+      - name: Generating build matrix
+        id: set-matrix
+        run: |
+          # Run the generator once and echo the result so it appears in the job log.
+          MATRIX=$(python scripts/github_actions/generate_build_matrix.py)
+          echo "${MATRIX}"
+          echo "matrix=${MATRIX}" >> "$GITHUB_OUTPUT"  # replaces the deprecated ::set-output command
+
+  run_nightly_tests_macos_cpu:  # installs kaldifeat with pip and runs the tests in kaldifeat/python/tests
+    needs: generate_build_matrix
+    runs-on: macos-10.15  # NOTE(review): confirm this runner label is still offered by GitHub
+    strategy:
+      fail-fast: false
+      matrix:
+        ${{ fromJson(needs.generate_build_matrix.outputs.matrix) }}
+
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 0
+
+      - name: Setup Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Display Python version
+        run: python -c "import sys; print(sys.version)"
+
+      - name: Display clang version
+        run: |
+          clang --version
+
+      - name: Install PyTorch ${{ matrix.torch }}
+        shell: bash
+        run: |
+          python3 -m pip install -qq --upgrade pip
+          python3 -m pip install -qq wheel twine numpy typing_extensions
+          python3 -m pip install -qq torch==${{ matrix.torch }} numpy
+          python3 -m pip install -qq dataclasses soundfile
+
+          python3 -c "import torch; print('torch version:', torch.__version__)"
+
+      - name: Install from PyPI
+        shell: bash
+        run: |
+          export KALDIFEAT_MAKE_ARGS="-j 3"
+          pip install --verbose kaldifeat
+
+      - name: Run tests
+        shell: bash
+        run: |
+          lib_path=$(python -c "import sysconfig; print(sysconfig.get_path('purelib'))")  # distutils was removed in Python 3.12
+          echo "lib_path: $lib_path"
+          ls -lh $lib_path
+          # export DYLD_LIBRARY_PATH=$lib_path:$DYLD_LIBRARY_PATH
+          otool -L $lib_path/_kaldifeat.*.so
+          otool -l $lib_path/_kaldifeat.*.so
+
+          cd kaldifeat/python/tests
+          make test
diff --git a/.github/workflows/nightly-tests-ubuntu-conda-cpu.yml b/.github/workflows/nightly-tests-ubuntu-conda-cpu.yml
new file mode 100644
index 0000000..317901a
--- /dev/null
+++ b/.github/workflows/nightly-tests-ubuntu-conda-cpu.yml
@@ -0,0 +1,100 @@
+# Copyright      2021  Xiaomi Corp.       (author: Fangjun Kuang)
+
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+name: Nightly tests ubuntu conda cpu
+
+on:
+  schedule:
+    # cron fields, left to right:
+    #   minute (0-59), hour (0-23),
+    #   day of the month (1-31),
+    #   month (1-12),
+    #   day of the week (0-6)
+    # "50 23 * * *" fires every day at 23:50 UTC
+    - cron: "50 23 * * *"
+
+jobs:
+  enable_nightly_build:  # publishes the result of run-nightly-build.py as the `enabled` output
+    runs-on: ubuntu-latest
+    outputs:
+      enabled: ${{ steps.set-enabled.outputs.enabled }}  # generate_build_matrix runs only when this is 'true'
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 0
+      - name: Set enabled
+        id: set-enabled
+        run: |
+          enabled=$(python scripts/github_actions/run-nightly-build.py)
+          echo "enabled: $enabled"
+          echo "enabled=${enabled}" >> "$GITHUB_OUTPUT"  # replaces the deprecated ::set-output command
+
+  generate_build_matrix:  # computes the matrix that run_nightly_tests_ubuntu_conda_cpu fans out over
+    needs: enable_nightly_build
+    if: needs.enable_nightly_build.outputs.enabled == 'true'
+    # see https://github.com/pytorch/pytorch/pull/50633
+    runs-on: ubuntu-latest
+    outputs:
+      matrix: ${{ steps.set-matrix.outputs.matrix }}  # parsed with fromJson() by the dependent job
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 0
+      - name: Generating build matrix
+        id: set-matrix
+        run: |
+          # Run the generator once and echo the result so it appears in the job log.
+          MATRIX=$(python scripts/github_actions/generate_build_matrix.py --test-only-latest-torch)
+          echo "${MATRIX}"
+          echo "matrix=${MATRIX}" >> "$GITHUB_OUTPUT"  # replaces the deprecated ::set-output command
+
+  run_nightly_tests_ubuntu_conda_cpu:
+    # Installs kaldifeat with conda (cpuonly) and runs the tests in kaldifeat/python/tests.
+    runs-on: ubuntu-latest
+    needs: generate_build_matrix
+    strategy:
+      matrix: ${{ fromJson(needs.generate_build_matrix.outputs.matrix) }}
+      fail-fast: false
+
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 0
+
+      - uses: conda-incubator/setup-miniconda@v2
+        with:
+          activate-environment: kaldifeat
+          python-version: ${{ matrix.python-version }}
+          auto-update-conda: true
+
+      - name: Display Python version
+        run: |
+          python3 -c "import sys; print(sys.version)"
+          which python3
+        shell: bash -l {0}
+
+      - name: Install kaldifeat and run tests
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y libsndfile1-dev libsndfile1 ffmpeg
+
+          conda install -y -c kaldifeat -c pytorch cpuonly pytorch=${{ matrix.torch }} kaldifeat python=${{ matrix.python-version }}
+          conda install -y -c conda-forge pysoundfile
+
+
+          cd kaldifeat/python/tests
+          make test
+        shell: bash -l {0}
diff --git a/.github/workflows/nightly-tests-ubuntu-conda-cuda.yml b/.github/workflows/nightly-tests-ubuntu-conda-cuda.yml
new file mode 100644
index 0000000..9ede567
--- /dev/null
+++ b/.github/workflows/nightly-tests-ubuntu-conda-cuda.yml
@@ -0,0 +1,96 @@
+# Copyright      2021  Xiaomi Corp.       (author: Fangjun Kuang)
+
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+name: Nightly tests ubuntu conda cuda
+
+on:
+  schedule:
+    # cron fields, left to right:
+    #   minute (0-59), hour (0-23),
+    #   day of the month (1-31),
+    #   month (1-12),
+    #   day of the week (0-6)
+    # "50 23 * * *" fires every day at 23:50 UTC
+    - cron: "50 23 * * *"
+
+jobs:
+  enable_nightly_build:  # publishes the result of run-nightly-build.py as the `enabled` output
+    runs-on: ubuntu-latest
+    outputs:
+      enabled: ${{ steps.set-enabled.outputs.enabled }}  # generate_build_matrix runs only when this is 'true'
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 0
+      - name: Set enabled
+        id: set-enabled
+        run: |
+          enabled=$(python scripts/github_actions/run-nightly-build.py)
+          echo "enabled: $enabled"
+          echo "enabled=${enabled}" >> "$GITHUB_OUTPUT"  # replaces the deprecated ::set-output command
+
+  generate_build_matrix:  # computes the matrix that run_nightly_tests_ubuntu_conda_cuda fans out over
+    needs: enable_nightly_build
+    if: needs.enable_nightly_build.outputs.enabled == 'true'
+    # see https://github.com/pytorch/pytorch/pull/50633
+    runs-on: ubuntu-latest
+    outputs:
+      matrix: ${{ steps.set-matrix.outputs.matrix }}  # parsed with fromJson() by the dependent job
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 0
+      - name: Generating build matrix
+        id: set-matrix
+        run: |
+          # Run the generator once and echo the result so it appears in the job log.
+          MATRIX=$(python scripts/github_actions/generate_build_matrix.py --enable-cuda --test-only-latest-torch)
+          echo "${MATRIX}"
+          echo "matrix=${MATRIX}" >> "$GITHUB_OUTPUT"  # replaces the deprecated ::set-output command
+
+  run_nightly_tests_ubuntu_conda_cuda:
+    # Installs kaldifeat, pytorch and cudatoolkit with conda, then runs kaldifeat/python/tests.
+    runs-on: ubuntu-latest
+    needs: generate_build_matrix
+    strategy:
+      matrix: ${{ fromJson(needs.generate_build_matrix.outputs.matrix) }}
+      fail-fast: false
+
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 0
+
+      - uses: conda-incubator/setup-miniconda@v2
+        with:
+          activate-environment: kaldifeat
+          python-version: ${{ matrix.python-version }}
+          auto-update-conda: true
+
+      - name: Display Python version
+        run: |
+          python3 -c "import sys; print(sys.version)"
+          which python3
+        shell: bash -l {0}
+
+      - name: Install kaldifeat and run tests
+        run: |
+          conda install -y -c kaldifeat -c pytorch -c conda-forge pytorch=${{ matrix.torch }} cudatoolkit=${{ matrix.cuda }} kaldifeat python=${{ matrix.python-version }}
+          conda install -y -c conda-forge pysoundfile
+
+          cd kaldifeat/python/tests
+          make test
+        shell: bash -l {0}
diff --git a/.github/workflows/nightly-tests-ubuntu-pip-cpu.yml b/.github/workflows/nightly-tests-ubuntu-pip-cpu.yml
new file mode 100644
index 0000000..e0284c0
--- /dev/null
+++ b/.github/workflows/nightly-tests-ubuntu-pip-cpu.yml
@@ -0,0 +1,118 @@
+# Copyright      2021  Xiaomi Corp.       (author: Fangjun Kuang)
+
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+name: Nightly tests ubuntu pip cpu
+
+on:
+  schedule:
+    # cron fields, left to right:
+    #   minute (0-59), hour (0-23),
+    #   day of the month (1-31),
+    #   month (1-12),
+    #   day of the week (0-6)
+    # "50 23 * * *" fires every day at 23:50 UTC
+    - cron: "50 23 * * *"
+
+jobs:
+  enable_nightly_build:  # publishes the result of run-nightly-build.py as the `enabled` output
+    runs-on: ubuntu-latest
+    outputs:
+      enabled: ${{ steps.set-enabled.outputs.enabled }}  # generate_build_matrix runs only when this is 'true'
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 0
+      - name: Set enabled
+        id: set-enabled
+        run: |
+          enabled=$(python scripts/github_actions/run-nightly-build.py)
+          echo "enabled: $enabled"
+          echo "enabled=${enabled}" >> "$GITHUB_OUTPUT"  # replaces the deprecated ::set-output command
+
+  generate_build_matrix:  # computes the matrix that run_nightly_tests_ubuntu_pip_cpu fans out over
+    needs: enable_nightly_build
+    if: needs.enable_nightly_build.outputs.enabled == 'true'
+    # see https://github.com/pytorch/pytorch/pull/50633
+    runs-on: ubuntu-latest
+    outputs:
+      matrix: ${{ steps.set-matrix.outputs.matrix }}  # parsed with fromJson() by the dependent job
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 0
+      - name: Generating build matrix
+        id: set-matrix
+        run: |
+          # Run the generator once and echo the result so it appears in the job log.
+          MATRIX=$(python scripts/github_actions/generate_build_matrix.py)
+          echo "${MATRIX}"
+          echo "matrix=${MATRIX}" >> "$GITHUB_OUTPUT"  # replaces the deprecated ::set-output command
+
+  run_nightly_tests_ubuntu_pip_cpu:  # installs kaldifeat with pip and runs the tests in kaldifeat/python/tests
+    needs: generate_build_matrix
+    runs-on: ubuntu-18.04  # NOTE(review): confirm this runner label is still offered by GitHub
+    strategy:
+      fail-fast: false
+      matrix:
+        ${{ fromJson(needs.generate_build_matrix.outputs.matrix) }}
+
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 0
+
+      - name: Setup Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Display Python version
+        run: python -c "import sys; print(sys.version)"
+
+      - name: Install GCC 7
+        run: |
+          sudo apt-get update  # refresh the package index before the first install of the job
+          sudo apt-get install -y gcc-7 g++-7
+          echo "CC=/usr/bin/gcc-7" >> $GITHUB_ENV
+          echo "CXX=/usr/bin/g++-7" >> $GITHUB_ENV
+
+      - name: Display GCC version
+        run: |
+          gcc --version
+
+      - name: Install PyTorch ${{ matrix.torch }}
+        shell: bash
+        run: |
+          sudo apt-get install -y libsndfile1-dev libsndfile1 ffmpeg
+
+          python3 -m pip install -qq --upgrade pip
+          python3 -m pip install -qq wheel twine typing_extensions numpy soundfile
+          python3 -m pip install -qq torch==${{ matrix.torch }}+cpu -f https://download.pytorch.org/whl/torch_stable.html
+          python3 -m pip install -qq dataclasses
+
+          python3 -c "import torch; print('torch version:', torch.__version__)"
+
+      - name: Install from PyPI
+        shell: bash
+        run: |
+          export KALDIFEAT_MAKE_ARGS="-j 3"
+          pip install --verbose kaldifeat
+
+      - name: Run tests
+        shell: bash
+        run: |
+          cd kaldifeat/python/tests
+          make test
diff --git a/.github/workflows/publish_to_pypi.yml-bak b/.github/workflows/nightly-tests-ubuntu-pip-cuda.yml
similarity index 53%
rename from .github/workflows/publish_to_pypi.yml-bak
rename to .github/workflows/nightly-tests-ubuntu-pip-cuda.yml
index e0d7b1f..be85563 100644
--- a/.github/workflows/publish_to_pypi.yml-bak
+++ b/.github/workflows/nightly-tests-ubuntu-pip-cuda.yml
@@ -14,23 +14,61 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-name: Publish to PyPI
+name: Nightly tests ubuntu pip cuda
 
 on:
-  push:
-    tags:
-      - '*'
+  schedule:
+    # cron fields, left to right:
+    #   minute (0-59), hour (0-23),
+    #   day of the month (1-31),
+    #   month (1-12),
+    #   day of the week (0-6)
+    # "50 23 * * *" fires every day at 23:50 UTC
+    - cron: "50 23 * * *"
 
 jobs:
-  pypi:
-    runs-on: ${{ matrix.os }}
+  enable_nightly_build:  # publishes the result of run-nightly-build.py as the `enabled` output
+    runs-on: ubuntu-latest
+    outputs:
+      enabled: ${{ steps.set-enabled.outputs.enabled }}  # generate_build_matrix runs only when this is 'true'
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 0
+      - name: Set enabled
+        id: set-enabled
+        run: |
+          enabled=$(python scripts/github_actions/run-nightly-build.py)
+          echo "enabled: $enabled"
+          echo "enabled=${enabled}" >> "$GITHUB_OUTPUT"  # replaces the deprecated ::set-output command
+
+  generate_build_matrix:  # computes the matrix that run_nightly_tests_ubuntu_pip_cuda fans out over
+    needs: enable_nightly_build
+    if: needs.enable_nightly_build.outputs.enabled == 'true'
+    # see https://github.com/pytorch/pytorch/pull/50633
+    runs-on: ubuntu-latest
+    outputs:
+      matrix: ${{ steps.set-matrix.outputs.matrix }}  # parsed with fromJson() by the dependent job
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 0
+      - name: Generating build matrix
+        id: set-matrix
+        run: |
+          # Run the generator once and echo the result so it appears in the job log.
+          MATRIX=$(python scripts/github_actions/generate_build_matrix.py --enable-cuda)
+          echo "${MATRIX}"
+          echo "matrix=${MATRIX}" >> "$GITHUB_OUTPUT"  # replaces the deprecated ::set-output command
+
+  run_nightly_tests_ubuntu_pip_cuda:
+    needs: generate_build_matrix
+    runs-on: ubuntu-18.04
     strategy:
       fail-fast: false
       matrix:
-        os: [ubuntu-18.04, macos-10.15]
-        cuda: ["10.1"]
-        torch: ["1.7.1"]
-        python-version: [3.6, 3.7, 3.8]
+        ${{ fromJson(needs.generate_build_matrix.outputs.matrix) }}
+
     steps:
       - uses: actions/checkout@v2
         with:
@@ -42,15 +80,12 @@ jobs:
           python-version: ${{ matrix.python-version }}
 
       - name: Install GCC 7
-        if: startsWith(matrix.os, 'ubuntu')
         run: |
           sudo apt-get install -y gcc-7 g++-7
           echo "CC=/usr/bin/gcc-7" >> $GITHUB_ENV
           echo "CXX=/usr/bin/g++-7" >> $GITHUB_ENV
 
-
       - name: Install CUDA Toolkit ${{ matrix.cuda }}
-        if: startsWith(matrix.os, 'ubuntu')
         shell: bash
         env:
           cuda: ${{ matrix.cuda }}
@@ -61,61 +96,39 @@ jobs:
           echo "LD_LIBRARY_PATH=${CUDA_HOME}/lib:${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}" >> $GITHUB_ENV
 
       - name: Display NVCC version
-        if: startsWith(matrix.os, 'ubuntu')
         run: |
           which nvcc
           nvcc --version
 
       - name: Install PyTorch ${{ matrix.torch }}
-        if: startsWith(matrix.os, 'ubuntu')
         env:
           cuda: ${{ matrix.cuda }}
           torch: ${{ matrix.torch }}
         shell: bash
         run: |
+          sudo apt-get update
+          sudo apt-get install -y libsndfile1-dev libsndfile1 ffmpeg
           python3 -m pip install --upgrade pip
-          python3 -m pip install wheel twine typing_extensions
-          python3 -m pip install bs4 requests tqdm
+          python3 -m pip install wheel twine typing_extensions soundfile
+          python3 -m pip install bs4 requests tqdm numpy
 
           ./scripts/github_actions/install_torch.sh
           python3 -c "import torch; print('torch version:', torch.__version__)"
 
-      - name: Install PyTorch ${{ matrix.torch }}
-        if: startsWith(matrix.os, 'macos')
-        shell: bash
-        run: |
-          python3 -m pip install -qq --upgrade pip
-          python3 -m pip install -qq wheel twine typing_extensions
-          python3 -m pip install -qq torch==${{ matrix.torch }}
-
-          python3 -c "import torch; print('torch version:', torch.__version__)"
-
       - name: Download cudnn 8.0
-        if: startsWith(matrix.os, 'ubuntu')
         env:
           cuda: ${{ matrix.cuda }}
         run: |
           ./scripts/github_actions/install_cudnn.sh
 
-      - name: Build pip packages
+      - name: Install from PyPI
         shell: bash
-        env:
-          KALDIFEAT_IS_FOR_PYPI: 1
         run: |
-          tag=$(python3 -c "import sys; print(''.join(sys.version[:3].split('.')))")
-          export KALDIFEAT_MAKE_ARGS="-j2"
-          python3 setup.py bdist_wheel --python-tag=py${tag}
-          ls -lh dist/
+          export KALDIFEAT_MAKE_ARGS="-j 3"
+          pip install --verbose kaldifeat
 
-      - name: Publish wheels to PyPI
-        env:
-          TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
-          TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
+      - name: Run tests
+        shell: bash
         run: |
-          twine upload dist/kaldifeat-*.whl
-
-      - name: Upload Wheel
-        uses: actions/upload-artifact@v2
-        with:
-          name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-${{ matrix.os }}
-          path: dist/*.whl
+          cd kaldifeat/python/tests
+          make test
diff --git a/.github/workflows/nightly-tests-windows-pip-cpu.yml b/.github/workflows/nightly-tests-windows-pip-cpu.yml
new file mode 100644
index 0000000..44cc23f
--- /dev/null
+++ b/.github/workflows/nightly-tests-windows-pip-cpu.yml
@@ -0,0 +1,119 @@
+# Copyright      2021  Xiaomi Corp.       (author: Fangjun Kuang)
+
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+name: Nightly tests windows pip cpu
+
+on:
+  schedule:
+    # minute (0-59)
+    # hour (0-23)
+    # day of the month (1-31)
+    # month (1-12)
+    # day of the week (0-6)
+    # nightly build at 23:50 UTC time every day
+    - cron: "50 23 * * *"
+
+jobs:
+  enable_nightly_build:
+    runs-on: ubuntu-latest
+    outputs:
+      enabled: ${{ steps.set-enabled.outputs.enabled }}
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 0
+      - name: Set enabled  # publishes the script's stdout as step output "enabled"
+        id: set-enabled
+        run: |
+          enabled=$(python scripts/github_actions/run-nightly-build.py)
+          echo "enabled: $enabled"
+          echo "enabled=${enabled}" >> "$GITHUB_OUTPUT"  # replaces deprecated ::set-output
+
+  generate_build_matrix:
+    needs: enable_nightly_build
+    if: needs.enable_nightly_build.outputs.enabled == 'true'
+    # see https://github.com/pytorch/pytorch/pull/50633
+    runs-on: ubuntu-latest
+    outputs:
+      matrix: ${{ steps.set-matrix.outputs.matrix }}
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 0
+      - name: Generating build matrix
+        id: set-matrix
+        run: |
+          # Run the generator once, print the result for debugging, then export it
+          MATRIX=$(python scripts/github_actions/generate_build_matrix.py)
+          echo "${MATRIX}"
+          echo "matrix=${MATRIX}" >> "$GITHUB_OUTPUT"  # replaces deprecated ::set-output
+
+  run_nightly_tests_windows_pip_cpu:
+    needs: generate_build_matrix
+    runs-on: windows-2019
+    strategy:
+      fail-fast: false
+      matrix:
+        ${{ fromJson(needs.generate_build_matrix.outputs.matrix) }}
+
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 0
+
+      # see https://github.com/microsoft/setup-msbuild
+      - name: Add msbuild to PATH
+        uses: microsoft/setup-msbuild@v1.0.2
+
+      - name: Setup Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Display Python version
+        run: python -c "import sys; print(sys.version)"
+
+      - name: Install PyTorch ${{ matrix.torch }}
+        shell: bash
+        run: |
+          python3 -m pip install -qq --upgrade pip
+          python3 -m pip install -qq wheel twine numpy typing_extensions
+          python3 -m pip install -qq dataclasses soundfile numpy
+          python3 -m pip install -qq torch==${{ matrix.torch }}+cpu -f https://download.pytorch.org/whl/torch_stable.html
+
+          python3 -c "import torch; print('torch version:', torch.__version__)"
+
+      - name: Install from PyPI
+        shell: bash
+        run: |
+          export KALDIFEAT_MAKE_ARGS="-j 3"
+          pip install --verbose kaldifeat
+
+      - name: Run tests
+        shell: bash
+        run: |
+          cd kaldifeat/python/tests
+
+          python3 ./test_fbank.py
+          python3 ./test_fbank_options.py
+          python3 ./test_frame_extraction_options.py
+          python3 ./test_mel_bank_options.py
+          python3 ./test_mfcc.py
+          python3 ./test_mfcc_options.py
+          python3 ./test_plp.py
+          python3 ./test_plp_options.py
+          python3 ./test_spectrogram.py
+          python3 ./test_spectrogram_options.py
diff --git a/.github/workflows/nightly-tests-windows-pip-cuda.yml b/.github/workflows/nightly-tests-windows-pip-cuda.yml
new file mode 100644
index 0000000..29bb230
--- /dev/null
+++ b/.github/workflows/nightly-tests-windows-pip-cuda.yml
@@ -0,0 +1,160 @@
+# Copyright      2021  Xiaomi Corp.       (author: Fangjun Kuang)
+
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+name: Nightly tests windows pip cuda
+
+on:
+  schedule:
+    # minute (0-59)
+    # hour (0-23)
+    # day of the month (1-31)
+    # month (1-12)
+    # day of the week (0-6)
+    # nightly build at 23:50 UTC time every day
+    - cron: "50 23 * * *"
+
+jobs:
+  enable_nightly_build:
+    runs-on: ubuntu-latest
+    outputs:
+      enabled: ${{ steps.set-enabled.outputs.enabled }}
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 0
+      - name: Set enabled  # publishes the script's stdout as step output "enabled"
+        id: set-enabled
+        run: |
+          enabled=$(python scripts/github_actions/run-nightly-build.py)
+          echo "enabled: $enabled"
+          echo "enabled=${enabled}" >> "$GITHUB_OUTPUT"  # replaces deprecated ::set-output
+
+  generate_build_matrix:
+    needs: enable_nightly_build
+    if: needs.enable_nightly_build.outputs.enabled == 'true'
+    # see https://github.com/pytorch/pytorch/pull/50633
+    runs-on: ubuntu-latest
+    outputs:
+      matrix: ${{ steps.set-matrix.outputs.matrix }}
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 0
+      - name: Generating build matrix
+        id: set-matrix
+        run: |
+          # Run the generator once, print the result for debugging, then export it
+          MATRIX=$(python scripts/github_actions/generate_build_matrix.py --enable-cuda --for-windows)
+          echo "${MATRIX}"
+          echo "matrix=${MATRIX}" >> "$GITHUB_OUTPUT"  # replaces deprecated ::set-output
+
+  run_nightly_tests_windows_pip_cuda:
+    needs: generate_build_matrix
+    runs-on: windows-2019
+    strategy:
+      fail-fast: false
+      matrix:
+        ${{ fromJson(needs.generate_build_matrix.outputs.matrix) }}
+
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 0
+
+      # see https://github.com/microsoft/setup-msbuild
+      - name: Add msbuild to PATH
+        uses: microsoft/setup-msbuild@v1.0.2
+
+      - name: Setup Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Display Python version
+        run: python -c "import sys; print(sys.version)"
+
+      # See https://github.com/Jimver/cuda-toolkit/blob/master/src/links/windows-links.ts
+      # for available CUDA versions
+      - uses: Jimver/cuda-toolkit@v0.2.7
+        id: cuda-toolkit
+        with:
+          cuda: ${{ matrix.cuda }}
+
+      - name: Display CUDA version
+        run: |
+          echo "Installed cuda version is: ${{ steps.cuda-toolkit.outputs.cuda }}"
+          echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
+          nvcc --version
+
+      - name: Remove CUDA installation package
+        shell: bash
+        run: |
+          rm "C:/hostedtoolcache/windows/cuda_installer-windows/${{ matrix.cuda }}/x64/cuda_installer_${{ matrix.cuda }}.exe"
+
+      - name: Download cuDNN
+        shell: bash
+        run: |
+          GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/cudnn-for-windows
+          cd cudnn-for-windows
+          git lfs pull --include="cudnn-windows-x86_64-8.4.1.50_cuda11.6-archive.zip"
+          unzip cudnn-windows-x86_64-8.4.1.50_cuda11.6-archive.zip
+          rm cudnn-windows-x86_64-8.4.1.50_cuda11.6-archive.zip
+          ls -lh *
+          ls -lh */*
+
+          echo "PWD: $PWD"
+
+      - name: Install PyTorch ${{ matrix.torch }}
+        shell: bash
+        run: |
+          version=${{ matrix.cuda }}
+          major=${version:0:2}
+          minor=${version:3:1}
+          v=${major}${minor}
+          if [ ${v} -eq 102 ]; then v=""; else v="+cu${v}"; fi
+
+          python3 -m pip install -qq --upgrade pip
+          python3 -m pip install -qq wheel twine numpy typing_extensions
+          python3 -m pip install -qq dataclasses soundfile numpy
+          python3 -m pip install -qq torch==${{ matrix.torch }}${v} -f https://download.pytorch.org/whl/torch_stable.html numpy
+
+          python3 -c "import torch; print('torch version:', torch.__version__)"
+
+          python3 -m torch.utils.collect_env
+
+      - name: Install from PyPI  # BUILD_TYPE was never defined in this workflow; Release is pinned explicitly
+        shell: bash
+        run: |
+          echo "PWD: $PWD"
+          export KALDIFEAT_CMAKE_ARGS="-DCMAKE_BUILD_TYPE=Release -DCUDNN_INCLUDE_PATH=d:/a/kaldifeat/kaldifeat/cudnn-for-windows/cudnn-windows-x86_64-8.4.1.50_cuda11.6-archive/include -DCUDNN_LIBRARY_PATH=d:/a/kaldifeat/kaldifeat/cudnn-for-windows/cudnn-windows-x86_64-8.4.1.50_cuda11.6-archive/lib/cudnn.lib"
+          export KALDIFEAT_MAKE_ARGS="-j 3"
+          pip install --verbose kaldifeat
+
+      - name: Run tests
+        shell: bash
+        run: |
+          cd kaldifeat/python/tests
+
+          python3 ./test_fbank.py
+          python3 ./test_fbank_options.py
+          python3 ./test_frame_extraction_options.py
+          python3 ./test_mel_bank_options.py
+          python3 ./test_mfcc.py
+          python3 ./test_mfcc_options.py
+          python3 ./test_plp.py
+          python3 ./test_plp_options.py
+          python3 ./test_spectrogram.py
+          python3 ./test_spectrogram_options.py
diff --git a/.github/workflows/run-tests-macos-cpu.yml b/.github/workflows/run-tests-macos-cpu.yml
new file mode 100644
index 0000000..6707c7e
--- /dev/null
+++ b/.github/workflows/run-tests-macos-cpu.yml
@@ -0,0 +1,85 @@
+# Copyright      2021  Xiaomi Corp.       (author: Fangjun Kuang)
+
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+name: Run tests macos cpu
+
+on:
+  push:
+    branches:
+      - master
+
+  pull_request:
+    branches:
+      - master
+
+jobs:
+  generate_build_matrix:
+    # see https://github.com/pytorch/pytorch/pull/50633
+    runs-on: ubuntu-latest
+    outputs:
+      matrix: ${{ steps.set-matrix.outputs.matrix }}
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 0
+      - name: Generating build matrix
+        id: set-matrix
+        run: |
+          # Run the generator once, print the result for debugging, then export it
+          MATRIX=$(python scripts/github_actions/generate_build_matrix.py --test-only-latest-torch)
+          echo "${MATRIX}"
+          echo "matrix=${MATRIX}" >> "$GITHUB_OUTPUT"  # replaces deprecated ::set-output
+
+  run_tests_macos_cpu:
+    needs: generate_build_matrix
+    runs-on: macos-10.15
+    strategy:
+      fail-fast: false
+      matrix:
+        ${{ fromJson(needs.generate_build_matrix.outputs.matrix) }}
+
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 0
+
+      - name: Setup Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install PyTorch ${{ matrix.torch }}
+        shell: bash
+        run: |
+          python3 -m pip install -qq --upgrade pip
+          python3 -m pip install -qq wheel twine typing_extensions soundfile numpy
+          python3 -m pip install -qq torch==${{ matrix.torch }} -f https://download.pytorch.org/whl/torch_stable.html
+
+          python3 -c "import torch; print('torch version:', torch.__version__)"
+
+      - name: Build
+        shell: bash
+        run: |
+          mkdir build_release
+          cd build_release
+          cmake ..
+          make VERBOSE=1 -j3
+
+      - name: Run tests
+        shell: bash
+        run: |
+          cd build_release
+          ctest --output-on-failure
diff --git a/.github/workflows/run-tests-ubuntu-cpu.yml b/.github/workflows/run-tests-ubuntu-cpu.yml
new file mode 100644
index 0000000..4337cce
--- /dev/null
+++ b/.github/workflows/run-tests-ubuntu-cpu.yml
@@ -0,0 +1,94 @@
+# Copyright      2021  Xiaomi Corp.       (author: Fangjun Kuang)
+
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+name: Run tests ubuntu cpu
+
+on:
+  push:
+    branches:
+      - master
+
+  pull_request:
+    branches:
+      - master
+
+jobs:
+  generate_build_matrix:
+    # see https://github.com/pytorch/pytorch/pull/50633
+    runs-on: ubuntu-latest
+    outputs:
+      matrix: ${{ steps.set-matrix.outputs.matrix }}
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 0
+      - name: Generating build matrix
+        id: set-matrix
+        run: |
+          # Run the generator once, print the result for debugging, then export it
+          MATRIX=$(python scripts/github_actions/generate_build_matrix.py --test-only-latest-torch)
+          echo "${MATRIX}"
+          echo "matrix=${MATRIX}" >> "$GITHUB_OUTPUT"  # replaces deprecated ::set-output
+
+  run_tests_ubuntu_cpu:
+    needs: generate_build_matrix
+    runs-on: ubuntu-18.04
+    strategy:
+      fail-fast: false
+      matrix:
+        ${{ fromJson(needs.generate_build_matrix.outputs.matrix) }}
+
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 0
+
+      - name: Setup Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install GCC 7  # exports CC/CXX so later steps in this job use gcc-7/g++-7
+        run: |
+          sudo apt-get update && sudo apt-get install -y gcc-7 g++-7  # refresh the index first; a stale one can make install fail
+          echo "CC=/usr/bin/gcc-7" >> "$GITHUB_ENV"
+          echo "CXX=/usr/bin/g++-7" >> "$GITHUB_ENV"
+
+      - name: Install PyTorch ${{ matrix.torch }}
+        shell: bash
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y libsndfile1-dev libsndfile1 ffmpeg
+          python3 -m pip install --upgrade pip
+          python3 -m pip install wheel twine typing_extensions soundfile
+          python3 -m pip install bs4 requests tqdm numpy
+          python3 -m pip install -qq torch==${{ matrix.torch }}+cpu -f https://download.pytorch.org/whl/torch_stable.html
+
+          python3 -c "import torch; print('torch version:', torch.__version__)"
+
+      - name: Build
+        shell: bash
+        run: |
+          mkdir build_release
+          cd build_release
+          cmake ..
+          make VERBOSE=1 -j3
+
+      - name: Run tests
+        shell: bash
+        run: |
+          cd build_release
+          ctest --output-on-failure
diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests-ubuntu-cuda.yml
similarity index 71%
rename from .github/workflows/run-tests.yml
rename to .github/workflows/run-tests-ubuntu-cuda.yml
index 656a792..51251b5 100644
--- a/.github/workflows/run-tests.yml
+++ b/.github/workflows/run-tests-ubuntu-cuda.yml
@@ -14,26 +14,43 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-name: Run tests
+name: Run tests ubuntu cuda
 
 on:
   push:
     branches:
       - master
+
   pull_request:
     branches:
       - master
 
 jobs:
-  run_tests:
-    runs-on: ${{ matrix.os }}
+  generate_build_matrix:
+    # see https://github.com/pytorch/pytorch/pull/50633
+    runs-on: ubuntu-latest
+    outputs:
+      matrix: ${{ steps.set-matrix.outputs.matrix }}
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 0
+      - name: Generating build matrix
+        id: set-matrix
+        run: |
+          # Run the generator once, print the result for debugging, then export it
+          MATRIX=$(python scripts/github_actions/generate_build_matrix.py --enable-cuda --test-only-latest-torch)
+          echo "${MATRIX}"
+          echo "matrix=${MATRIX}" >> "$GITHUB_OUTPUT"  # replaces deprecated ::set-output
+
+  run_tests_ubuntu_cuda:
+    needs: generate_build_matrix
+    runs-on: ubuntu-18.04
     strategy:
       fail-fast: false
       matrix:
-        os: [ubuntu-18.04, macos-10.15]
-        cuda: ["10.1"]
-        torch: ["1.7.1"]
-        python-version: [3.6, 3.7, 3.8]
+        ${{ fromJson(needs.generate_build_matrix.outputs.matrix) }}
+
     steps:
       - uses: actions/checkout@v2
         with:
@@ -53,7 +70,6 @@ jobs:
 
 
       - name: Install CUDA Toolkit ${{ matrix.cuda }}
-        if: startsWith(matrix.os, 'ubuntu')
         shell: bash
         env:
           cuda: ${{ matrix.cuda }}
@@ -64,39 +80,26 @@ jobs:
           echo "LD_LIBRARY_PATH=${CUDA_HOME}/lib:${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}" >> $GITHUB_ENV
 
       - name: Display NVCC version
-        if: startsWith(matrix.os, 'ubuntu')
         run: |
           which nvcc
           nvcc --version
 
       - name: Install PyTorch ${{ matrix.torch }}
-        if: startsWith(matrix.os, 'ubuntu')
         env:
           cuda: ${{ matrix.cuda }}
           torch: ${{ matrix.torch }}
         shell: bash
         run: |
-          sudo apt update
-          sudo apt install libsndfile1-dev libsndfile1 ffmpeg
+          sudo apt-get update
+          sudo apt-get install -y libsndfile1-dev libsndfile1 ffmpeg
           python3 -m pip install --upgrade pip
           python3 -m pip install wheel twine typing_extensions soundfile
-          python3 -m pip install bs4 requests tqdm
+          python3 -m pip install bs4 requests tqdm numpy
 
           ./scripts/github_actions/install_torch.sh
           python3 -c "import torch; print('torch version:', torch.__version__)"
 
-      - name: Install PyTorch ${{ matrix.torch }}
-        if: startsWith(matrix.os, 'macos')
-        shell: bash
-        run: |
-          python3 -m pip install -qq --upgrade pip
-          python3 -m pip install -qq wheel twine typing_extensions soundfile
-          python3 -m pip install -qq torch==${{ matrix.torch }}
-
-          python3 -c "import torch; print('torch version:', torch.__version__)"
-
       - name: Download cudnn 8.0
-        if: startsWith(matrix.os, 'ubuntu')
         env:
           cuda: ${{ matrix.cuda }}
         run: |
@@ -108,10 +111,10 @@ jobs:
           mkdir build_release
           cd build_release
           cmake ..
-          make VERBOSE=1
+          make VERBOSE=1 -j3
 
       - name: Run tests
         shell: bash
         run: |
           cd build_release
-          ctest -R py --output-on-failure
+          ctest --output-on-failure
diff --git a/.github/workflows/build_windows.yml b/.github/workflows/run-tests-windows-cpu.yml
similarity index 59%
rename from .github/workflows/build_windows.yml
rename to .github/workflows/run-tests-windows-cpu.yml
index ea8ae90..f735ac3 100644
--- a/.github/workflows/build_windows.yml
+++ b/.github/workflows/run-tests-windows-cpu.yml
@@ -1,23 +1,56 @@
-name: build
+# Copyright      2021  Xiaomi Corp.       (author: Fangjun Kuang)
+
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+name: Run tests windows cpu
 
 on:
   push:
     branches:
       - master
+
   pull_request:
     branches:
       - master
 
 jobs:
-  build-windows:
+  generate_build_matrix:
+    # see https://github.com/pytorch/pytorch/pull/50633
+    runs-on: ubuntu-latest
+    outputs:
+      matrix: ${{ steps.set-matrix.outputs.matrix }}
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 0
+      - name: Generating build matrix
+        id: set-matrix
+        run: |
+          # Run the generator once, print the result for debugging, then export it
+          MATRIX=$(python scripts/github_actions/generate_build_matrix.py --test-only-latest-torch)
+          echo "${MATRIX}"
+          echo "matrix=${MATRIX}" >> "$GITHUB_OUTPUT"  # replaces deprecated ::set-output
+
+  run_tests_windows_cpu:
     # see https://github.com/actions/virtual-environments/blob/win19/20210525.0/images/win/Windows2019-Readme.md
-    runs-on: ${{ matrix.os }}
+    needs: generate_build_matrix
+    runs-on: windows-2019
     strategy:
       fail-fast: false
       matrix:
-        os: [windows-2019]
-        torch: ["1.6.0", "1.7.0", "1.7.1", "1.8.0", "1.8.1", "1.9.0", "1.10.0", "1.11.0"]
-        python-version: [3.6, 3.7, 3.8, 3.9]
+        ${{ fromJson(needs.generate_build_matrix.outputs.matrix) }}
 
     steps:
       - uses: actions/checkout@v2
diff --git a/.github/workflows/run-tests-windows-cuda.yml b/.github/workflows/run-tests-windows-cuda.yml
new file mode 100644
index 0000000..6d04dd5
--- /dev/null
+++ b/.github/workflows/run-tests-windows-cuda.yml
@@ -0,0 +1,173 @@
+# Copyright      2021  Xiaomi Corp.       (author: Fangjun Kuang)
+
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+name: Run tests windows cuda
+
+on:
+  push:
+    branches:
+      - master
+
+  pull_request:
+    branches:
+      - master
+
+jobs:
+  generate_build_matrix:
+    # see https://github.com/pytorch/pytorch/pull/50633
+    runs-on: ubuntu-latest
+    outputs:
+      matrix: ${{ steps.set-matrix.outputs.matrix }}
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 0
+      - name: Generating build matrix
+        id: set-matrix
+        run: |
+          # Run the generator once, print the result for debugging, then export it
+          MATRIX=$(python scripts/github_actions/generate_build_matrix.py --enable-cuda --for-windows --test-only-latest-torch)
+          echo "${MATRIX}"
+          echo "matrix=${MATRIX}" >> "$GITHUB_OUTPUT"  # replaces deprecated ::set-output
+
+  run_tests_windows_cuda:
+    needs: generate_build_matrix
+    runs-on: windows-2019
+    strategy:
+      fail-fast: false
+      matrix:
+        ${{ fromJson(needs.generate_build_matrix.outputs.matrix) }}
+
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 0
+
+      # see https://github.com/microsoft/setup-msbuild
+      - name: Add msbuild to PATH
+        uses: microsoft/setup-msbuild@v1.0.2
+
+      - name: Setup Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Display Python version
+        run: python -c "import sys; print(sys.version)"
+
+      # See https://github.com/Jimver/cuda-toolkit/blob/master/src/links/windows-links.ts
+      # for available CUDA versions
+      - uses: Jimver/cuda-toolkit@v0.2.7
+        id: cuda-toolkit
+        with:
+          cuda: ${{ matrix.cuda }}
+
+      - name: Display CUDA version
+        shell: bash
+        run: |
+          echo "Installed cuda version is: ${{ steps.cuda-toolkit.outputs.cuda }}"
+          echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
+          nvcc --version
+
+      - name: Remove CUDA installation package
+        shell: bash
+        run: |
+          rm "C:/hostedtoolcache/windows/cuda_installer-windows/${{ matrix.cuda }}/x64/cuda_installer_${{ matrix.cuda }}.exe"
+
+      - name: Download cuDNN
+        shell: bash
+        run: |
+          GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/cudnn-for-windows
+          cd cudnn-for-windows
+          git lfs pull --include="cudnn-windows-x86_64-8.4.1.50_cuda11.6-archive.zip"
+          unzip cudnn-windows-x86_64-8.4.1.50_cuda11.6-archive.zip
+          rm cudnn-windows-x86_64-8.4.1.50_cuda11.6-archive.zip
+          ls -lh *
+          ls -lh */*
+
+          echo "PWD: $PWD"
+
+      - name: Install PyTorch ${{ matrix.torch }}
+        shell: bash
+        run: |
+          version=${{ matrix.cuda }}
+          major=${version:0:2}
+          minor=${version:3:1}
+          v=${major}${minor}
+          if [ ${v} -eq 102 ]; then v=""; else v="+cu${v}"; fi
+
+          python3 -m pip install -qq --upgrade pip
+          python3 -m pip install -qq wheel twine numpy typing_extensions
+          python3 -m pip install -qq dataclasses soundfile numpy
+          python3 -m pip install -qq torch==${{ matrix.torch }}${v} -f https://download.pytorch.org/whl/torch_stable.html numpy
+
+          python3 -c "import torch; print('torch version:', torch.__version__)"
+
+          python3 -m torch.utils.collect_env
+
+      - name: Display CMake version
+        run: |
+          cmake --version
+          cmake --help
+
+      - name: Configure CMake  # BUILD_TYPE was never defined in this workflow; Release matches the later --config Release
+        shell: bash
+        run: |
+          echo "PWD: $PWD"
+          ls -lh
+
+          mkdir build_release
+          cd build_release
+          cmake -DCMAKE_BUILD_TYPE=Release -DCUDNN_INCLUDE_PATH=d:/a/kaldifeat/kaldifeat/cudnn-for-windows/cudnn-windows-x86_64-8.4.1.50_cuda11.6-archive/include -DCUDNN_LIBRARY_PATH=d:/a/kaldifeat/kaldifeat/cudnn-for-windows/cudnn-windows-x86_64-8.4.1.50_cuda11.6-archive/lib/cudnn.lib ..
+          ls -lh
+
+      - name: Build kaldifeat
+        shell: bash
+        run: |
+          cd build_release
+          cmake --build . --target _kaldifeat --config Release
+
+      - name: Display generated files
+        shell: bash
+        run: |
+          cd build_release
+          ls -lh lib/*/*
+
+      - name: Build wheel  # builds the wheel, installs it, and imports kaldifeat as a smoke test
+        shell: bash
+        run: |
+          echo $PWD
+          ls -lh ./*
+          export KALDIFEAT_CMAKE_ARGS="-DCMAKE_BUILD_TYPE=Release -DCUDNN_INCLUDE_PATH=d:/a/kaldifeat/kaldifeat/cudnn-for-windows/cudnn-windows-x86_64-8.4.1.50_cuda11.6-archive/include -DCUDNN_LIBRARY_PATH=d:/a/kaldifeat/kaldifeat/cudnn-for-windows/cudnn-windows-x86_64-8.4.1.50_cuda11.6-archive/lib/cudnn.lib"
+          python3 setup.py bdist_wheel
+          ls -lh dist/
+          pip install ./dist/*.whl
+          python3 -c "import kaldifeat; print(kaldifeat.__version__)"
+
+      - name: Upload Wheel  # NOTE(review): matrix.os looked unset here (runs-on is hard-coded); confirm against generate_build_matrix.py
+        uses: actions/upload-artifact@v2
+        with:
+          name: python-${{ matrix.python-version }}-${{ runner.os }}-torch-${{ matrix.torch }}-cuda-${{ matrix.cuda }}
+          path: dist/*.whl
+
+      - name: Build tests
+        shell: bash
+        run: |
+          cd build_release
+          cmake --build . --target ALL_BUILD --config Release
+          ls -lh bin/*/*
+          ctest -C Release --verbose --output-on-failure
diff --git a/.gitignore b/.gitignore
index 52da5e5..d6c034b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,3 +5,4 @@ dist/
 __pycache__/
 test-1hour.wav
 path.sh
+torch_version.py
diff --git a/CMakeLists.txt b/CMakeLists.txt
index fc4854a..f47e27d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -6,7 +6,8 @@ project(kaldifeat)
 
 # remember to change the version in
 # scripts/conda/kaldifeat/meta.yaml
-set(kaldifeat_VERSION "1.14")
+# scripts/conda-cpu/kaldifeat/meta.yaml
+set(kaldifeat_VERSION "1.16")
 
 set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib")
 set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib")
@@ -51,5 +52,23 @@ if(kaldifeat_BUILD_TESTS)
   enable_testing()
 endif()
 
+# TORCH_VERSION is defined in cmake/torch.cmake
+configure_file(
+  ${CMAKE_SOURCE_DIR}/kaldifeat/python/kaldifeat/torch_version.py.in
+  ${CMAKE_SOURCE_DIR}/kaldifeat/python/kaldifeat/torch_version.py @ONLY
+)
+
+if(MSVC)
+  # The /wd flags below use MSVC syntax, so guard on the compiler rather than on the OS
+  # 4624: destructor was implicitly defined as deleted because a base class destructor is inaccessible or deleted
+  set(disabled_warnings
+      /wd4624
+  )
+  message(STATUS "Disabled warnings: ${disabled_warnings}")
+  foreach(w IN LISTS disabled_warnings)
+    string(APPEND CMAKE_CXX_FLAGS " ${w} ")
+  endforeach()
+endif()
+
 include_directories(${CMAKE_SOURCE_DIR})
 add_subdirectory(kaldifeat)
diff --git a/README.md b/README.md
index 170c0e7..ab6c794 100644
--- a/README.md
+++ b/README.md
@@ -1,12 +1,16 @@
 # kaldifeat
 
 <div align="center">
-<img src="/doc/source/images/os.svg">
+<img src="/doc/source/images/os-green.svg">
 <img src="/doc/source/images/python_ge_3.6-blue.svg">
 <img src="/doc/source/images/pytorch_ge_1.5.0-green.svg">
 <img src="/doc/source/images/cuda_ge_10.1-orange.svg">
 </div>
 
+[![Documentation Status](https://github.com/csukuangfj/kaldifeat/actions/workflows/build-doc.yml/badge.svg)](https://csukuangfj.github.io/kaldifeat/)
+
+**Documentation**: <https://csukuangfj.github.io/kaldifeat>
+
 <sub>
 <table>
 <tr>
@@ -268,67 +272,15 @@ See <https://github.com/k2-fsa/k2/blob/v2.0-pre/k2/torch/csrc/features.cu>.
 
 See <https://github.com/lhotse-speech/lhotse/blob/master/lhotse/features/kaldifeat.py>.
 
+## sherpa
+
+[sherpa](https://github.com/k2-fsa/sherpa) uses kaldifeat for streaming feature
+extraction.
+
+See <https://github.com/k2-fsa/sherpa/blob/master/sherpa/bin/pruned_stateless_emformer_rnnt2/decode.py>.
+
 # Installation
 
-## From conda (Only for Linux + CUDA)
-
-Supported versions of Python, PyTorch, and CUDA toolkit are listed below:
-
-[![Supported Python versions](/doc/source/images/python-3.6_3.7_3.8-blue.svg)](/doc/source/images/python-3.6_3.7_3.8-blue.svg)
-[![Supported PyTorch versions](/doc/source/images/pytorch-1.6.0_1.7.0_1.7.1_1.8.0_1.8.1_1.9.0-green.svg)](/doc/source/images/pytorch-1.6.0_1.7.0_1.7.1_1.8.0_1.8.1_1.9.0-green.svg)
-[![Supported CUDA versions](/doc/source/images/cuda-10.1_10.2_11.0_11.1-orange.svg)](/doc/source/images/cuda-10.1_10.2_11.0_11.1-orange.svg)
-
-```bash
-conda install -c kaldifeat -c pytorch -c conda-forge kaldifeat python=3.8 cudatoolkit=11.1 pytorch=1.8.1
-```
-
-You can select the supported Python version, CUDA toolkit version and PyTorch version as you wish.
-
-**Note:** If you want a CPU only version or want to install `kaldifeat` on other operating systems,
-e.g., macOS, please use `pip install` or compile `kaldifeat` from source.
-
-
-## From PyPi with pip
-
-You need to install PyTorch and CMake first.
-CMake 3.11 is known to work. Other CMake versions may also work.
-PyTorch 1.6.0 and above are known to work. Other PyTorch versions may also work.
-
-```bash
-pip install -v kaldifeat
-```
-
-## From source
-
-The following are the commands to compile `kaldifeat` from source.
-We assume that you have installed `CMake` and PyTorch.
-CMake 3.11 is known to work. Other CMake versions may also work.
-PyTorch 1.6.0 and above are known to work. Other PyTorch versions may also work.
-
-```bash
-mkdir /some/path
-git clone https://github.com/csukuangfj/kaldifeat.git
-cd kaldifeat
-python setup.py install
-```
-
-To test whether `kaldifeat` was installed successfully, you can run:
-```bash
-python3 -c "import kaldifeat; print(kaldifeat.__version__)"
-```
-
-## FAQs
-
-### How to install a CUDA version
-
-There are two approaches:
-
-  - (1) Install using `conda`. It always installs a CUDA version of kaldifeat.
-  - (2) Install a CUDA version of PyTorch and then install kaldifeat from source
-    or use `pip install kaldifeat`.
-
-
-### How to install a CPU-only version
-
-You have to first install a CPU-only version of PyTorch and then install kaldifeat
-from source or use `pip install kaldifeat`.
+Refer to
+<https://csukuangfj.github.io/kaldifeat>
+for installation.
diff --git a/cmake/cmake_extension.py b/cmake/cmake_extension.py
index 8bd21ca..0d14815 100644
--- a/cmake/cmake_extension.py
+++ b/cmake/cmake_extension.py
@@ -128,3 +128,11 @@ class BuildExtension(build_ext):
         for so in lib_so:
             print(f"Copying {so} to {self.build_lib}/")
             shutil.copy(f"{so}", f"{self.build_lib}/")
+
+        print(
+            f"Copying {kaldifeat_dir}/kaldifeat/python/kaldifeat/torch_version.py to {self.build_lib}/kaldifeat"  # noqa
+        )
+        shutil.copy(
+            f"{kaldifeat_dir}/kaldifeat/python/kaldifeat/torch_version.py",
+            f"{self.build_lib}/kaldifeat",
+        )
diff --git a/cmake/pybind11.cmake b/cmake/pybind11.cmake
index 4cad4e8..aa99e6a 100644
--- a/cmake/pybind11.cmake
+++ b/cmake/pybind11.cmake
@@ -8,12 +8,9 @@ function(download_pybind11)
 
   include(FetchContent)
 
-  set(pybind11_URL  "https://github.com/pybind/pybind11/archive/v2.6.0.tar.gz")
-  set(pybind11_HASH "SHA256=90b705137b69ee3b5fc655eaca66d0dc9862ea1759226f7ccd3098425ae69571")
+  set(pybind11_URL  "https://github.com/pybind/pybind11/archive/v2.9.2.tar.gz")
+  set(pybind11_HASH "SHA256=6bd528c4dbe2276635dc787b6b1f2e5316cf6b49ee3e150264e455a0d68d19c1")
 
-  set(double_quotes "\"")
-  set(dollar "\$")
-  set(semicolon "\;")
   FetchContent_Declare(pybind11
     URL               ${pybind11_URL}
     URL_HASH          ${pybind11_HASH}
diff --git a/doc/source/_static/.gitkeep b/doc/source/_static/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/doc/source/code/test_fbank.py b/doc/source/code/test_fbank.py
deleted file mode 100755
index 0f39a1c..0000000
--- a/doc/source/code/test_fbank.py
+++ /dev/null
@@ -1,72 +0,0 @@
-#!/usr/bin/env python3
-
-# Copyright      2021  Xiaomi Corporation (authors: Fangjun Kuang)
-
-import numpy as np
-import soundfile as sf
-import torch
-
-import kaldifeat
-
-
-def read_wave(filename) -> torch.Tensor:
-    """Read a wave file and return it as a 1-D tensor.
-
-    Note:
-      You don't need to scale it to [-32768, 32767].
-      We use scaling here to follow the approach in Kaldi.
-
-    Args:
-      filename:
-        Filename of a sound file.
-    Returns:
-      Return a 1-D tensor containing audio samples.
-    """
-    with sf.SoundFile(filename) as sf_desc:
-        sampling_rate = sf_desc.samplerate
-        assert sampling_rate == 16000
-        data = sf_desc.read(dtype=np.float32, always_2d=False)
-    data *= 32768
-    return torch.from_numpy(data)
-
-
-def test_fbank():
-    device = torch.device("cpu")
-    if torch.cuda.is_available():
-        device = torch.device("cuda", 0)
-
-    wave0 = read_wave("test_data/test.wav")
-    wave1 = read_wave("test_data/test2.wav")
-
-    wave0 = wave0.to(device)
-    wave1 = wave1.to(device)
-
-    opts = kaldifeat.FbankOptions()
-    opts.frame_opts.dither = 0
-    opts.device = device
-
-    fbank = kaldifeat.Fbank(opts)
-
-    # We can compute fbank features in batches
-    features = fbank([wave0, wave1])
-    assert isinstance(features, list), f"{type(features)}"
-    assert len(features) == 2
-
-    # We can also compute fbank features for a single wave
-    features0 = fbank(wave0)
-    features1 = fbank(wave1)
-
-    assert torch.allclose(features[0], features0)
-    assert torch.allclose(features[1], features1)
-
-    # To compute fbank features for only a specified frame
-    audio_frames = fbank.convert_samples_to_frames(wave0)
-    feature_frame_1 = fbank.compute(audio_frames[1])
-    feature_frame_10 = fbank.compute(audio_frames[10])
-
-    assert torch.allclose(features0[1], feature_frame_1)
-    assert torch.allclose(features0[10], feature_frame_10)
-
-
-if __name__ == "__main__":
-    test_fbank()
diff --git a/doc/source/conf.py b/doc/source/conf.py
index 2ec9ca6..fef6d6f 100644
--- a/doc/source/conf.py
+++ b/doc/source/conf.py
@@ -59,7 +59,7 @@ templates_path = ["_templates"]
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
 # This pattern also affects html_static_path and html_extra_path.
-exclude_patterns = []
+exclude_patterns = ["images/*.md"]
 
 source_suffix = {
     ".rst": "restructuredtext",
@@ -102,3 +102,35 @@ html_theme_options = {
     "prev_next_buttons_location": "bottom",
     "style_external_links": True,
 }
+
+rst_epilog = """
+.. _kaldifeat: https://github.com/csukuangfj/kaldifeat
+.. _Kaldi: https://github.com/kaldi-asr/kaldi
+.. _PyTorch: https://pytorch.org/
+.. _kaldifeat.Fbank: https://github.com/csukuangfj/kaldifeat/blob/master/kaldifeat/python/kaldifeat/fbank.py#L10
+.. _kaldifeat.Mfcc: https://github.com/csukuangfj/kaldifeat/blob/master/kaldifeat/python/kaldifeat/mfcc.py#L10
+.. _kaldifeat.Plp: https://github.com/csukuangfj/kaldifeat/blob/master/kaldifeat/python/kaldifeat/plp.py#L10
+.. _kaldifeat.Spectrogram: https://github.com/csukuangfj/kaldifeat/blob/master/kaldifeat/python/kaldifeat/spectrogram.py#L9
+.. _kaldifeat.OnlineFbank: https://github.com/csukuangfj/kaldifeat/blob/master/kaldifeat/python/kaldifeat/fbank.py#L16
+.. _kaldifeat.OnlineMfcc: https://github.com/csukuangfj/kaldifeat/blob/master/kaldifeat/python/kaldifeat/mfcc.py#L16
+.. _kaldifeat.OnlinePlp: https://github.com/csukuangfj/kaldifeat/blob/master/kaldifeat/python/kaldifeat/plp.py#L16
+.. _compute-fbank-feats: https://github.com/kaldi-asr/kaldi/blob/master/src/featbin/compute-fbank-feats.cc
+.. _compute-mfcc-feats: https://github.com/kaldi-asr/kaldi/blob/master/src/featbin/compute-mfcc-feats.cc
+.. _compute-plp-feats: https://github.com/kaldi-asr/kaldi/blob/master/src/featbin/compute-plp-feats.cc
+.. _compute-spectrogram-feats: https://github.com/kaldi-asr/kaldi/blob/master/src/featbin/compute-spectrogram-feats.cc
+.. _kaldi::OnlineFbank: https://github.com/kaldi-asr/kaldi/blob/master/src/feat/online-feature.h#L160
+.. _kaldi::OnlineMfcc: https://github.com/kaldi-asr/kaldi/blob/master/src/feat/online-feature.h#L158
+.. _kaldi::OnlinePlp: https://github.com/kaldi-asr/kaldi/blob/master/src/feat/online-feature.h#L159
+.. _kaldifeat.FbankOptions: https://github.com/csukuangfj/kaldifeat/blob/master/kaldifeat/csrc/feature-fbank.h#L19
+.. _kaldi::FbankOptions: https://github.com/kaldi-asr/kaldi/blob/master/src/feat/feature-fbank.h#L41
+.. _kaldifeat.MfccOptions: https://github.com/csukuangfj/kaldifeat/blob/master/kaldifeat/csrc/feature-mfcc.h#L22
+.. _kaldi::MfccOptions: https://github.com/kaldi-asr/kaldi/blob/master/src/feat/feature-mfcc.h#L38
+.. _kaldifeat.PlpOptions: https://github.com/csukuangfj/kaldifeat/blob/master/kaldifeat/csrc/feature-plp.h#L24
+.. _kaldi::PlpOptions: https://github.com/kaldi-asr/kaldi/blob/master/src/feat/feature-plp.h#L42
+.. _kaldifeat.SpectrogramOptions: https://github.com/csukuangfj/kaldifeat/blob/master/kaldifeat/csrc/feature-spectrogram.h#L18
+.. _kaldi::SpectrogramOptions: https://github.com/kaldi-asr/kaldi/blob/master/src/feat/feature-spectrogram.h#L38
+.. _kaldifeat.FrameExtractionOptions: https://github.com/csukuangfj/kaldifeat/blob/master/kaldifeat/csrc/feature-window.h#L30
+.. _kaldi::FrameExtractionOptions: https://github.com/kaldi-asr/kaldi/blob/master/src/feat/feature-window.h#L35
+.. _kaldifeat.MelBanksOptions: https://github.com/csukuangfj/kaldifeat/blob/master/kaldifeat/csrc/mel-computations.h#L17
+.. _kaldi::MelBanksOptions: https://github.com/kaldi-asr/kaldi/blob/master/src/feat/mel-computations.h#L43
+"""
diff --git a/doc/source/images/README.md b/doc/source/images/README.md
new file mode 100644
index 0000000..542998b
--- /dev/null
+++ b/doc/source/images/README.md
@@ -0,0 +1,8 @@
+## File description
+
+<https://shields.io/> is used to create the following files:
+
+- ./os-green.svg
+- ./python_ge_3.6-blue.svg
+- ./cuda_ge_10.1-orange.svg
+- ./pytorch_ge_1.5.0-green.svg
diff --git a/doc/source/images/os-green.svg b/doc/source/images/os-green.svg
new file mode 100644
index 0000000..b78017a
--- /dev/null
+++ b/doc/source/images/os-green.svg
@@ -0,0 +1 @@
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="176" height="20" role="img" aria-label="os: Linux | macOS | Windows"><title>os: Linux | macOS | Windows</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="176" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="23" height="20" fill="#555"/><rect x="23" width="153" height="20" fill="#97ca00"/><rect width="176" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="125" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="130">os</text><text x="125" y="140" transform="scale(.1)" fill="#fff" textLength="130">os</text><text aria-hidden="true" x="985" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="1430">Linux | macOS | Windows</text><text x="985" y="140" transform="scale(.1)" fill="#fff" textLength="1430">Linux | macOS | Windows</text></g></svg>
diff --git a/doc/source/images/os.svg b/doc/source/images/os.svg
deleted file mode 100644
index 314bf44..0000000
--- a/doc/source/images/os.svg
+++ /dev/null
@@ -1 +0,0 @@
-<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="118" height="20" role="img" aria-label="OS: Linux | macOS"><title>OS: Linux | macOS</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="118" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="27" height="20" fill="#555"/><rect x="27" width="91" height="20" fill="#4c1"/><rect width="118" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="145" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="170">OS</text><text x="145" y="140" transform="scale(.1)" fill="#fff" textLength="170">OS</text><text aria-hidden="true" x="715" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="810">Linux | macOS</text><text x="715" y="140" transform="scale(.1)" fill="#fff" textLength="810">Linux | macOS</text></g></svg>
diff --git a/doc/source/index.rst b/doc/source/index.rst
index 436e1b8..caa50b5 100644
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -6,19 +6,11 @@
 kaldifeat
 =========
 
-`kaldifeat <https://github.com/csukuangfj/kaldifeat>`_ implements
-feature extraction algorithms **compatible** with kaldi using PyTorch, supporting CUDA
-as well as autograd.
-
-Currently, only fbank features are supported.
-It can produce the same feature output as ``compute-fbank-feats`` (from kaldi)
-when given the same options.
-
-
 
 .. toctree::
    :maxdepth: 2
-   :caption: Contents:
+   :caption: Contents
 
+   intro
    installation
-   usage
+   usage/index
diff --git a/doc/source/installation.rst b/doc/source/installation.rst
index 9e4bfc8..5baa217 100644
--- a/doc/source/installation.rst
+++ b/doc/source/installation.rst
@@ -1,19 +1,38 @@
 Installation
 ============
 
+    - |os_types|
+    - |python_versions|
+    - |pytorch_versions|
+    - |cuda_versions|
+
+.. caution::
+
+   `kaldifeat`_ depends on `PyTorch`_. `PyTorch`_ >= 1.5.0 is known to work.
+
+   Please first install `PyTorch`_ before you install `kaldifeat`_.
+
+.. hint::
+
+   To install a CPU version of `kaldifeat`_, please install a CPU version
+   of `PyTorch`_.
+
+   To install a CUDA version of `kaldifeat`_, please install a CUDA version
+   of `PyTorch`_. CUDA >= 10.1 is known to work.
+
 .. _from source:
 
 Install kaldifeat from source
 -----------------------------
 
-You have to install ``cmake`` and ``PyTorch`` first.
+You have to install ``cmake`` and `PyTorch`_ first.
 
   - ``cmake`` 3.11 is known to work. Other CMake versions may also work.
-  - ``PyTorch`` 1.8.1 is known to work. Other PyTorch versions may also work.
+  - `PyTorch`_ >= 1.5.0 is known to work. Other PyTorch versions may also work.
   - Python >= 3.6
 
 
-The commands to install ``kaldifeat`` from source are:
+The commands to install `kaldifeat`_ from source are:
 
 .. code-block:: bash
 
@@ -21,7 +40,7 @@ The commands to install ``kaldifeat`` from source are:
   cd kaldifeat
   python3 setup.py install
 
-To test that you have installed ``kaldifeat`` successfully, please run:
+To test that you have installed `kaldifeat`_ successfully, please run:
 
 .. code-block:: bash
 
@@ -29,26 +48,120 @@ To test that you have installed ``kaldifeat`` successfully, please run:
 
 It should print the version, e.g., ``1.0``.
 
+.. _from PyPI:
+
 Install kaldifeat from PyPI
 ---------------------------
 
-The pre-built ``kaldifeat`` hosted on PyPI uses PyTorch 1.8.1.
-If you install ``kaldifeat`` using pip, it will replace your locally
-installed PyTorch automatically with PyTorch 1.8.1.
-
-If you don't want this happen, please `Install kaldifeat from source`_.
-
-The command to install ``kaldifeat`` from PyPI is:
+The command to install `kaldifeat`_ from PyPI is:
 
 .. code-block:: bash
 
-  pip install kaldifeat
+  pip install --verbose kaldifeat
 
-
-To test that you have installed ``kaldifeat`` successfully, please run:
+To test that you have installed `kaldifeat`_ successfully, please run:
 
 .. code-block:: bash
 
   python3 -c "import kaldifeat; print(kaldifeat.__version__)"
 
 It should print the version, e.g., ``1.0``.
+
+Install kaldifeat from conda (Only for Linux)
+---------------------------------------------
+
+.. hint::
+
+   Installation using ``conda`` supports only Linux. For macOS and Windows,
+   please use either :ref:`from source` or :ref:`from PyPI`.
+
+The command to install `kaldifeat`_ using ``conda`` is:
+
+.. code-block:: bash
+
+  conda install -c kaldifeat -c pytorch -c conda-forge kaldifeat python=3.8 cudatoolkit=11.1 pytorch=1.8.1
+
+You can select the supported Python version, CUDA toolkit version and `PyTorch`_ version as you wish.
+
+To install a CPU version of `kaldifeat`_, use:
+
+.. code-block:: bash
+
+  conda install -c kaldifeat -c pytorch cpuonly kaldifeat python=3.8 pytorch=1.8.1
+
+.. caution::
+
+   If you encounter issues about missing GLIBC after installing `kaldifeat`_
+   with ``conda``, please consider :ref:`from source` or :ref:`from PyPI`.
+   The reason is that the package was built using Ubuntu 18.04 and your system's
+   GLIBC is older.
+
+
+.. |os_types| image:: ./images/os-green.svg
+  :alt: Supported operating systems
+
+.. |python_versions| image:: ./images/python_ge_3.6-blue.svg
+  :alt: Supported python versions
+
+.. |cuda_versions| image:: ./images/cuda_ge_10.1-orange.svg
+  :alt: Supported cuda versions
+
+.. |pytorch_versions| image:: ./images/pytorch_ge_1.5.0-green.svg
+  :alt: Supported pytorch versions
+
+To test that you have installed `kaldifeat`_ successfully, please run:
+
+.. code-block:: bash
+
+  python3 -c "import kaldifeat; print(kaldifeat.__version__)"
+
+It should print the version, e.g., ``1.0``.
+
+FAQs
+----
+
+How to install a CUDA version of kaldifeat
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+You need to first install a CUDA version of `PyTorch`_ and then install `kaldifeat`_.
+
+.. note::
+
+   You can use a CUDA version of `kaldifeat`_ on machines with no GPUs.
+
+How to install a CPU version of kaldifeat
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+You need to first install a CPU version of `PyTorch`_ and then install `kaldifeat`_.
+
+How to fix `Caffe2: Cannot find cuDNN library`
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code-block::
+
+  Your installed Caffe2 version uses cuDNN but I cannot find the cuDNN
+  libraries.  Please set the proper cuDNN prefixes and / or install cuDNN.
+
+You will have such an error when you want to install a CUDA version of `kaldifeat`_
+by ``pip install kaldifeat`` or from source.
+
+You need to first install cuDNN. Assume you have installed cuDNN to the
+path ``/path/to/cudnn``. You can fix the error by using **one** of the following
+commands.
+
+(1) Fix for installation using ``pip install``
+
+.. code-block:: bash
+
+    export KALDIFEAT_CMAKE_ARGS="-DCMAKE_BUILD_TYPE=Release -DCUDNN_LIBRARY_PATH=/path/to/cudnn/lib/libcudnn.so -DCUDNN_INCLUDE_PATH=/path/to/cudnn/include"
+    pip install --verbose kaldifeat
+
+(2) Fix for installation from source
+
+.. code-block:: bash
+
+    mkdir /some/path
+    git clone https://github.com/csukuangfj/kaldifeat.git
+    cd kaldifeat
+    export KALDIFEAT_CMAKE_ARGS="-DCMAKE_BUILD_TYPE=Release -DCUDNN_LIBRARY_PATH=/path/to/cudnn/lib/libcudnn.so -DCUDNN_INCLUDE_PATH=/path/to/cudnn/include"
+    python setup.py install
diff --git a/doc/source/intro.rst b/doc/source/intro.rst
new file mode 100644
index 0000000..6e66c36
--- /dev/null
+++ b/doc/source/intro.rst
@@ -0,0 +1,103 @@
+Introduction
+============
+
+`kaldifeat`_ implements
+speech feature extraction algorithms **compatible** with `Kaldi`_ using `PyTorch`_,
+supporting CUDA as well as autograd.
+
+`kaldifeat`_ has the following features:
+
+  - Fully compatible with `Kaldi`_
+
+    .. note::
+
+      The underlying C++ code is copied & modified from `Kaldi`_ directly.
+      It is rewritten with `PyTorch`_ C++ APIs.
+
+  - Provide not only ``C++ APIs`` but also ``Python APIs``
+
+    .. note::
+
+      You can access `kaldifeat`_ from ``Python``.
+
+  - Support autograd
+  - Support ``CUDA`` and ``CPU``
+
+    .. note::
+
+      You can use CUDA for feature extraction.
+
+  - Support ``online`` (i.e., ``streaming``) and ``offline`` (i.e., ``non-streaming``)
+    feature extraction
+  - Support chunk-based processing
+
+    .. note::
+
+      This is especially useful if you want to process audio that is several
+      hours long, which may cause OOM if you send it for computation at once.
+      With chunk-based processing, you can process audio of arbitrary length.
+
+  - Support batch processing
+
+    .. note::
+
+      With `kaldifeat`_ you can extract features for a batch of audios.
+
+
+.. see https://sublime-and-sphinx-guide.readthedocs.io/en/latest/tables.html
+
+Currently implemented speech features and their counterparts in `Kaldi`_ are
+listed in the following table.
+
+.. list-table:: Supported speech features
+   :widths: 50 50
+   :header-rows: 1
+
+   * - Supported speech features
+     - Counterpart in `Kaldi`_
+   * - `kaldifeat.Fbank`_
+     - `compute-fbank-feats`_
+   * - `kaldifeat.Mfcc`_
+     - `compute-mfcc-feats`_
+   * - `kaldifeat.Plp`_
+     - `compute-plp-feats`_
+   * - `kaldifeat.Spectrogram`_
+     - `compute-spectrogram-feats`_
+   * - `kaldifeat.OnlineFbank`_
+     - `kaldi::OnlineFbank`_
+   * - `kaldifeat.OnlineMfcc`_
+     - `kaldi::OnlineMfcc`_
+   * - `kaldifeat.OnlinePlp`_
+     - `kaldi::OnlinePlp`_
+
+Each feature computer needs an option. The following table lists the options
+for each computer and the corresponding options in `Kaldi`_.
+
+.. hint::
+
+   Note that we reuse the parameter names from `Kaldi`_.
+
+   Also, both online feature computers and offline feature computers share the
+   same option.
+
+.. list-table:: Feature computer options
+   :widths: 50 50
+   :header-rows: 1
+
+   * - Options in `kaldifeat`_
+     - Corresponding options in `Kaldi`_
+   * - `kaldifeat.FbankOptions`_
+     - `kaldi::FbankOptions`_
+   * - `kaldifeat.MfccOptions`_
+     - `kaldi::MfccOptions`_
+   * - `kaldifeat.PlpOptions`_
+     - `kaldi::PlpOptions`_
+   * - `kaldifeat.SpectrogramOptions`_
+     - `kaldi::SpectrogramOptions`_
+   * - `kaldifeat.FrameExtractionOptions`_
+     - `kaldi::FrameExtractionOptions`_
+   * - `kaldifeat.MelBanksOptions`_
+     - `kaldi::MelBanksOptions`_
+
+Read more to learn how to install `kaldifeat`_ and how to use each feature
+computer.
diff --git a/doc/source/usage.rst b/doc/source/usage.rst
deleted file mode 100644
index dd6a770..0000000
--- a/doc/source/usage.rst
+++ /dev/null
@@ -1,212 +0,0 @@
-Usage
-=====
-
-Let us first see the help message of kaldi's ``compute-fbank-feats``:
-
-.. code-block:: bash
-
-    $ compute-fbank-feats
-
-    Create Mel-filter bank (FBANK) feature files.
-    Usage:  compute-fbank-feats [options...] <wav-rspecifier> <feats-wspecifier>
-
-    Options:
-      --allow-downsample          : If true, allow the input waveform to have a higher frequency than the specified --sample-frequency (and we'll downsample). (bool, default = false)
-      --allow-upsample            : If true, allow the input waveform to have a lower frequency than the specified --sample-frequency (and we'll upsample). (bool, default = false)
-      --blackman-coeff            : Constant coefficient for generalized Blackman window. (float, default = 0.42)
-      --channel                   : Channel to extract (-1 -> expect mono, 0 -> left, 1 -> right) (int, default = -1)
-      --debug-mel                 : Print out debugging information for mel bin computation (bool, default = false)
-      --dither                    : Dithering constant (0.0 means no dither). If you turn this off, you should set the --energy-floor option, e.g. to 1.0 or 0.1 (float, default = 1)
-      --energy-floor              : Floor on energy (absolute, not relative) in FBANK computation. Only makes a difference if --use-energy=true; only necessary if --dither=0.0.  Suggested values: 0.1 or 1.0 (float, default = 0)
-      --frame-length              : Frame length in milliseconds (float, default = 25)
-      --frame-shift               : Frame shift in milliseconds (float, default = 10)
-      --high-freq                 : High cutoff frequency for mel bins (if <= 0, offset from Nyquist) (float, default = 0)
-      --htk-compat                : If true, put energy last.  Warning: not sufficient to get HTK compatible features (need to change other parameters). (bool, default = false)
-      --low-freq                  : Low cutoff frequency for mel bins (float, default = 20)
-      --max-feature-vectors       : Memory optimization. If larger than 0, periodically remove feature vectors so that only this number of the latest feature vectors is retained. (int, default = -1)
-      --min-duration              : Minimum duration of segments to process (in seconds). (float, default = 0)
-      --num-mel-bins              : Number of triangular mel-frequency bins (int, default = 23)
-      --output-format             : Format of the output files [kaldi, htk] (string, default = "kaldi")
-      --preemphasis-coefficient   : Coefficient for use in signal preemphasis (float, default = 0.97)
-      --raw-energy                : If true, compute energy before preemphasis and windowing (bool, default = true)
-      --remove-dc-offset          : Subtract mean from waveform on each frame (bool, default = true)
-      --round-to-power-of-two     : If true, round window size to power of two by zero-padding input to FFT. (bool, default = true)
-      --sample-frequency          : Waveform data sample frequency (must match the waveform file, if specified there) (float, default = 16000)
-      --snip-edges                : If true, end effects will be handled by outputting only frames that completely fit in the file, and the number of frames depends on the frame-length.  If false, the number of frames depends only on the frame-shift, and we reflect the data at the ends. (bool, default = true)
-      --subtract-mean             : Subtract mean of each feature file [CMS]; not recommended to do it this way.  (bool, default = false)
-      --use-energy                : Add an extra dimension with energy to the FBANK output. (bool, default = false)
-      --use-log-fbank             : If true, produce log-filterbank, else produce linear. (bool, default = true)
-      --use-power                 : If true, use power, else use magnitude. (bool, default = true)
-      --utt2spk                   : Utterance to speaker-id map (if doing VTLN and you have warps per speaker) (string, default = "")
-      --vtln-high                 : High inflection point in piecewise linear VTLN warping function (if negative, offset from high-mel-freq (float, default = -500)
-      --vtln-low                  : Low inflection point in piecewise linear VTLN warping function (float, default = 100)
-      --vtln-map                  : Map from utterance or speaker-id to vtln warp factor (rspecifier) (string, default = "")
-      --vtln-warp                 : Vtln warp factor (only applicable if vtln-map not specified) (float, default = 1)
-      --window-type               : Type of window ("hamming"|"hanning"|"povey"|"rectangular"|"sine"|"blackmann") (string, default = "povey")
-      --write-utt2dur             : Wspecifier to write duration of each utterance in seconds, e.g. 'ark,t:utt2dur'. (string, default = "")
-
-    Standard options:
-      --config                    : Configuration file to read (this option may be repeated) (string, default = "")
-      --help                      : Print out usage message (bool, default = false)
-      --print-args                : Print the command line arguments (to stderr) (bool, default = true)
-      --verbose                   : Verbose level (higher->more logging) (int, default = 0)
-
-FbankOptions
-------------
-
-``kaldifeat`` reuses the same options from kaldi's ``compute-fbank-feats``.
-
-The following shows the default values of ``kaldifeat.FbankOptions``:
-
-.. code-block:: python
-
-  >>> import kaldifeat
-  >>> fbank_opts = kaldifeat.FbankOptions()
-  >>> print(fbank_opts)
-  frame_opts:
-  samp_freq: 16000
-  frame_shift_ms: 10
-  frame_length_ms: 25
-  dither: 1
-  preemph_coeff: 0.97
-  remove_dc_offset: 1
-  window_type: povey
-  round_to_power_of_two: 1
-  blackman_coeff: 0.42
-  snip_edges: 1
-
-
-  mel_opts:
-  num_bins: 23
-  low_freq: 20
-  high_freq: 0
-  vtln_low: 100
-  vtln_high: -500
-  debug_mel: 0
-  htk_mode: 0
-
-  use_energy: 0
-  energy_floor: 0
-  raw_energy: 1
-  htk_compat: 0
-  use_log_fbank: 1
-  use_power: 1
-  device: cpu
-
-It consists of three parts:
-
-  - ``frame_opts``
-
-    Options in this part are accessed by ``frame_opts.xxx``. That is, to access
-    the sample rate, you use:
-
-      .. code-block:: python
-
-        >>> fbank_opts = kaldifeat.FbankOptions()
-        >>> print(fbank_opts.frame_opts.samp_freq)
-        16000.0
-
-  - ``mel_opts``
-
-    Options in this part are accessed by ``mel_opts.xxx``. That is, to access
-    the number of mel bins, you use:
-
-      .. code-block:: python
-
-        >>> fbank_opts = kaldifeat.FbankOptions()
-        >>> print(fbank_opts.mel_opts.num_bins)
-        23
-
-  - fbank related
-
-    Options in this part are accessed directly. That is, to access the device
-    field, you use:
-
-      .. code-block::
-
-        >>> print(fbank_opts.device)
-        cpu
-        >>> fbank_opts.device = 'cuda:0'
-        >>> print(fbank_opts.device)
-        cuda:0
-        >>> import torch
-        >>> fbank_opts.device = torch.device('cuda', 0)
-        >>> print(fbank_opts.device)
-        cuda:0
-
-
-
-To change the sample rate to 8000, you can use:
-
-.. code-block:: python
-
-  >>> fbank_opts = kaldifeat.FbankOptions()
-  >>> print(fbank_opts.frame_opts.samp_freq)
-  16000.0
-  >>> fbank_opts.frame_opts.samp_freq = 8000
-  >>> print(fbank_opts.frame_opts.samp_freq)
-  8000.0
-
-To change ``snip_edges`` to ``False``, you can use:
-
-.. code-block:: python
-
-  >>> fbank_opts.frame_opts.snip_edges = False
-  >>> print(fbank_opts.frame_opts.snip_edges)
-  False
-
-To change number of mel bins to 80, you can use:
-
-.. code-block:: python
-
-  >>> print(fbank_opts.mel_opts.num_bins)
-  23
-  >>> fbank_opts.mel_opts.num_bins = 80
-  >>> print(fbank_opts.mel_opts.num_bins)
-  80
-
-To change the device to ``cuda``, you can use:
-
-
-Fbank
------
-
-The following shows how to use ``kaldifeat.Fbank`` to compute
-the fbank features of sound files.
-
-First, let us generate two sound files using ``sox``:
-
-.. code-block:: bash
-
-  # generate a wav of two seconds, containing a sine-wave
-  # swept from 300 Hz to 3300 Hz
-  sox -n -r 16000 -b 16 test.wav synth 1.2 sine 300-3300
-
-  # another sound file with 0.5 seconds
-  sox -n -r 16000 -b 16 test2.wav synth 0.5 sine 300-3300
-
-.. hint::
-
-  You can find the above two files by visiting the following two links:
-
-    - `test.wav <https://github.com/csukuangfj/kaldifeat/blob/master/kaldifeat/python/tests/test_data/test.wav>`_
-    - `test2.wav <https://github.com/csukuangfj/kaldifeat/blob/master/kaldifeat/python/tests/test_data/test2.wav>`_
-
-The `following code <https://github.com/csukuangfj/kaldifeat/blob/master/kaldifeat/python/tests/test_fbank.py>`_
-shows the usage of ``kaldifeat.Fbank``.
-
-It shows:
-
-  - How to read a sound file. Note that audio samples are scaled to the range [-32768, 32768].
-    The intention is to produce the same output as kaldi. You don't need to scale it if
-    you don't care about the compatibility with kaldi
-
-  - ``kaldifeat.Fbank`` supports CUDA as well as CPU
-
-  - ``kaldifeat.Fbank`` supports processing sound file in a batch as well as accepting
-    a single sound file
-
-
-.. literalinclude:: ./code/test_fbank.py
-   :caption: Demo of ``kaldifeat.Fbank``
-   :language: python
diff --git a/doc/source/usage/code/compute-fbank-feats-help.txt b/doc/source/usage/code/compute-fbank-feats-help.txt
new file mode 100644
index 0000000..3922636
--- /dev/null
+++ b/doc/source/usage/code/compute-fbank-feats-help.txt
@@ -0,0 +1,46 @@
+compute-fbank-feats 
+
+Create Mel-filter bank (FBANK) feature files.
+Usage:  compute-fbank-feats [options...] <wav-rspecifier> <feats-wspecifier>
+
+Options:
+  --allow-downsample          : If true, allow the input waveform to have a higher frequency than the specified --sample-frequency (and we'll downsample). (bool, default = false)
+  --allow-upsample            : If true, allow the input waveform to have a lower frequency than the specified --sample-frequency (and we'll upsample). (bool, default = false)
+  --blackman-coeff            : Constant coefficient for generalized Blackman window. (float, default = 0.42)
+  --channel                   : Channel to extract (-1 -> expect mono, 0 -> left, 1 -> right) (int, default = -1)
+  --debug-mel                 : Print out debugging information for mel bin computation (bool, default = false)
+  --dither                    : Dithering constant (0.0 means no dither). If you turn this off, you should set the --energy-floor option, e.g. to 1.0 or 0.1 (float, default = 1)
+  --energy-floor              : Floor on energy (absolute, not relative) in FBANK computation. Only makes a difference if --use-energy=true; only necessary if --dither=0.0.  Suggested values: 0.1 or 1.0 (float, default = 0)
+  --frame-length              : Frame length in milliseconds (float, default = 25)
+  --frame-shift               : Frame shift in milliseconds (float, default = 10)
+  --high-freq                 : High cutoff frequency for mel bins (if <= 0, offset from Nyquist) (float, default = 0)
+  --htk-compat                : If true, put energy last.  Warning: not sufficient to get HTK compatible features (need to change other parameters). (bool, default = false)
+  --low-freq                  : Low cutoff frequency for mel bins (float, default = 20)
+  --max-feature-vectors       : Memory optimization. If larger than 0, periodically remove feature vectors so that only this number of the latest feature vectors is retained. (int, default = -1)
+  --min-duration              : Minimum duration of segments to process (in seconds). (float, default = 0)
+  --num-mel-bins              : Number of triangular mel-frequency bins (int, default = 23)
+  --output-format             : Format of the output files [kaldi, htk] (string, default = "kaldi")
+  --preemphasis-coefficient   : Coefficient for use in signal preemphasis (float, default = 0.97)
+  --raw-energy                : If true, compute energy before preemphasis and windowing (bool, default = true)
+  --remove-dc-offset          : Subtract mean from waveform on each frame (bool, default = true)
+  --round-to-power-of-two     : If true, round window size to power of two by zero-padding input to FFT. (bool, default = true)
+  --sample-frequency          : Waveform data sample frequency (must match the waveform file, if specified there) (float, default = 16000)
+  --snip-edges                : If true, end effects will be handled by outputting only frames that completely fit in the file, and the number of frames depends on the frame-length.  If false, the number of frames depends only on the frame-shift, and we reflect the data at the ends. (bool, default = true)
+  --subtract-mean             : Subtract mean of each feature file [CMS]; not recommended to do it this way.  (bool, default = false)
+  --use-energy                : Add an extra dimension with energy to the FBANK output. (bool, default = false)
+  --use-log-fbank             : If true, produce log-filterbank, else produce linear. (bool, default = true)
+  --use-power                 : If true, use power, else use magnitude. (bool, default = true)
+  --utt2spk                   : Utterance to speaker-id map (if doing VTLN and you have warps per speaker) (string, default = "")
+  --vtln-high                 : High inflection point in piecewise linear VTLN warping function (if negative, offset from high-mel-freq (float, default = -500)
+  --vtln-low                  : Low inflection point in piecewise linear VTLN warping function (float, default = 100)
+  --vtln-map                  : Map from utterance or speaker-id to vtln warp factor (rspecifier) (string, default = "")
+  --vtln-warp                 : Vtln warp factor (only applicable if vtln-map not specified) (float, default = 1)
+  --window-type               : Type of window ("hamming"|"hanning"|"povey"|"rectangular"|"sine"|"blackmann") (string, default = "povey")
+  --write-utt2dur             : Wspecifier to write duration of each utterance in seconds, e.g. 'ark,t:utt2dur'. (string, default = "")
+
+Standard options:
+  --config                    : Configuration file to read (this option may be repeated) (string, default = "")
+  --help                      : Print out usage message (bool, default = false)
+  --print-args                : Print the command line arguments (to stderr) (bool, default = true)
+  --verbose                   : Verbose level (higher->more logging) (int, default = 0)
+
diff --git a/doc/source/usage/code/fbank_options-1.txt b/doc/source/usage/code/fbank_options-1.txt
new file mode 100644
index 0000000..7e0470a
--- /dev/null
+++ b/doc/source/usage/code/fbank_options-1.txt
@@ -0,0 +1,65 @@
+$ python3
+Python 3.8.0 (default, Oct 28 2019, 16:14:01)
+[GCC 8.3.0] on linux
+Type "help", "copyright", "credits" or "license" for more information.
+>>> import kaldifeat
+>>> opts = kaldifeat.FbankOptions()
+>>> print(opts)
+frame_opts:
+samp_freq: 16000
+frame_shift_ms: 10
+frame_length_ms: 25
+dither: 1
+preemph_coeff: 0.97
+remove_dc_offset: 1
+window_type: povey
+round_to_power_of_two: 1
+blackman_coeff: 0.42
+snip_edges: 1
+max_feature_vectors: -1
+
+
+mel_opts:
+num_bins: 23
+low_freq: 20
+high_freq: 0
+vtln_low: 100
+vtln_high: -500
+debug_mel: 0
+htk_mode: 0
+
+use_energy: 0
+energy_floor: 0
+raw_energy: 1
+htk_compat: 0
+use_log_fbank: 1
+use_power: 1
+device: cpu
+
+>>> print(opts.dither)
+Traceback (most recent call last):
+  File "<stdin>", line 1, in <module>
+AttributeError: '_kaldifeat.FbankOptions' object has no attribute 'dither'
+>>>
+>>> print(opts.frame_opts.dither)
+1.0
+>>> opts.frame_opts.dither = 0 # disable dither
+>>> print(opts.frame_opts.dither)
+0.0
+>>> import torch
+>>> print(opts.device)
+cpu
+>>> opts.device = 'cuda:0'
+>>> print(opts.device)
+cuda:0
+>>> opts.device = torch.device('cuda', 1)
+>>> print(opts.device)
+cuda:1
+>>> opts.device = 'cpu'
+>>> print(opts.device)
+cpu
+>>> print(opts.mel_opts.num_bins)
+23
+>>> opts.mel_opts.num_bins = 80
+>>> print(opts.mel_opts.num_bins)
+80
diff --git a/doc/source/usage/code/test_fbank_options.py b/doc/source/usage/code/test_fbank_options.py
new file mode 120000
index 0000000..3bfe0fa
--- /dev/null
+++ b/doc/source/usage/code/test_fbank_options.py
@@ -0,0 +1 @@
+../../../../kaldifeat/python/tests/test_fbank_options.py
\ No newline at end of file
diff --git a/doc/source/usage/fbank.rst b/doc/source/usage/fbank.rst
new file mode 100644
index 0000000..e3f1351
--- /dev/null
+++ b/doc/source/usage/fbank.rst
@@ -0,0 +1,3 @@
+kaldifeat.Fbank
+===============
+
diff --git a/doc/source/usage/fbank_options.rst b/doc/source/usage/fbank_options.rst
new file mode 100644
index 0000000..d9adc2d
--- /dev/null
+++ b/doc/source/usage/fbank_options.rst
@@ -0,0 +1,52 @@
+kaldifeat.FbankOptions
+======================
+
+If you want to construct an instance of `kaldifeat.Fbank`_ or
+`kaldifeat.OnlineFbank`_, you have to provide an instance of
+`kaldifeat.FbankOptions`_.
+
+The following code shows how to construct an instance of `kaldifeat.FbankOptions`_.
+
+.. literalinclude:: ./code/fbank_options-1.txt
+   :caption: Usage of `kaldifeat.FbankOptions`_
+   :emphasize-lines: 6,8,22,37
+   :language: python
+
+Note that we reuse the same option names as `compute-fbank-feats`_ from `Kaldi`_:
+
+.. code-block:: bash
+
+   $ compute-fbank-feats --help
+
+
+.. literalinclude:: ./code/compute-fbank-feats-help.txt
+   :caption: Output of ``compute-fbank-feats --help``
+
+Please refer to the output of ``compute-fbank-feats --help`` for the meaning
+of each field of `kaldifeat.FbankOptions`_.
+
+One thing worth noting is that `kaldifeat.FbankOptions`_ has a field ``device``,
+which is an instance of ``torch.device``. You can assign it either a string, e.g.,
+``"cpu"`` or ``"cuda:0"``, or an instance of ``torch.device``, e.g., ``torch.device("cpu")`` or
+``torch.device("cuda", 1)``.
+
+.. hint::
+
+   You can use this field to control whether the feature computer
+   constructed from it performs computation on CPU or CUDA.
+
+.. caution::
+
+   If you use a CUDA device, make sure that you have installed a CUDA version
+   of `PyTorch`_.
+
+Example usage
+-------------
+
+The following code from
+`<https://github.com/csukuangfj/kaldifeat/blob/master/kaldifeat/python/tests/test_fbank_options.py>`_
+demonstrates the usage of `kaldifeat.FbankOptions`_:
+
+.. literalinclude:: ./code/test_fbank_options.py
+   :caption: Example usage of `kaldifeat.FbankOptions`_
+   :language: python
diff --git a/doc/source/usage/index.rst b/doc/source/usage/index.rst
new file mode 100644
index 0000000..f40dcd5
--- /dev/null
+++ b/doc/source/usage/index.rst
@@ -0,0 +1,11 @@
+Usage
+=====
+
+This section describes how to use feature computers in `kaldifeat`_.
+
+.. toctree::
+   :maxdepth: 2
+
+   fbank_options
+   fbank
+   online_fbank
diff --git a/doc/source/usage/online_fbank.rst b/doc/source/usage/online_fbank.rst
new file mode 100644
index 0000000..557104d
--- /dev/null
+++ b/doc/source/usage/online_fbank.rst
@@ -0,0 +1,3 @@
+kaldifeat.OnlineFbank
+=====================
+
diff --git a/kaldifeat/csrc/CMakeLists.txt b/kaldifeat/csrc/CMakeLists.txt
index 39f2c1c..9900b96 100644
--- a/kaldifeat/csrc/CMakeLists.txt
+++ b/kaldifeat/csrc/CMakeLists.txt
@@ -31,9 +31,15 @@ function(kaldifeat_add_test source)
       gtest_main
   )
 
+  # NOTE: We set the working directory here so that
+  # the tests also work on Windows. The reason is that
+  # the required DLLs are inside ${TORCH_DIR}/lib
+  # and they can be found by the exe if the current
+  # working directory is ${TORCH_DIR}/lib
   add_test(NAME "Test.${name}"
     COMMAND
     $<TARGET_FILE:${name}>
+    WORKING_DIRECTORY ${TORCH_DIR}/lib
   )
 endfunction()
 
@@ -47,4 +53,5 @@ if(kaldifeat_BUILD_TESTS)
   foreach(source IN LISTS test_srcs)
     kaldifeat_add_test(${source})
   endforeach()
+
 endif()
diff --git a/kaldifeat/python/csrc/CMakeLists.txt b/kaldifeat/python/csrc/CMakeLists.txt
index c80637c..1403e6d 100644
--- a/kaldifeat/python/csrc/CMakeLists.txt
+++ b/kaldifeat/python/csrc/CMakeLists.txt
@@ -10,6 +10,17 @@ pybind11_add_module(_kaldifeat
   online-feature.cc
   utils.cc
 )
+
+if(APPLE)
+  execute_process(
+    COMMAND "${PYTHON_EXECUTABLE}" -c "from distutils.sysconfig import get_python_lib; print(get_python_lib())"
+    OUTPUT_STRIP_TRAILING_WHITESPACE
+    OUTPUT_VARIABLE PYTHON_SITE_PACKAGE_DIR
+  )
+  message(STATUS "PYTHON_SITE_PACKAGE_DIR: ${PYTHON_SITE_PACKAGE_DIR}")
+  target_link_libraries(_kaldifeat PRIVATE "-Wl,-rpath,${PYTHON_SITE_PACKAGE_DIR}")
+endif()
+
 target_link_libraries(_kaldifeat PRIVATE kaldifeat_core)
 if(UNIX AND NOT APPLE)
   target_link_libraries(_kaldifeat PUBLIC ${TORCH_DIR}/lib/libtorch_python.so)
diff --git a/kaldifeat/python/csrc/feature-fbank.cc b/kaldifeat/python/csrc/feature-fbank.cc
index dcc9b14..6e52f0c 100644
--- a/kaldifeat/python/csrc/feature-fbank.cc
+++ b/kaldifeat/python/csrc/feature-fbank.cc
@@ -53,7 +53,7 @@ static void PybindFbank(py::module &m) {
       .def("dim", &PyClass::Dim)
       .def_property_readonly("options", &PyClass::GetOptions)
       .def("compute_features", &PyClass::ComputeFeatures, py::arg("wave"),
-           py::arg("vtln_warp"))
+           py::arg("vtln_warp"), py::call_guard<py::gil_scoped_release>())
       .def(py::pickle(
           [](const PyClass &self) -> py::dict {
             return AsDict(self.GetOptions());
diff --git a/kaldifeat/python/csrc/feature-mfcc.cc b/kaldifeat/python/csrc/feature-mfcc.cc
index 40d330e..fe893cb 100644
--- a/kaldifeat/python/csrc/feature-mfcc.cc
+++ b/kaldifeat/python/csrc/feature-mfcc.cc
@@ -53,7 +53,7 @@ static void PybindMfcc(py::module &m) {
       .def("dim", &PyClass::Dim)
       .def_property_readonly("options", &PyClass::GetOptions)
       .def("compute_features", &PyClass::ComputeFeatures, py::arg("wave"),
-           py::arg("vtln_warp"))
+           py::arg("vtln_warp"), py::call_guard<py::gil_scoped_release>())
       .def(py::pickle(
           [](const PyClass &self) -> py::dict {
             return AsDict(self.GetOptions());
diff --git a/kaldifeat/python/csrc/feature-plp.cc b/kaldifeat/python/csrc/feature-plp.cc
index abc5595..364ef93 100644
--- a/kaldifeat/python/csrc/feature-plp.cc
+++ b/kaldifeat/python/csrc/feature-plp.cc
@@ -56,7 +56,7 @@ static void PybindPlp(py::module &m) {
       .def("dim", &PyClass::Dim)
       .def_property_readonly("options", &PyClass::GetOptions)
       .def("compute_features", &PyClass::ComputeFeatures, py::arg("wave"),
-           py::arg("vtln_warp"))
+           py::arg("vtln_warp"), py::call_guard<py::gil_scoped_release>())
       .def(py::pickle(
           [](const PyClass &self) -> py::dict {
             return AsDict(self.GetOptions());
diff --git a/kaldifeat/python/csrc/feature-spectrogram.cc b/kaldifeat/python/csrc/feature-spectrogram.cc
index 62aa909..24b156b 100644
--- a/kaldifeat/python/csrc/feature-spectrogram.cc
+++ b/kaldifeat/python/csrc/feature-spectrogram.cc
@@ -53,7 +53,7 @@ static void PybindSpectrogram(py::module &m) {
       .def("dim", &PyClass::Dim)
       .def_property_readonly("options", &PyClass::GetOptions)
       .def("compute_features", &PyClass::ComputeFeatures, py::arg("wave"),
-           py::arg("vtln_warp"))
+           py::arg("vtln_warp"), py::call_guard<py::gil_scoped_release>())
       .def(py::pickle(
           [](const PyClass &self) -> py::dict {
             return AsDict(self.GetOptions());
diff --git a/kaldifeat/python/csrc/online-feature.cc b/kaldifeat/python/csrc/online-feature.cc
index 13e4a4f..2d1dcd8 100644
--- a/kaldifeat/python/csrc/online-feature.cc
+++ b/kaldifeat/python/csrc/online-feature.cc
@@ -22,9 +22,11 @@ void PybindOnlineFeatureTpl(py::module &m, const std::string &class_name,
       .def_property_readonly("num_frames_ready", &PyClass::NumFramesReady)
       .def("is_last_frame", &PyClass::IsLastFrame, py::arg("frame"))
       .def("get_frame", &PyClass::GetFrame, py::arg("frame"))
-      .def("get_frames", &PyClass::GetFrames, py::arg("frames"))
+      .def("get_frames", &PyClass::GetFrames, py::arg("frames"),
+           py::call_guard<py::gil_scoped_release>())
       .def("accept_waveform", &PyClass::AcceptWaveform,
-           py::arg("sampling_rate"), py::arg("waveform"))
+           py::arg("sampling_rate"), py::arg("waveform"),
+           py::call_guard<py::gil_scoped_release>())
       .def("input_finished", &PyClass::InputFinished);
 }
 
diff --git a/kaldifeat/python/kaldifeat/__init__.py b/kaldifeat/python/kaldifeat/__init__.py
index ea39003..adf7d79 100644
--- a/kaldifeat/python/kaldifeat/__init__.py
+++ b/kaldifeat/python/kaldifeat/__init__.py
@@ -1,4 +1,13 @@
 import torch
+
+from .torch_version import kaldifeat_torch_version
+
+if torch.__version__.split("+")[0] != kaldifeat_torch_version.split("+")[0]:
+    raise ImportError(
+        f"kaldifeat was built using PyTorch {kaldifeat_torch_version}\n"
+        f"But you are using PyTorch {torch.__version__} to run it"
+    )
+
 from _kaldifeat import (
     FbankOptions,
     FrameExtractionOptions,
diff --git a/kaldifeat/python/kaldifeat/torch_version.py.in b/kaldifeat/python/kaldifeat/torch_version.py.in
new file mode 100644
index 0000000..e6365fa
--- /dev/null
+++ b/kaldifeat/python/kaldifeat/torch_version.py.in
@@ -0,0 +1,12 @@
+# Auto generated by the toplevel CMakeLists.txt.
+#
+# DO NOT EDIT.
+
+# The torch version used to build kaldifeat. We will check it against the
+# torch version that is used to run kaldifeat. If they are not the same,
+# `import kaldifeat` will throw.
+#
+# Some example values are:
+# - 1.10.0+cu102
+# - 1.5.0+cpu
+kaldifeat_torch_version = "@TORCH_VERSION@"
diff --git a/kaldifeat/python/tests/Makefile b/kaldifeat/python/tests/Makefile
new file mode 100644
index 0000000..20f98ff
--- /dev/null
+++ b/kaldifeat/python/tests/Makefile
@@ -0,0 +1,13 @@
+
+.PHONY: test
+test:
+	python3 ./test_fbank.py
+	python3 ./test_fbank_options.py
+	python3 ./test_frame_extraction_options.py
+	python3 ./test_mel_bank_options.py
+	python3 ./test_mfcc.py
+	python3 ./test_mfcc_options.py
+	python3 ./test_plp.py
+	python3 ./test_plp_options.py
+	python3 ./test_spectrogram.py
+	python3 ./test_spectrogram_options.py
diff --git a/scripts/build_conda_cpu.sh b/scripts/build_conda_cpu.sh
new file mode 100755
index 0000000..3d2c47a
--- /dev/null
+++ b/scripts/build_conda_cpu.sh
@@ -0,0 +1,81 @@
+#!/usr/bin/env bash
+#
+# Copyright      2021  Xiaomi Corp.       (author: Fangjun Kuang)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# The following environment variables are supposed to be set by users
+#
+# - KALDIFEAT_TORCH_VERSION
+#     The PyTorch version. Example:
+#
+#       export KALDIFEAT_TORCH_VERSION=1.7.1
+#
+#     Defaults to 1.7.1 if not set.
+#
+# - KALDIFEAT_CONDA_TOKEN
+#     If not set, auto upload to anaconda.org is disabled.
+#
+#     Its value is from https://anaconda.org/kaldifeat/settings/access
+#      (You need to login as user kaldifeat to see its value)
+#
+# - KALDIFEAT_BUILD_TYPE
+#     If not set, defaults to Release.
+
+set -e
+export CONDA_BUILD=1
+
+cur_dir=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+kaldifeat_dir=$(cd "$cur_dir/.." && pwd)
+
+cd "$kaldifeat_dir"
+
+export KALDIFEAT_ROOT_DIR=$kaldifeat_dir
+echo "KALDIFEAT_ROOT_DIR: $KALDIFEAT_ROOT_DIR"
+# Build "major.minor" from version_info; sys.version[:3] turns 3.10 into "3.1".
+KALDIFEAT_PYTHON_VERSION=$(python3 -c "import sys; print('%d.%d' % sys.version_info[:2])")
+
+if [ -z "$KALDIFEAT_TORCH_VERSION" ]; then
+  echo "env var KALDIFEAT_TORCH_VERSION is not set, defaults to 1.7.1"
+  KALDIFEAT_TORCH_VERSION=1.7.1
+fi
+
+if [ -z "$KALDIFEAT_BUILD_TYPE" ]; then
+  echo "env var KALDIFEAT_BUILD_TYPE is not set, defaults to Release"
+  KALDIFEAT_BUILD_TYPE=Release
+fi
+
+export KALDIFEAT_IS_FOR_CONDA=1
+
+# Example value: 3.8
+export KALDIFEAT_PYTHON_VERSION
+
+# Example value: 1.7.1
+export KALDIFEAT_TORCH_VERSION
+
+export KALDIFEAT_BUILD_TYPE
+
+if [ -n "$KALDIFEAT_IS_GITHUB_ACTIONS" ]; then
+  export KALDIFEAT_IS_GITHUB_ACTIONS
+  conda remove -q pytorch
+  conda clean -q -a
+else
+  export KALDIFEAT_IS_GITHUB_ACTIONS=0
+fi
+
+if [ -z "$KALDIFEAT_CONDA_TOKEN" ]; then
+  echo "Auto upload to anaconda.org is disabled since KALDIFEAT_CONDA_TOKEN is not set"
+  conda build --no-test --no-anaconda-upload -c pytorch -c conda-forge ./scripts/conda-cpu/kaldifeat
+else
+  conda build --no-test -c pytorch -c conda-forge --token "$KALDIFEAT_CONDA_TOKEN" ./scripts/conda-cpu/kaldifeat
+fi
diff --git a/scripts/conda-cpu/cpuonly/meta.yaml b/scripts/conda-cpu/cpuonly/meta.yaml
new file mode 100644
index 0000000..33ec762
--- /dev/null
+++ b/scripts/conda-cpu/cpuonly/meta.yaml
@@ -0,0 +1,10 @@
+# this file is copied from
+# https://github.com/pytorch/builder/tree/master/conda/cpuonly
+package:
+  name: cpuonly
+  version: 1.0
+
+build:
+  track_features:
+      - cpuonly
+  noarch: generic
diff --git a/scripts/conda-cpu/kaldifeat/build.sh b/scripts/conda-cpu/kaldifeat/build.sh
new file mode 100644
index 0000000..6e24b9d
--- /dev/null
+++ b/scripts/conda-cpu/kaldifeat/build.sh
@@ -0,0 +1,33 @@
+#!/usr/bin/env bash
+#
+# Copyright      2021  Xiaomi Corp.       (author: Fangjun Kuang)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -ex
+
+CONDA_ENV_DIR=$CONDA_PREFIX
+
+echo "KALDIFEAT_PYTHON_VERSION: $KALDIFEAT_PYTHON_VERSION"
+echo "KALDIFEAT_TORCH_VERSION: $KALDIFEAT_TORCH_VERSION"
+echo "KALDIFEAT_BUILD_TYPE: $KALDIFEAT_BUILD_TYPE"
+echo "KALDIFEAT_BUILD_VERSION: $KALDIFEAT_BUILD_VERSION"
+
+export KALDIFEAT_CMAKE_ARGS="-DCMAKE_BUILD_TYPE=${KALDIFEAT_BUILD_TYPE}"
+export KALDIFEAT_MAKE_ARGS="-j1 VERBOSE=1"
+
+export LIBRARY_PATH="/usr/local/miniconda/envs/kaldifeat/lib":$LIBRARY_PATH
+export LD_LIBRARY_PATH="/usr/local/miniconda/envs/kaldifeat/lib":$LD_LIBRARY_PATH
+export DYLD_LIBRARY_PATH="/usr/local/miniconda/envs/kaldifeat/lib":$DYLD_LIBRARY_PATH
+
+python3 setup.py install --single-version-externally-managed --record=record.txt
diff --git a/scripts/conda-cpu/kaldifeat/meta.yaml b/scripts/conda-cpu/kaldifeat/meta.yaml
new file mode 100644
index 0000000..08fe5e4
--- /dev/null
+++ b/scripts/conda-cpu/kaldifeat/meta.yaml
@@ -0,0 +1,44 @@
+package:
+  name: kaldifeat
+  version: "1.16"
+
+source:
+  path: "{{ environ.get('KALDIFEAT_ROOT_DIR') }}"
+
+build:
+  number: 0
+  string: cpu_py{{ environ.get('KALDIFEAT_PYTHON_VERSION') }}_torch{{ environ.get('KALDIFEAT_TORCH_VERSION') }}
+  script_env:
+    - KALDIFEAT_IS_GITHUB_ACTIONS
+    - KALDIFEAT_TORCH_VERSION
+    - KALDIFEAT_PYTHON_VERSION
+    - KALDIFEAT_BUILD_TYPE
+    - KALDIFEAT_BUILD_VERSION
+    - KALDIFEAT_IS_FOR_CONDA
+
+requirements:
+  build:
+    - {{ compiler('c') }} # [win]
+    - {{ compiler('cxx') }} # [win]
+
+  host:
+    - cmake=3.18
+    - python
+    - pytorch={{ environ.get('KALDIFEAT_TORCH_VERSION') }}
+    - gcc_linux-64=7 # [linux]
+    - cpuonly
+    - numpy
+
+  run:
+    - python
+    - pytorch={{ environ.get('KALDIFEAT_TORCH_VERSION') }}
+    - numpy
+
+about:
+  home: https://github.com/csukuangfj/kaldifeat
+  license: Apache V2
+  license_file: LICENSE
+  summary: Kaldi-compatible feature extraction with PyTorch
+  description: |
+    Kaldi-compatible feature extraction with PyTorch,
+    supporting CUDA, batch processing, chunk processing, and autograd
diff --git a/scripts/conda/kaldifeat/build.sh b/scripts/conda/kaldifeat/build.sh
index 3897511..4539872 100644
--- a/scripts/conda/kaldifeat/build.sh
+++ b/scripts/conda/kaldifeat/build.sh
@@ -32,6 +32,6 @@ echo "gcc version: $($CC --version)"
 echo "nvcc version: $(nvcc --version)"
 
 export KALDIFEAT_CMAKE_ARGS="-DCMAKE_BUILD_TYPE=${KALDIFEAT_BUILD_TYPE}"
-export KALDIFEAT_MAKE_ARGS="-j2"
+export KALDIFEAT_MAKE_ARGS="-j3"
 
 python3 setup.py install --single-version-externally-managed --record=record.txt
diff --git a/scripts/conda/kaldifeat/meta.yaml b/scripts/conda/kaldifeat/meta.yaml
index 6a1b485..4979060 100644
--- a/scripts/conda/kaldifeat/meta.yaml
+++ b/scripts/conda/kaldifeat/meta.yaml
@@ -1,6 +1,6 @@
 package:
   name: kaldifeat
-  version: "1.14"
+  version: "1.16"
 
 source:
   path: "{{ environ.get('KALDIFEAT_ROOT_DIR') }}"
diff --git a/scripts/github_actions/generate_build_matrix.py b/scripts/github_actions/generate_build_matrix.py
new file mode 100755
index 0000000..6b85131
--- /dev/null
+++ b/scripts/github_actions/generate_build_matrix.py
@@ -0,0 +1,145 @@
+#!/usr/bin/env python3
+# Copyright    2022  Xiaomi Corp.        (authors: Fangjun Kuang)
+
+import argparse
+import json
+
+
+def get_args():
+    """Parse the command-line flags that select which build matrix to emit."""
+    # All flags are opt-in switches: ``store_true`` already defaults to False.
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--enable-cuda",
+        action="store_true",
+        help="True to enable CUDA",
+    )
+
+    parser.add_argument(
+        "--for-windows",
+        action="store_true",
+        help="True for windows",
+    )
+
+    parser.add_argument(
+        "--test-only-latest-torch",
+        action="store_true",
+        help="""If True, we test only the latest PyTorch
+        to reduce CI running time.""",
+    )
+    # Returns the argparse.Namespace parsed from sys.argv.
+    return parser.parse_args()
+
+
+def generate_build_matrix(enable_cuda, for_windows, test_only_latest_torch):
+    """Print the CI build matrix as a JSON ``{"include": [...]}`` object.
+
+    enable_cuda adds a "cuda" key to every entry, for_windows selects the
+    Windows CUDA versions, test_only_latest_torch keeps only torch 1.11.0."""
+    matrix = {
+        # 1.5.x is removed because there are compilation errors.
+        #  See
+        #  https://github.com/csukuangfj/k2/runs/2533830771?check_suite_focus=true
+        #  and
+        #  https://github.com/NVIDIA/apex/issues/805
+        #  "1.5.0": {
+        #      "python-version": ["3.6", "3.7", "3.8"],
+        #      "cuda": ["10.1", "10.2"],
+        #  },
+        #  "1.5.1": {
+        #      "python-version": ["3.6", "3.7", "3.8"],
+        #      "cuda": ["10.1", "10.2"],
+        #  },
+        "1.6.0": {
+            "python-version": ["3.6", "3.7", "3.8"],
+            "cuda": ["10.1", "10.2"]
+            if not for_windows
+            else ["10.1.243", "10.2.89"],
+        },
+        "1.7.0": {
+            "python-version": ["3.6", "3.7", "3.8"],
+            "cuda": ["10.1", "10.2", "11.0"]
+            if not for_windows
+            else ["10.1.243", "10.2.89", "11.0.3"],
+        },
+        "1.7.1": {
+            "python-version": ["3.6", "3.7", "3.8", "3.9"],
+            "cuda": ["10.1", "10.2", "11.0"]
+            if not for_windows
+            else ["10.1.243", "10.2.89", "11.0.3"],
+        },
+        "1.8.0": {
+            "python-version": ["3.6", "3.7", "3.8", "3.9"],
+            "cuda": ["10.1", "10.2", "11.1"]
+            if not for_windows
+            else ["10.1.243", "10.2.89", "11.1.1"],
+        },
+        "1.8.1": {
+            "python-version": ["3.6", "3.7", "3.8", "3.9"],
+            "cuda": ["10.1", "10.2", "11.1"]
+            if not for_windows
+            else ["10.1.243", "10.2.89", "11.1.1"],
+        },
+        "1.9.0": {
+            "python-version": ["3.6", "3.7", "3.8", "3.9"],
+            "cuda": ["10.2", "11.1"]
+            if not for_windows
+            else ["10.2.89", "11.1.1"],
+        },
+        "1.9.1": {
+            "python-version": ["3.6", "3.7", "3.8", "3.9"],
+            "cuda": ["10.2", "11.1"]
+            if not for_windows
+            else ["10.2.89", "11.1.1"],
+        },
+        "1.10.0": {
+            "python-version": ["3.6", "3.7", "3.8", "3.9"],
+            "cuda": ["10.2", "11.1", "11.3"]
+            if not for_windows
+            else ["10.2.89", "11.1.1", "11.3.1"],
+        },
+        "1.10.1": {
+            "python-version": ["3.6", "3.7", "3.8", "3.9"],
+            "cuda": ["10.2", "11.1", "11.3"]
+            if not for_windows
+            else ["10.2.89", "11.1.1", "11.3.1"],
+        },
+        "1.10.2": {
+            "python-version": ["3.6", "3.7", "3.8", "3.9"],
+            "cuda": ["10.2", "11.1", "11.3"]
+            if not for_windows
+            else ["10.2.89", "11.1.1", "11.3.1"],
+        },
+        "1.11.0": {
+            "python-version": ["3.7", "3.8", "3.9", "3.10"],
+            "cuda": ["10.2", "11.3", "11.5"]
+            if not for_windows
+            else ["11.3.1", "11.5.2"],
+        },
+    }
+    if test_only_latest_torch:
+        matrix = {"1.11.0": matrix["1.11.0"]}
+
+    entries = []
+    for torch, spec in matrix.items():
+        for py in spec["python-version"]:
+            base = {"torch": torch, "python-version": py}
+            if enable_cuda:
+                entries += [dict(base, cuda=c) for c in spec["cuda"]]
+            else:
+                entries.append(base)
+
+    print(json.dumps({"include": entries}))
+
+
+def main():
+    """Parse the CLI flags and print the matching build matrix as JSON."""
+    cli = get_args()
+    # Positional order follows generate_build_matrix's signature.
+    generate_build_matrix(
+        cli.enable_cuda, cli.for_windows, cli.test_only_latest_torch
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/github_actions/install_cuda.sh b/scripts/github_actions/install_cuda.sh
index 7d023b9..b84de89 100755
--- a/scripts/github_actions/install_cuda.sh
+++ b/scripts/github_actions/install_cuda.sh
@@ -36,6 +36,13 @@ case "$cuda" in
     # url=https://developer.download.nvidia.com/compute/cuda/11.1.0/local_installers/cuda_11.1.0_455.23.05_linux.run
     url=https://developer.download.nvidia.com/compute/cuda/11.1.1/local_installers/cuda_11.1.1_455.32.00_linux.run
     ;;
+  11.3)
+    # url=https://developer.download.nvidia.com/compute/cuda/11.3.0/local_installers/cuda_11.3.0_465.19.01_linux.run
+    url=https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.19.01_linux.run
+    ;;
+  11.5)
+    url=https://developer.download.nvidia.com/compute/cuda/11.5.2/local_installers/cuda_11.5.2_495.29.05_linux.run
+    ;;
   *)
     echo "Unknown cuda version: $cuda"
     exit 1
diff --git a/scripts/github_actions/install_cudnn.sh b/scripts/github_actions/install_cudnn.sh
index 853eba5..8feafbe 100755
--- a/scripts/github_actions/install_cudnn.sh
+++ b/scripts/github_actions/install_cudnn.sh
@@ -17,42 +17,43 @@
 case $cuda in
   10.0)
     filename=cudnn-10.0-linux-x64-v7.6.5.32.tgz
-    url=http://www.mediafire.com/file/1037lb1vmj9qdtq/cudnn-10.0-linux-x64-v7.6.5.32.tgz/file
     ;;
   10.1)
     filename=cudnn-10.1-linux-x64-v8.0.2.39.tgz
-    url=http://www.mediafire.com/file/fnl2wg0h757qhd7/cudnn-10.1-linux-x64-v8.0.2.39.tgz/file
     ;;
   10.2)
     filename=cudnn-10.2-linux-x64-v8.0.2.39.tgz
-    url=http://www.mediafire.com/file/sc2nvbtyg0f7ien/cudnn-10.2-linux-x64-v8.0.2.39.tgz/file
     ;;
   11.0)
     filename=cudnn-11.0-linux-x64-v8.0.5.39.tgz
-    url=https://www.mediafire.com/file/abyhnls106ko9kp/cudnn-11.0-linux-x64-v8.0.5.39.tgz/file
     ;;
   11.1)
-    filename=cudnn-11.1-linux-x64-v8.0.5.39.tgz
-    url=https://www.mediafire.com/file/qx55zd65773xonv/cudnn-11.1-linux-x64-v8.0.5.39.tgz/file
+    filename=cudnn-11.1-linux-x64-v8.0.4.30.tgz
     ;;
+  11.3)
+    filename=cudnn-11.3-linux-x64-v8.2.0.53.tgz
+    ;;
+  11.5)
+    filename=cudnn-11.3-linux-x64-v8.2.0.53.tgz
+    ;;
+  # 11.5)
+  #   filename=cudnn-linux-x86_64-8.3.2.44_cuda11.5-archive.tar.xz
+  #   ;;
   *)
     echo "Unsupported cuda version: $cuda"
     exit 1
     ;;
 esac
 
-function retry() {
-  $* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*)
-}
+command -v git-lfs >/dev/null 2>&1 || { echo >&2 "\nPlease install 'git-lfs' first."; exit 2; }
 
-# It is forked from https://github.com/Juvenal-Yescas/mediafire-dl
-# https://github.com/Juvenal-Yescas/mediafire-dl/pull/2 changes the filename and breaks the CI.
-# We use a separate fork to keep the link fixed.
-retry wget https://raw.githubusercontent.com/csukuangfj/mediafire-dl/master/mediafire_dl.py
+git clone https://huggingface.co/csukuangfj/cudnn
+cd cudnn
+git lfs pull --include="$filename"
 
-sed -i 's/quiet=False/quiet=True/' mediafire_dl.py
-retry python3 mediafire_dl.py "$url"
-sudo tar xf ./$filename -C /usr/local
-rm -v ./$filename
+sudo tar xf ./$filename --strip-components=1 -C /usr/local/cuda
+
+# save disk space
+git lfs prune && cd .. && rm -rf cudnn
 
 sudo sed -i '59i#define CUDNN_MAJOR 8' /usr/local/cuda/include/cudnn.h
diff --git a/scripts/github_actions/install_torch.sh b/scripts/github_actions/install_torch.sh
index 3ad1717..ed813c5 100755
--- a/scripts/github_actions/install_torch.sh
+++ b/scripts/github_actions/install_torch.sh
@@ -78,7 +78,7 @@ case ${torch} in
         ;;
     esac
     ;;
-  1.9.0)
+  1.9.*)
     case ${cuda} in
       10.2)
         package="torch==${torch}"
@@ -91,6 +91,40 @@ case ${torch} in
         ;;
     esac
     ;;
+  1.10.*)
+    case ${cuda} in
+      10.2)
+        package="torch==${torch}"
+        # Leave it empty to use PyPI.
+        url=
+        ;;
+      11.1)
+        package="torch==${torch}+cu111"
+        url=https://download.pytorch.org/whl/torch_stable.html
+        ;;
+      11.3)
+        package="torch==${torch}+cu113"
+        url=https://download.pytorch.org/whl/torch_stable.html
+        ;;
+    esac
+    ;;
+  1.11.*)
+    case ${cuda} in
+      10.2)
+        package="torch==${torch}"
+        # Leave it empty to use PyPI.
+        url=
+        ;;
+      11.3)
+        package="torch==${torch}+cu113"
+        url=https://download.pytorch.org/whl/torch_stable.html
+        ;;
+      11.5)
+        package="torch==${torch}+cu115"
+        url=https://download.pytorch.org/whl/torch_stable.html
+        ;;
+    esac
+    ;;
   *)
     echo "Unsupported PyTorch version: ${torch}"
     exit 1
diff --git a/scripts/github_actions/run-nightly-build.py b/scripts/github_actions/run-nightly-build.py
new file mode 100755
index 0000000..1e002fb
--- /dev/null
+++ b/scripts/github_actions/run-nightly-build.py
@@ -0,0 +1,35 @@
+#!/usr/bin/env python3
+# Copyright    2022  Xiaomi Corp.        (authors: Fangjun Kuang)
+
+import subprocess
+from datetime import datetime, timedelta
+
+
+def get_last_commit_date() -> datetime:
+    """Return the committer date of the last commit as a naive UTC datetime.
+
+    The committer date (%ct) is used instead of the author date (%ad)
+    because a rebased or cherry-picked commit keeps its old author date
+    and would otherwise look stale right after it lands.
+
+    Raises:
+      subprocess.CalledProcessError: if `git log` exits with an error.
+    """
+    # %ct is already seconds since the epoch, so --date=unix is not needed.
+    output = subprocess.check_output(["git", "log", "-1", "--format=%ct"])
+    timestamp = int(output.decode("ascii").strip())
+    # Naive UTC on purpose: the caller compares against datetime.utcnow().
+    return datetime.utcfromtimestamp(timestamp)
+
+
+def main():
+    """Print "true" if the last commit is under one day old, else "false"."""
+    committed_at = get_last_commit_date()
+    elapsed = datetime.utcnow() - committed_at
+    # Same test as `committed_at + 1 day > now`, expressed as an age.
+    is_fresh = elapsed < timedelta(days=1)
+    print("true" if is_fresh else "false")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/setup.py b/setup.py
index 3436265..d69fb87 100644
--- a/setup.py
+++ b/setup.py
@@ -61,5 +61,7 @@ with open("kaldifeat/python/kaldifeat/__init__.py", "r") as f:
 
 with open("kaldifeat/python/kaldifeat/__init__.py", "w") as f:
     for line in lines:
-        if "__version__" not in line:
-            f.write(line)
+        # Drop the bare `__version__ = "x.x.x"` line; a line that also
+        # mentions "torch" is kept even though it contains "__version__".
+        if "__version__" not in line or "torch" in line:
+            f.write(line)