Merge branch 'master' into pruned_aishell

2022-03-21 17:30:57 +08:00 · 2022-03-21 17:30:57 +08:00 · 7a3e88d2d3
commit 7a3e88d2d3
parent a4896fbda6 b2b4d9e0b6
69 changed files with 6030 additions and 1238 deletions
--- a/.flake8
+++ b/.flake8
@ -6,6 +6,7 @@ per-file-ignores =
    # line too long
    egs/librispeech/ASR/*/conformer.py: E501,
    egs/aishell/ASR/*/conformer.py: E501,
    egs/tedlium3/ASR/*/conformer.py: E501,
    # invalid escape sequence (caused by TeX formulas), W605
    icefall/utils.py: E501, W605
--- a/.github/workflows/run-librispeech-2022-03-12.yml
+++ b/.github/workflows/run-librispeech-2022-03-12.yml
@ -0,0 +1,180 @@
 # Copyright      2021  Fangjun Kuang (csukuangfj@gmail.com)
 # See ../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 name: run-librispeech-2022-03-12
 # stateless transducer + k2 pruned rnnt-loss
 on:
  # Every push to master triggers the workflow.
  push:
    branches: [master]
  # For pull requests only the "labeled" activity type triggers it.
  pull_request:
    types:
      - labeled
 jobs:
  run_librispeech_2022_03_12:
    if: github.event.label.name == 'ready' || github.event_name == 'push'
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: [ubuntu-18.04]
        # Versions are quoted so YAML keeps them as strings; an unquoted
        # 3.10 would be parsed as the float 3.1.
        python-version: ["3.7", "3.8", "3.9"]
      # Do not cancel the remaining matrix jobs when one of them fails.
      fail-fast: false
    steps:
      # Check out the repository. fetch-depth: 0 asks actions/checkout for
      # the complete history instead of a shallow clone.
      - uses: actions/checkout@v2
        with:
          fetch-depth: 0
      # System graphviz package, installed quietly through apt.
      - name: Install graphviz
        shell: bash
        run: sudo apt-get -qq install graphviz
      # Install the Python version selected by the build matrix.
      - name: Setup Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
          # Enable the action's pip cache, using any requirements-ci.txt in
          # the repository as the cache dependency file.
          cache: 'pip'
          cache-dependency-path: '**/requirements-ci.txt'
      - name: Install Python dependencies
        # Runs one `pip install` per non-comment line of requirements-ci.txt.
        # Only -L 1 is passed: GNU xargs treats -n and -L as mutually
        # exclusive, keeps the last one given and prints a warning.
        run: |
          grep -v '^#' ./requirements-ci.txt | xargs -L 1 pip install
      # Restore/save the kaldifeat checkout and build tree under
      # ~/tmp/kaldifeat. The step id is read by the "Install kaldifeat" step
      # to decide whether a rebuild is needed.
      - name: Cache kaldifeat
        id: my-cache
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/kaldifeat
          # The key varies only with the Python version of the matrix entry.
          key: cache-tmp-${{ matrix.python-version }}
      # Clone kaldifeat into ~/tmp and build its _kaldifeat target in Release
      # mode with two parallel make jobs.
      - name: Install kaldifeat
        # Skipped when the "Cache kaldifeat" step reported a cache hit.
        if: steps.my-cache.outputs.cache-hit != 'true'
        shell: bash
        run: |
          mkdir -p ~/tmp
          cd ~/tmp
          git clone https://github.com/csukuangfj/kaldifeat
          cd kaldifeat
          mkdir build
          cd build
          cmake -DCMAKE_BUILD_TYPE=Release ..
          make -j2 _kaldifeat
      # Install git-lfs, then clone the pre-trained model repository from
      # Hugging Face into ~/tmp.
      - name: Download pre-trained model
        shell: bash
        run: |
          sudo apt-get -qq install git-lfs
          mkdir -p ~/tmp
          cd ~/tmp
          git lfs install
          git clone https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless-2022-03-12
      # List the downloaded model repository and print information about its
      # test wave files.
      - name: Display test files
        shell: bash
        run: |
          sudo apt-get -qq install tree sox
          model=~/tmp/icefall-asr-librispeech-pruned-transducer-stateless-2022-03-12
          tree $model
          soxi $model/test_wavs/*.wav
          ls -lh $model/test_wavs/*.wav
      # Decode the three test wavs with greedy search and
      # --max-sym-per-frame 1. PYTHONPATH search order: kaldifeat build/lib,
      # kaldifeat python sources, repository root, previous PYTHONPATH.
      - name: Run greedy search decoding (max-sym-per-frame 1)
        shell: bash
        run: |
          kaldifeat=~/tmp/kaldifeat
          model=~/tmp/icefall-asr-librispeech-pruned-transducer-stateless-2022-03-12
          export PYTHONPATH=$kaldifeat/build/lib:$kaldifeat/kaldifeat/python:$PWD:$PYTHONPATH
          cd egs/librispeech/ASR
          ./pruned_transducer_stateless/pretrained.py \
            --method greedy_search \
            --max-sym-per-frame 1 \
            --checkpoint $model/exp/pretrained.pt \
            --bpe-model $model/data/lang_bpe_500/bpe.model \
            $model/test_wavs/1089-134686-0001.wav \
            $model/test_wavs/1221-135766-0001.wav \
            $model/test_wavs/1221-135766-0002.wav
      # Decode the three test wavs with greedy search and
      # --max-sym-per-frame 2. PYTHONPATH search order: kaldifeat build/lib,
      # kaldifeat python sources, repository root, previous PYTHONPATH.
      - name: Run greedy search decoding (max-sym-per-frame 2)
        shell: bash
        run: |
          kaldifeat=~/tmp/kaldifeat
          model=~/tmp/icefall-asr-librispeech-pruned-transducer-stateless-2022-03-12
          export PYTHONPATH=$kaldifeat/build/lib:$kaldifeat/kaldifeat/python:$PWD:$PYTHONPATH
          cd egs/librispeech/ASR
          ./pruned_transducer_stateless/pretrained.py \
            --method greedy_search \
            --max-sym-per-frame 2 \
            --checkpoint $model/exp/pretrained.pt \
            --bpe-model $model/data/lang_bpe_500/bpe.model \
            $model/test_wavs/1089-134686-0001.wav \
            $model/test_wavs/1221-135766-0001.wav \
            $model/test_wavs/1221-135766-0002.wav
      # Decode the three test wavs with greedy search and
      # --max-sym-per-frame 3. PYTHONPATH search order: kaldifeat build/lib,
      # kaldifeat python sources, repository root, previous PYTHONPATH.
      - name: Run greedy search decoding (max-sym-per-frame 3)
        shell: bash
        run: |
          kaldifeat=~/tmp/kaldifeat
          model=~/tmp/icefall-asr-librispeech-pruned-transducer-stateless-2022-03-12
          export PYTHONPATH=$kaldifeat/build/lib:$kaldifeat/kaldifeat/python:$PWD:$PYTHONPATH
          cd egs/librispeech/ASR
          ./pruned_transducer_stateless/pretrained.py \
            --method greedy_search \
            --max-sym-per-frame 3 \
            --checkpoint $model/exp/pretrained.pt \
            --bpe-model $model/data/lang_bpe_500/bpe.model \
            $model/test_wavs/1089-134686-0001.wav \
            $model/test_wavs/1221-135766-0001.wav \
            $model/test_wavs/1221-135766-0002.wav
      # Decode the three test wavs with --method beam_search and
      # --beam-size 4. PYTHONPATH search order: kaldifeat build/lib,
      # kaldifeat python sources, repository root, previous PYTHONPATH.
      - name: Run beam search decoding
        shell: bash
        run: |
          kaldifeat=~/tmp/kaldifeat
          model=~/tmp/icefall-asr-librispeech-pruned-transducer-stateless-2022-03-12
          export PYTHONPATH=$kaldifeat/build/lib:$kaldifeat/kaldifeat/python:$PWD:$PYTHONPATH
          cd egs/librispeech/ASR
          ./pruned_transducer_stateless/pretrained.py \
            --method beam_search \
            --beam-size 4 \
            --checkpoint $model/exp/pretrained.pt \
            --bpe-model $model/data/lang_bpe_500/bpe.model \
            $model/test_wavs/1089-134686-0001.wav \
            $model/test_wavs/1221-135766-0001.wav \
            $model/test_wavs/1221-135766-0002.wav
      # Decode the three test wavs with --method modified_beam_search and
      # --beam-size 4. PYTHONPATH search order: kaldifeat build/lib,
      # kaldifeat python sources, repository root, previous PYTHONPATH.
      - name: Run modified beam search decoding
        shell: bash
        run: |
          kaldifeat=~/tmp/kaldifeat
          model=~/tmp/icefall-asr-librispeech-pruned-transducer-stateless-2022-03-12
          export PYTHONPATH=$kaldifeat/build/lib:$kaldifeat/kaldifeat/python:$PWD:$PYTHONPATH
          cd egs/librispeech/ASR
          ./pruned_transducer_stateless/pretrained.py \
            --method modified_beam_search \
            --beam-size 4 \
            --checkpoint $model/exp/pretrained.pt \
            --bpe-model $model/data/lang_bpe_500/bpe.model \
            $model/test_wavs/1089-134686-0001.wav \
            $model/test_wavs/1221-135766-0001.wav \
            $model/test_wavs/1221-135766-0002.wav
--- a/.github/workflows/run-pretrained-conformer-ctc.yml
+++ b/.github/workflows/run-pretrained-conformer-ctc.yml
@ -31,9 +31,6 @@ jobs:
      matrix:
        os: [ubuntu-18.04]
        python-version: [3.7, 3.8, 3.9]
        torch: ["1.10.0"]
        torchaudio: ["0.10.0"]
        k2-version: ["1.9.dev20211101"]
      fail-fast: false
@ -42,30 +39,43 @@ jobs:
        with:
          fetch-depth: 0
      - name: Setup Python ${{ matrix.python-version }}
        uses: actions/setup-python@v1
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install Python dependencies
        run: |
          python3 -m pip install --upgrade pip pytest
          # numpy 1.20.x does not support python 3.6
          pip install numpy==1.19
          pip install torch==${{ matrix.torch }}+cpu torchaudio==${{ matrix.torchaudio }}+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html
          pip install k2==${{ matrix.k2-version }}+cpu.torch${{ matrix.torch }} -f https://k2-fsa.org/nightly/
          python3 -m pip install git+https://github.com/lhotse-speech/lhotse
          python3 -m pip install kaldifeat
          # We are in ./icefall and there is a file: requirements.txt in it
          pip install -r requirements.txt
      - name: Install graphviz
        shell: bash
        run: |
          python3 -m pip install -qq graphviz
          sudo apt-get -qq install graphviz
      - name: Setup Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
          cache: 'pip'
          cache-dependency-path: '**/requirements-ci.txt'
      - name: Install Python dependencies
        # Runs one `pip install` per non-comment line of requirements-ci.txt.
        # Only -L 1 is passed: GNU xargs treats -n and -L as mutually
        # exclusive, keeps the last one given and prints a warning.
        run: |
          grep -v '^#' ./requirements-ci.txt | xargs -L 1 pip install
      - name: Cache kaldifeat
        id: my-cache
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/kaldifeat
          key: cache-tmp-${{ matrix.python-version }}
      - name: Install kaldifeat
        if: steps.my-cache.outputs.cache-hit != 'true'
        shell: bash
        run: |
          mkdir -p ~/tmp
          cd ~/tmp
          git clone https://github.com/csukuangfj/kaldifeat
          cd kaldifeat
          mkdir build
          cd build
          cmake -DCMAKE_BUILD_TYPE=Release ..
          make -j2 _kaldifeat
      - name: Download pre-trained model
        shell: bash
        run: |
@ -83,7 +93,9 @@ jobs:
      - name: Run CTC decoding
        shell: bash
        run: |
-          export PYTHONPATH=$PWD:PYTHONPATH
+          export PYTHONPATH=$PWD:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
          cd egs/librispeech/ASR
          ./conformer_ctc/pretrained.py \
            --num-classes 500 \
@ -98,6 +110,8 @@ jobs:
        shell: bash
        run: |
          export PYTHONPATH=$PWD:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
          cd egs/librispeech/ASR
          ./conformer_ctc/pretrained.py \
            --num-classes 500 \
--- a/.github/workflows/run-pretrained-transducer-stateless-librispeech-100h.yml
+++ b/.github/workflows/run-pretrained-transducer-stateless-librispeech-100h.yml
@ -31,9 +31,6 @@ jobs:
      matrix:
        os: [ubuntu-18.04]
        python-version: [3.7, 3.8, 3.9]
        torch: ["1.10.0"]
        torchaudio: ["0.10.0"]
        k2-version: ["1.9.dev20211101"]
      fail-fast: false
@ -42,30 +39,43 @@ jobs:
        with:
          fetch-depth: 0
      - name: Setup Python ${{ matrix.python-version }}
        uses: actions/setup-python@v1
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install Python dependencies
        run: |
          python3 -m pip install --upgrade pip pytest
          # numpy 1.20.x does not support python 3.6
          pip install numpy==1.19
          pip install torch==${{ matrix.torch }}+cpu torchaudio==${{ matrix.torchaudio }}+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html
          pip install k2==${{ matrix.k2-version }}+cpu.torch${{ matrix.torch }} -f https://k2-fsa.org/nightly/
          python3 -m pip install git+https://github.com/lhotse-speech/lhotse
          python3 -m pip install kaldifeat
          # We are in ./icefall and there is a file: requirements.txt in it
          pip install -r requirements.txt
      - name: Install graphviz
        shell: bash
        run: |
          python3 -m pip install -qq graphviz
          sudo apt-get -qq install graphviz
      - name: Setup Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
          cache: 'pip'
          cache-dependency-path: '**/requirements-ci.txt'
      - name: Install Python dependencies
        # Runs one `pip install` per non-comment line of requirements-ci.txt.
        # Only -L 1 is passed: GNU xargs treats -n and -L as mutually
        # exclusive, keeps the last one given and prints a warning.
        run: |
          grep -v '^#' ./requirements-ci.txt | xargs -L 1 pip install
      - name: Cache kaldifeat
        id: my-cache
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/kaldifeat
          key: cache-tmp-${{ matrix.python-version }}
      - name: Install kaldifeat
        if: steps.my-cache.outputs.cache-hit != 'true'
        shell: bash
        run: |
          mkdir -p ~/tmp
          cd ~/tmp
          git clone https://github.com/csukuangfj/kaldifeat
          cd kaldifeat
          mkdir build
          cd build
          cmake -DCMAKE_BUILD_TYPE=Release ..
          make -j2 _kaldifeat
      - name: Download pre-trained model
        shell: bash
        run: |
@ -84,7 +94,9 @@ jobs:
      - name: Run greedy search decoding (max-sym-per-frame 1)
        shell: bash
        run: |
-          export PYTHONPATH=$PWD:PYTHONPATH
+          export PYTHONPATH=$PWD:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
          cd egs/librispeech/ASR
          ./transducer_stateless_multi_datasets/pretrained.py \
            --method greedy_search \
@ -98,7 +110,9 @@ jobs:
      - name: Run greedy search decoding (max-sym-per-frame 2)
        shell: bash
        run: |
-          export PYTHONPATH=$PWD:PYTHONPATH
+          export PYTHONPATH=$PWD:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
          cd egs/librispeech/ASR
          ./transducer_stateless_multi_datasets/pretrained.py \
            --method greedy_search \
@ -112,7 +126,9 @@ jobs:
      - name: Run greedy search decoding (max-sym-per-frame 3)
        shell: bash
        run: |
-          export PYTHONPATH=$PWD:PYTHONPATH
+          export PYTHONPATH=$PWD:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
          cd egs/librispeech/ASR
          ./transducer_stateless_multi_datasets/pretrained.py \
            --method greedy_search \
@ -127,6 +143,8 @@ jobs:
        shell: bash
        run: |
          export PYTHONPATH=$PWD:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
          cd egs/librispeech/ASR
          ./transducer_stateless_multi_datasets/pretrained.py \
            --method beam_search \
@ -141,6 +159,8 @@ jobs:
        shell: bash
        run: |
          export PYTHONPATH=$PWD:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
          cd egs/librispeech/ASR
          ./transducer_stateless_multi_datasets/pretrained.py \
            --method modified_beam_search \
--- a/.github/workflows/run-pretrained-transducer-stateless-librispeech-multi-datasets.yml
+++ b/.github/workflows/run-pretrained-transducer-stateless-librispeech-multi-datasets.yml
@ -31,9 +31,6 @@ jobs:
      matrix:
        os: [ubuntu-18.04]
        python-version: [3.7, 3.8, 3.9]
        torch: ["1.10.0"]
        torchaudio: ["0.10.0"]
        k2-version: ["1.9.dev20211101"]
      fail-fast: false
@ -42,30 +39,43 @@ jobs:
        with:
          fetch-depth: 0
      - name: Setup Python ${{ matrix.python-version }}
        uses: actions/setup-python@v1
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install Python dependencies
        run: |
          python3 -m pip install --upgrade pip pytest
          # numpy 1.20.x does not support python 3.6
          pip install numpy==1.19
          pip install torch==${{ matrix.torch }}+cpu torchaudio==${{ matrix.torchaudio }}+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html
          pip install k2==${{ matrix.k2-version }}+cpu.torch${{ matrix.torch }} -f https://k2-fsa.org/nightly/
          python3 -m pip install git+https://github.com/lhotse-speech/lhotse
          python3 -m pip install kaldifeat
          # We are in ./icefall and there is a file: requirements.txt in it
          pip install -r requirements.txt
      - name: Install graphviz
        shell: bash
        run: |
          python3 -m pip install -qq graphviz
          sudo apt-get -qq install graphviz
      - name: Setup Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
          cache: 'pip'
          cache-dependency-path: '**/requirements-ci.txt'
      - name: Install Python dependencies
        # Runs one `pip install` per non-comment line of requirements-ci.txt.
        # Only -L 1 is passed: GNU xargs treats -n and -L as mutually
        # exclusive, keeps the last one given and prints a warning.
        run: |
          grep -v '^#' ./requirements-ci.txt | xargs -L 1 pip install
      - name: Cache kaldifeat
        id: my-cache
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/kaldifeat
          key: cache-tmp-${{ matrix.python-version }}
      - name: Install kaldifeat
        if: steps.my-cache.outputs.cache-hit != 'true'
        shell: bash
        run: |
          mkdir -p ~/tmp
          cd ~/tmp
          git clone https://github.com/csukuangfj/kaldifeat
          cd kaldifeat
          mkdir build
          cd build
          cmake -DCMAKE_BUILD_TYPE=Release ..
          make -j2 _kaldifeat
      - name: Download pre-trained model
        shell: bash
        run: |
@ -85,7 +95,9 @@ jobs:
      - name: Run greedy search decoding (max-sym-per-frame 1)
        shell: bash
        run: |
-          export PYTHONPATH=$PWD:PYTHONPATH
+          export PYTHONPATH=$PWD:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
          cd egs/librispeech/ASR
          ./transducer_stateless_multi_datasets/pretrained.py \
            --method greedy_search \
@ -99,7 +111,9 @@ jobs:
      - name: Run greedy search decoding (max-sym-per-frame 2)
        shell: bash
        run: |
-          export PYTHONPATH=$PWD:PYTHONPATH
+          export PYTHONPATH=$PWD:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
          cd egs/librispeech/ASR
          ./transducer_stateless_multi_datasets/pretrained.py \
            --method greedy_search \
@ -113,7 +127,9 @@ jobs:
      - name: Run greedy search decoding (max-sym-per-frame 3)
        shell: bash
        run: |
-          export PYTHONPATH=$PWD:PYTHONPATH
+          export PYTHONPATH=$PWD:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
          cd egs/librispeech/ASR
          ./transducer_stateless_multi_datasets/pretrained.py \
            --method greedy_search \
@ -128,6 +144,8 @@ jobs:
        shell: bash
        run: |
          export PYTHONPATH=$PWD:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
          cd egs/librispeech/ASR
          ./transducer_stateless_multi_datasets/pretrained.py \
            --method beam_search \
@ -143,6 +161,8 @@ jobs:
        shell: bash
        run: |
          export PYTHONPATH=$PWD:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
          cd egs/librispeech/ASR
          ./transducer_stateless_multi_datasets/pretrained.py \
            --method modified_beam_search \
--- a/.github/workflows/run-pretrained-transducer-stateless-modified-2-aishell.yml
+++ b/.github/workflows/run-pretrained-transducer-stateless-modified-2-aishell.yml
@ -31,9 +31,6 @@ jobs:
      matrix:
        os: [ubuntu-18.04]
        python-version: [3.7, 3.8, 3.9]
        torch: ["1.10.0"]
        torchaudio: ["0.10.0"]
        k2-version: ["1.9.dev20211101"]
      fail-fast: false
@ -42,30 +39,43 @@ jobs:
        with:
          fetch-depth: 0
      - name: Setup Python ${{ matrix.python-version }}
        uses: actions/setup-python@v1
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install Python dependencies
        run: |
          python3 -m pip install --upgrade pip pytest
          # numpy 1.20.x does not support python 3.6
          pip install numpy==1.19
          pip install torch==${{ matrix.torch }}+cpu torchaudio==${{ matrix.torchaudio }}+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html
          pip install k2==${{ matrix.k2-version }}+cpu.torch${{ matrix.torch }} -f https://k2-fsa.org/nightly/
          python3 -m pip install git+https://github.com/lhotse-speech/lhotse
          python3 -m pip install kaldifeat
          # We are in ./icefall and there is a file: requirements.txt in it
          pip install -r requirements.txt
      - name: Install graphviz
        shell: bash
        run: |
          python3 -m pip install -qq graphviz
          sudo apt-get -qq install graphviz
      - name: Setup Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
          cache: 'pip'
          cache-dependency-path: '**/requirements-ci.txt'
      - name: Install Python dependencies
        # Runs one `pip install` per non-comment line of requirements-ci.txt.
        # Only -L 1 is passed: GNU xargs treats -n and -L as mutually
        # exclusive, keeps the last one given and prints a warning.
        run: |
          grep -v '^#' ./requirements-ci.txt | xargs -L 1 pip install
      - name: Cache kaldifeat
        id: my-cache
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/kaldifeat
          key: cache-tmp-${{ matrix.python-version }}
      - name: Install kaldifeat
        if: steps.my-cache.outputs.cache-hit != 'true'
        shell: bash
        run: |
          mkdir -p ~/tmp
          cd ~/tmp
          git clone https://github.com/csukuangfj/kaldifeat
          cd kaldifeat
          mkdir build
          cd build
          cmake -DCMAKE_BUILD_TYPE=Release ..
          make -j2 _kaldifeat
      - name: Download pre-trained model
        shell: bash
        run: |
@ -84,7 +94,9 @@ jobs:
      - name: Run greedy search decoding (max-sym-per-frame 1)
        shell: bash
        run: |
-          export PYTHONPATH=$PWD:PYTHONPATH
+          export PYTHONPATH=$PWD:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
          cd egs/aishell/ASR
          ./transducer_stateless_modified-2/pretrained.py \
            --method greedy_search \
@ -98,7 +110,9 @@ jobs:
      - name: Run greedy search decoding (max-sym-per-frame 2)
        shell: bash
        run: |
-          export PYTHONPATH=$PWD:PYTHONPATH
+          export PYTHONPATH=$PWD:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
          cd egs/aishell/ASR
          ./transducer_stateless_modified-2/pretrained.py \
            --method greedy_search \
@ -112,7 +126,9 @@ jobs:
      - name: Run greedy search decoding (max-sym-per-frame 3)
        shell: bash
        run: |
-          export PYTHONPATH=$PWD:PYTHONPATH
+          export PYTHONPATH=$PWD:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
          cd egs/aishell/ASR
          ./transducer_stateless_modified-2/pretrained.py \
            --method greedy_search \
@ -127,6 +143,8 @@ jobs:
        shell: bash
        run: |
          export PYTHONPATH=$PWD:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
          cd egs/aishell/ASR
          ./transducer_stateless_modified-2/pretrained.py \
            --method beam_search \
@ -142,6 +160,8 @@ jobs:
        shell: bash
        run: |
          export PYTHONPATH=$PWD:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
          cd egs/aishell/ASR
          ./transducer_stateless_modified-2/pretrained.py \
            --method modified_beam_search \
--- a/.github/workflows/run-pretrained-transducer-stateless-modified-aishell.yml
+++ b/.github/workflows/run-pretrained-transducer-stateless-modified-aishell.yml
@ -31,9 +31,6 @@ jobs:
      matrix:
        os: [ubuntu-18.04]
        python-version: [3.7, 3.8, 3.9]
        torch: ["1.10.0"]
        torchaudio: ["0.10.0"]
        k2-version: ["1.9.dev20211101"]
      fail-fast: false
@ -42,30 +39,43 @@ jobs:
        with:
          fetch-depth: 0
      - name: Setup Python ${{ matrix.python-version }}
        uses: actions/setup-python@v1
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install Python dependencies
        run: |
          python3 -m pip install --upgrade pip pytest
          # numpy 1.20.x does not support python 3.6
          pip install numpy==1.19
          pip install torch==${{ matrix.torch }}+cpu torchaudio==${{ matrix.torchaudio }}+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html
          pip install k2==${{ matrix.k2-version }}+cpu.torch${{ matrix.torch }} -f https://k2-fsa.org/nightly/
          python3 -m pip install git+https://github.com/lhotse-speech/lhotse
          python3 -m pip install kaldifeat
          # We are in ./icefall and there is a file: requirements.txt in it
          pip install -r requirements.txt
      - name: Install graphviz
        shell: bash
        run: |
          python3 -m pip install -qq graphviz
          sudo apt-get -qq install graphviz
      - name: Setup Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
          cache: 'pip'
          cache-dependency-path: '**/requirements-ci.txt'
      - name: Install Python dependencies
        # Runs one `pip install` per non-comment line of requirements-ci.txt.
        # Only -L 1 is passed: GNU xargs treats -n and -L as mutually
        # exclusive, keeps the last one given and prints a warning.
        run: |
          grep -v '^#' ./requirements-ci.txt | xargs -L 1 pip install
      - name: Cache kaldifeat
        id: my-cache
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/kaldifeat
          key: cache-tmp-${{ matrix.python-version }}
      - name: Install kaldifeat
        if: steps.my-cache.outputs.cache-hit != 'true'
        shell: bash
        run: |
          mkdir -p ~/tmp
          cd ~/tmp
          git clone https://github.com/csukuangfj/kaldifeat
          cd kaldifeat
          mkdir build
          cd build
          cmake -DCMAKE_BUILD_TYPE=Release ..
          make -j2 _kaldifeat
      - name: Download pre-trained model
        shell: bash
        run: |
@ -84,7 +94,9 @@ jobs:
      - name: Run greedy search decoding (max-sym-per-frame 1)
        shell: bash
        run: |
-          export PYTHONPATH=$PWD:PYTHONPATH
+          export PYTHONPATH=$PWD:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
          cd egs/aishell/ASR
          ./transducer_stateless_modified/pretrained.py \
            --method greedy_search \
@ -98,7 +110,9 @@ jobs:
      - name: Run greedy search decoding (max-sym-per-frame 2)
        shell: bash
        run: |
-          export PYTHONPATH=$PWD:PYTHONPATH
+          export PYTHONPATH=$PWD:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
          cd egs/aishell/ASR
          ./transducer_stateless_modified/pretrained.py \
            --method greedy_search \
@ -112,7 +126,9 @@ jobs:
      - name: Run greedy search decoding (max-sym-per-frame 3)
        shell: bash
        run: |
-          export PYTHONPATH=$PWD:PYTHONPATH
+          export PYTHONPATH=$PWD:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
          cd egs/aishell/ASR
          ./transducer_stateless_modified/pretrained.py \
            --method greedy_search \
@ -127,6 +143,8 @@ jobs:
        shell: bash
        run: |
          export PYTHONPATH=$PWD:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
          cd egs/aishell/ASR
          ./transducer_stateless_modified/pretrained.py \
            --method beam_search \
@ -142,6 +160,8 @@ jobs:
        shell: bash
        run: |
          export PYTHONPATH=$PWD:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
          cd egs/aishell/ASR
          ./transducer_stateless_modified/pretrained.py \
            --method modified_beam_search \
--- a/.github/workflows/run-pretrained-transducer-stateless.yml
+++ b/.github/workflows/run-pretrained-transducer-stateless.yml
@ -31,9 +31,6 @@ jobs:
      matrix:
        os: [ubuntu-18.04]
        python-version: [3.7, 3.8, 3.9]
        torch: ["1.10.0"]
        torchaudio: ["0.10.0"]
        k2-version: ["1.9.dev20211101"]
      fail-fast: false
@ -42,30 +39,43 @@ jobs:
        with:
          fetch-depth: 0
      - name: Setup Python ${{ matrix.python-version }}
        uses: actions/setup-python@v1
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install Python dependencies
        run: |
          python3 -m pip install --upgrade pip pytest
          # numpy 1.20.x does not support python 3.6
          pip install numpy==1.19
          pip install torch==${{ matrix.torch }}+cpu torchaudio==${{ matrix.torchaudio }}+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html
          pip install k2==${{ matrix.k2-version }}+cpu.torch${{ matrix.torch }} -f https://k2-fsa.org/nightly/
          python3 -m pip install git+https://github.com/lhotse-speech/lhotse
          python3 -m pip install kaldifeat
          # We are in ./icefall and there is a file: requirements.txt in it
          pip install -r requirements.txt
      - name: Install graphviz
        shell: bash
        run: |
          python3 -m pip install -qq graphviz
          sudo apt-get -qq install graphviz
      - name: Setup Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
          cache: 'pip'
          cache-dependency-path: '**/requirements-ci.txt'
      - name: Install Python dependencies
        # Runs one `pip install` per non-comment line of requirements-ci.txt.
        # Only -L 1 is passed: GNU xargs treats -n and -L as mutually
        # exclusive, keeps the last one given and prints a warning.
        run: |
          grep -v '^#' ./requirements-ci.txt | xargs -L 1 pip install
      - name: Cache kaldifeat
        id: my-cache
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/kaldifeat
          key: cache-tmp-${{ matrix.python-version }}
      - name: Install kaldifeat
        if: steps.my-cache.outputs.cache-hit != 'true'
        shell: bash
        run: |
          mkdir -p ~/tmp
          cd ~/tmp
          git clone https://github.com/csukuangfj/kaldifeat
          cd kaldifeat
          mkdir build
          cd build
          cmake -DCMAKE_BUILD_TYPE=Release ..
          make -j2 _kaldifeat
      - name: Download pre-trained model
        shell: bash
        run: |
@ -83,7 +93,9 @@ jobs:
      - name: Run greedy search decoding (max-sym-per-frame 1)
        shell: bash
        run: |
-          export PYTHONPATH=$PWD:PYTHONPATH
+          export PYTHONPATH=$PWD:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
          cd egs/librispeech/ASR
          ./transducer_stateless/pretrained.py \
            --method greedy_search \
@ -97,7 +109,9 @@ jobs:
      - name: Run greedy search decoding (max-sym-per-frame 2)
        shell: bash
        run: |
-          export PYTHONPATH=$PWD:PYTHONPATH
+          export PYTHONPATH=$PWD:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
          cd egs/librispeech/ASR
          ./transducer_stateless/pretrained.py \
            --method greedy_search \
@ -111,7 +125,9 @@ jobs:
      - name: Run greedy search decoding (max-sym-per-frame 3)
        shell: bash
        run: |
-          export PYTHONPATH=$PWD:PYTHONPATH
+          export PYTHONPATH=$PWD:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
          cd egs/librispeech/ASR
          ./transducer_stateless/pretrained.py \
            --method greedy_search \
@ -126,6 +142,8 @@ jobs:
        shell: bash
        run: |
          export PYTHONPATH=$PWD:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
          cd egs/librispeech/ASR
          ./transducer_stateless/pretrained.py \
            --method beam_search \
@ -140,6 +158,8 @@ jobs:
        shell: bash
        run: |
          export PYTHONPATH=$PWD:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
          cd egs/librispeech/ASR
          ./transducer_stateless/pretrained.py \
            --method modified_beam_search \
--- a/.github/workflows/run-pretrained-transducer.yml
+++ b/.github/workflows/run-pretrained-transducer.yml
@ -31,9 +31,6 @@ jobs:
      matrix:
        os: [ubuntu-18.04]
        python-version: [3.7, 3.8, 3.9]
        torch: ["1.10.0"]
        torchaudio: ["0.10.0"]
        k2-version: ["1.9.dev20211101"]
      fail-fast: false
@ -42,30 +39,43 @@ jobs:
        with:
          fetch-depth: 0
      - name: Setup Python ${{ matrix.python-version }}
        uses: actions/setup-python@v1
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install Python dependencies
        run: |
          python3 -m pip install --upgrade pip pytest
          # numpy 1.20.x does not support python 3.6
          pip install numpy==1.19
          pip install torch==${{ matrix.torch }}+cpu torchaudio==${{ matrix.torchaudio }}+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html
          pip install k2==${{ matrix.k2-version }}+cpu.torch${{ matrix.torch }} -f https://k2-fsa.org/nightly/
          python3 -m pip install git+https://github.com/lhotse-speech/lhotse
          python3 -m pip install kaldifeat
          # We are in ./icefall and there is a file: requirements.txt in it
          pip install -r requirements.txt
      - name: Install graphviz
        shell: bash
        run: |
          python3 -m pip install -qq graphviz
          sudo apt-get -qq install graphviz
      - name: Setup Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
          cache: 'pip'
          cache-dependency-path: '**/requirements-ci.txt'
      - name: Install Python dependencies
        run: |
          grep -v '^#' ./requirements-ci.txt  | xargs -n 1 -L 1 pip install
      - name: Cache kaldifeat
        id: my-cache
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/kaldifeat
          key: cache-tmp-${{ matrix.python-version }}
      - name: Install kaldifeat
        if: steps.my-cache.outputs.cache-hit != 'true'
        shell: bash
        run: |
          mkdir -p ~/tmp
          cd ~/tmp
          git clone https://github.com/csukuangfj/kaldifeat
          cd kaldifeat
          mkdir build
          cd build
          cmake -DCMAKE_BUILD_TYPE=Release ..
          make -j2 _kaldifeat
      - name: Download pre-trained model
        shell: bash
        run: |
@ -84,7 +94,9 @@ jobs:
      - name: Run greedy search decoding
        shell: bash
        run: |
-          export PYTHONPATH=$PWD:PYTHONPATH
+          export PYTHONPATH=$PWD:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
          cd egs/librispeech/ASR
          ./transducer/pretrained.py \
            --method greedy_search \
@ -98,6 +110,8 @@ jobs:
        shell: bash
        run: |
          export PYTHONPATH=$PWD:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
          cd egs/librispeech/ASR
          ./transducer/pretrained.py \
            --method beam_search \
--- a/.github/workflows/run-yesno-recipe.yml
+++ b/.github/workflows/run-yesno-recipe.yml
@ -33,9 +33,6 @@ jobs:
        # TODO: enable macOS for CPU testing
        os: [ubuntu-18.04]
        python-version: [3.8]
        torch: ["1.10.0"]
        torchaudio: ["0.10.0"]
        k2-version: ["1.9.dev20211101"]
      fail-fast: false
    steps:
@ -43,10 +40,17 @@ jobs:
        with:
          fetch-depth: 0
      - name: Install graphviz
        shell: bash
        run: |
          sudo apt-get -qq install graphviz
      - name: Setup Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v1
+        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
          cache: 'pip'
          cache-dependency-path: '**/requirements-ci.txt'
      - name: Install libnsdfile and libsox
        if: startsWith(matrix.os, 'ubuntu')
@ -57,13 +61,7 @@ jobs:
      - name: Install Python dependencies
        run: |
-          python3 -m pip install -U pip
+          grep -v '^#' ./requirements-ci.txt  | xargs -n 1 -L 1 pip install
          pip install torch==${{ matrix.torch }}+cpu torchaudio==${{ matrix.torchaudio }}+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html
          pip install k2==${{ matrix.k2-version }}+cpu.torch${{ matrix.torch }} -f https://k2-fsa.org/nightly/
          python3 -m pip install git+https://github.com/lhotse-speech/lhotse
          # We are in ./icefall and there is a file: requirements.txt in it
          python3 -m pip install -r requirements.txt
      - name: Run yesno recipe
        shell: bash
--- a/README.md
+++ b/README.md
@ -84,7 +84,7 @@ The best WER using modified beam search with beam size 4 is:
 |     | test-clean | test-other |
 |-----|------------|------------|
-| WER | 2.61       | 6.46       |
+| WER | 2.56       | 6.27       |
 Note: No auxiliary losses are used in the training and no LMs are used
 in the decoding.
--- a/egs/aishell/ASR/tdnn_lstm_ctc/asr_datamodule.py
+++ b/egs/aishell/ASR/tdnn_lstm_ctc/asr_datamodule.py
@ -1,4 +1,5 @@
 # Copyright      2021  Piotr Żelasko
 # Copyright      2022  Xiaomi Corporation     (Author: Mingshuang Luo)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
@ -16,6 +17,7 @@
 import argparse
 import inspect
 import logging
 from functools import lru_cache
 from pathlib import Path
@ -210,10 +212,20 @@ class AishellAsrDataModule:
            logging.info(
                f"Time warp factor: {self.args.spec_aug_time_warp_factor}"
            )
            # Choose num_frame_masks based on the installed Lhotse version:
            # the default value of num_frame_masks differs across Lhotse
            # versions.
            num_frame_masks = 10
            num_frame_masks_parameter = inspect.signature(
                SpecAugment.__init__
            ).parameters["num_frame_masks"]
            if num_frame_masks_parameter.default == 1:
                num_frame_masks = 2
            logging.info(f"Num frame mask: {num_frame_masks}")
            input_transforms.append(
                SpecAugment(
                    time_warp_factor=self.args.spec_aug_time_warp_factor,
-                    num_frame_masks=2,
+                    num_frame_masks=num_frame_masks,
                    features_mask_size=27,
                    num_feature_masks=2,
                    frames_mask_size=100,
--- a/egs/aishell/ASR/transducer_stateless_modified-2/asr_datamodule.py
+++ b/egs/aishell/ASR/transducer_stateless_modified-2/asr_datamodule.py
@ -1,5 +1,6 @@
 # Copyright      2021  Piotr Żelasko
-#                2022  Xiaomi Corp.        (authors: Fangjun Kuang)
+#                2022  Xiaomi Corp.        (authors: Fangjun Kuang
 #                                                    Mingshuang Luo)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
@ -16,6 +17,7 @@
 # limitations under the License.
 import argparse
 import inspect
 import logging
 from pathlib import Path
 from typing import Optional
@ -180,10 +182,20 @@ class AsrDataModule:
            logging.info(
                f"Time warp factor: {self.args.spec_aug_time_warp_factor}"
            )
            # Choose num_frame_masks based on the installed Lhotse version:
            # the default value of num_frame_masks differs across Lhotse
            # versions.
            num_frame_masks = 10
            num_frame_masks_parameter = inspect.signature(
                SpecAugment.__init__
            ).parameters["num_frame_masks"]
            if num_frame_masks_parameter.default == 1:
                num_frame_masks = 2
            logging.info(f"Num frame mask: {num_frame_masks}")
            input_transforms.append(
                SpecAugment(
                    time_warp_factor=self.args.spec_aug_time_warp_factor,
-                    num_frame_masks=2,
+                    num_frame_masks=num_frame_masks,
                    features_mask_size=27,
                    num_feature_masks=2,
                    frames_mask_size=100,
--- a/egs/librispeech/ASR/README.md
+++ b/egs/librispeech/ASR/README.md
@ -15,6 +15,7 @@ The following table lists the differences among them.
 | `transducer_stateless`                | Conformer | Embedding + Conv1d |                                                   |
 | `transducer_lstm`                     | LSTM      | LSTM               |                                                   |
 | `transducer_stateless_multi_datasets` | Conformer | Embedding + Conv1d | Using data from GigaSpeech as extra training data |
 | `pruned_transducer_stateless`         | Conformer | Embedding + Conv1d | Using k2 pruned RNN-T loss                        |
 The decoder in `transducer_stateless` is modified from the paper
 [Rnn-Transducer with Stateless Prediction Network](https://ieeexplore.ieee.org/document/9054419/).
--- a/egs/librispeech/ASR/RESULTS.md
+++ b/egs/librispeech/ASR/RESULTS.md
@ -2,12 +2,111 @@
 ### LibriSpeech BPE training results (Pruned Transducer)
 #### Conformer encoder + embedding decoder
 Conformer encoder + non-recurrent decoder. The decoder
 contains only an embedding layer, a Conv1d (with kernel size 2) and a linear
 layer (to transform tensor dim).
 #### 2022-03-12
 [pruned_transducer_stateless](./pruned_transducer_stateless)
 Using commit `1603744469d167d848e074f2ea98c587153205fa`.
 See <https://github.com/k2-fsa/icefall/pull/248>
 The WERs are:
 |                                     | test-clean | test-other | comment                                  |
 |-------------------------------------|------------|------------|------------------------------------------|
 | greedy search (max sym per frame 1) | 2.62       | 6.37       | --epoch 42, --avg 11, --max-duration 100 |
 | greedy search (max sym per frame 2) | 2.62       | 6.37       | --epoch 42, --avg 11, --max-duration 100 |
 | greedy search (max sym per frame 3) | 2.62       | 6.37       | --epoch 42, --avg 11, --max-duration 100 |
 | modified beam search (beam size 4)  | 2.56       | 6.27       | --epoch 42, --avg 11, --max-duration 100 |
 | beam search (beam size 4)           | 2.57       | 6.27       | --epoch 42, --avg 11, --max-duration 100 |
 The decoding time for `test-clean` and `test-other` is given below:
 (A V100 GPU with 32 GB RAM is used for decoding. Note: Not all GPU RAM is used during decoding.)
 | decoding method | test-clean (seconds) | test-other (seconds)|
 |---|---:|---:|
 | greedy search (--max-sym-per-frame=1) | 160 | 159 |
 | greedy search (--max-sym-per-frame=2) | 184 | 177 |
 | greedy search (--max-sym-per-frame=3) | 210 | 213 |
 | modified beam search (--beam-size 4)| 273 | 269 |
 |beam search (--beam-size 4) | 2741 | 2221 |
 We recommend using `modified_beam_search`.
 Training command:
 ```bash
 cd egs/librispeech/ASR/
 ./prepare.sh
 export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
 . path.sh
 ./pruned_transducer_stateless/train.py \
  --world-size 8 \
  --num-epochs 60 \
  --start-epoch 0 \
  --exp-dir pruned_transducer_stateless/exp \
  --full-libri 1 \
  --max-duration 300 \
  --prune-range 5 \
  --lr-factor 5 \
  --lm-scale 0.25
 ```
 The tensorboard training log can be found at
 <https://tensorboard.dev/experiment/WKRFY5fYSzaVBHahenpNlA/>
 The command for decoding is:
 ```bash
 epoch=42
 avg=11
 sym=1
 # greedy search
 ./pruned_transducer_stateless/decode.py \
  --epoch $epoch \
  --avg $avg \
  --exp-dir ./pruned_transducer_stateless/exp \
  --max-duration 100 \
  --decoding-method greedy_search \
  --beam-size 4 \
  --max-sym-per-frame $sym
 # modified beam search
 ./pruned_transducer_stateless/decode.py \
  --epoch $epoch \
  --avg $avg \
  --exp-dir ./pruned_transducer_stateless/exp \
  --max-duration 100 \
  --decoding-method modified_beam_search \
  --beam-size 4
 # beam search
 # (not recommended)
 ./pruned_transducer_stateless/decode.py \
  --epoch $epoch \
  --avg $avg \
  --exp-dir ./pruned_transducer_stateless/exp \
  --max-duration 100 \
  --decoding-method beam_search \
  --beam-size 4
 ```
 You can find a pre-trained model, decoding logs, and decoding results at
 <https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless-2022-03-12>
 #### 2022-02-18
 [pruned_transducer_stateless](./pruned_transducer_stateless)
 The WERs are
 |                           | test-clean | test-other | comment                                  |
@ -62,7 +161,7 @@ See
 ##### 2022-03-01
-Using commit `fill in it after merging`.
+Using commit `2332ba312d7ce72f08c7bac1e3312f7e3dd722dc`.
 It uses [GigaSpeech](https://github.com/SpeechColab/GigaSpeech)
 as extra training data. 20% of the time it selects a batch from L subset of
@ -129,6 +228,9 @@ sym=1
  --beam-size 4
 ```
 You can find a pretrained model by visiting
 <https://huggingface.co/csukuangfj/icefall-asr-librispeech-transducer-stateless-multi-datasets-bpe-500-2022-03-01>
 ##### 2022-02-07
--- a/egs/librispeech/ASR/conformer_mmi/asr_datamodule.py
+++ b/egs/librispeech/ASR/conformer_mmi/asr_datamodule.py
@ -1,356 +0,0 @@
 # Copyright      2021  Piotr Żelasko
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import argparse
 import logging
 from functools import lru_cache
 from pathlib import Path
 from typing import List, Union
 from lhotse import CutSet, Fbank, FbankConfig, load_manifest
 from lhotse.dataset import (
    BucketingSampler,
    CutConcatenate,
    CutMix,
    K2SpeechRecognitionDataset,
    PrecomputedFeatures,
    SingleCutSampler,
    SpecAugment,
 )
 from lhotse.dataset.input_strategies import OnTheFlyFeatures
 from torch.utils.data import DataLoader
 from icefall.dataset.datamodule import DataModule
 from icefall.utils import str2bool
 class LibriSpeechAsrDataModule(DataModule):
    """
    DataModule for k2 ASR experiments.
    It assumes there is always one train and valid dataloader,
    but there can be multiple test dataloaders (e.g. LibriSpeech test-clean
    and test-other).
    It contains all the common data pipeline modules used in ASR
    experiments, e.g.:
    - dynamic batch size,
    - bucketing samplers,
    - cut concatenation,
    - augmentation,
    - on-the-fly feature extraction
    This class should be derived for specific corpora used in ASR tasks.
    All cut manifests are read from the directory given by ``--feature-dir``.
    """
    @classmethod
    def add_arguments(cls, parser: argparse.ArgumentParser):
        """Register the ASR data options on ``parser``.
        The options of the parent class are added first; the options defined
        here are placed in their own argument group.
        Args:
          parser:
            The argument parser to extend.
        """
        super().add_arguments(parser)
        group = parser.add_argument_group(
            title="ASR data related options",
            description="These options are used for the preparation of "
            "PyTorch DataLoaders from Lhotse CutSet's -- they control the "
            "effective batch sizes, sampling strategies, applied data "
            "augmentations, etc.",
        )
        group.add_argument(
            "--full-libri",
            type=str2bool,
            default=True,
            help="When enabled, use 960h LibriSpeech. "
            "Otherwise, use 100h subset.",
        )
        group.add_argument(
            "--feature-dir",
            type=Path,
            default=Path("data/fbank"),
            help="Path to directory with train/valid/test cuts.",
        )
        # The value is a duration in seconds and the default is a float, so
        # parse it as a float; integer values such as "200" are still accepted.
        group.add_argument(
            "--max-duration",
            type=float,
            default=200.0,
            help="Maximum pooled recordings duration (seconds) in a "
            "single batch. You can reduce it if it causes CUDA OOM.",
        )
        group.add_argument(
            "--bucketing-sampler",
            type=str2bool,
            default=True,
            help="When enabled, the batches will come from buckets of "
            "similar duration (saves padding frames).",
        )
        group.add_argument(
            "--num-buckets",
            type=int,
            default=30,
            help="The number of buckets for the BucketingSampler "
            "(you might want to increase it for larger datasets).",
        )
        group.add_argument(
            "--concatenate-cuts",
            type=str2bool,
            default=False,
            help="When enabled, utterances (cuts) will be concatenated "
            "to minimize the amount of padding.",
        )
        group.add_argument(
            "--duration-factor",
            type=float,
            default=1.0,
            help="Determines the maximum duration of a concatenated cut "
            "relative to the duration of the longest cut in a batch.",
        )
        group.add_argument(
            "--gap",
            type=float,
            default=1.0,
            help="The amount of padding (in seconds) inserted between "
            "concatenated cuts. This padding is filled with noise when "
            "noise augmentation is used.",
        )
        group.add_argument(
            "--on-the-fly-feats",
            type=str2bool,
            default=False,
            help="When enabled, use on-the-fly cut mixing and feature "
            "extraction. Will drop existing precomputed feature manifests "
            "if available.",
        )
        group.add_argument(
            "--shuffle",
            type=str2bool,
            default=True,
            help="When enabled (=default), the examples will be "
            "shuffled for each epoch.",
        )
        group.add_argument(
            "--return-cuts",
            type=str2bool,
            default=True,
            help="When enabled, each batch will have the "
            "field: batch['supervisions']['cut'] with the cuts that "
            "were used to construct it.",
        )
        group.add_argument(
            "--num-workers",
            type=int,
            default=2,
            help="The number of training dataloader workers that "
            "collect the batches.",
        )
    def train_dataloaders(self) -> DataLoader:
        """Build the training dataloader.
        Musan noise is mixed into the cuts, optional cut concatenation is
        applied before the mixing, and SpecAugment is applied to the features.
        Returns:
          Return a DataLoader over the training cuts.
        """
        logging.info("About to get train cuts")
        cuts_train = self.train_cuts()
        logging.info("About to get Musan cuts")
        cuts_musan = load_manifest(self.args.feature_dir / "cuts_musan.json.gz")
        logging.info("About to create train dataset")
        transforms = [
            CutMix(cuts=cuts_musan, prob=0.5, snr=(10, 20), preserve_id=True)
        ]
        if self.args.concatenate_cuts:
            logging.info(
                f"Using cut concatenation with duration factor "
                f"{self.args.duration_factor} and gap {self.args.gap}."
            )
            # Cut concatenation should be the first transform in the list,
            # so that if we e.g. mix noise in, it will fill the gaps between
            # different utterances.
            transforms = [
                CutConcatenate(
                    duration_factor=self.args.duration_factor, gap=self.args.gap
                )
            ] + transforms
        input_transforms = [
            SpecAugment(
                num_frame_masks=2,
                features_mask_size=27,
                num_feature_masks=2,
                frames_mask_size=100,
            )
        ]
        # Build the dataset exactly once, picking the input strategy up front
        # (same structure as in valid_dataloaders).
        if self.args.on_the_fly_feats:
            # NOTE: the PerturbSpeed transform should be added only if we
            # remove it from data prep stage.
            # Add on-the-fly speed perturbation; since originally it would
            # have increased epoch size by 3, we will apply prob 2/3 and use
            # 3x more epochs.
            # Speed perturbation probably should come first before
            # concatenation, but in principle the transforms order doesn't have
            # to be strict (e.g. could be randomized)
            # transforms = [PerturbSpeed(factors=[0.9, 1.1], p=2/3)] + transforms   # noqa
            # Drop feats to be on the safe side.
            train = K2SpeechRecognitionDataset(
                cut_transforms=transforms,
                input_strategy=OnTheFlyFeatures(
                    Fbank(FbankConfig(num_mel_bins=80))
                ),
                input_transforms=input_transforms,
                return_cuts=self.args.return_cuts,
            )
        else:
            train = K2SpeechRecognitionDataset(
                cut_transforms=transforms,
                input_transforms=input_transforms,
                return_cuts=self.args.return_cuts,
            )
        if self.args.bucketing_sampler:
            logging.info("Using BucketingSampler.")
            train_sampler = BucketingSampler(
                cuts_train,
                max_duration=self.args.max_duration,
                shuffle=self.args.shuffle,
                num_buckets=self.args.num_buckets,
                bucket_method="equal_duration",
                drop_last=True,
            )
        else:
            logging.info("Using SingleCutSampler.")
            train_sampler = SingleCutSampler(
                cuts_train,
                max_duration=self.args.max_duration,
                shuffle=self.args.shuffle,
            )
        logging.info("About to create train dataloader")
        # batch_size=None: batches are formed by the sampler, not by the
        # DataLoader.
        train_dl = DataLoader(
            train,
            sampler=train_sampler,
            batch_size=None,
            num_workers=self.args.num_workers,
            persistent_workers=False,
        )
        return train_dl
    def valid_dataloaders(self) -> DataLoader:
        """Build the validation dataloader.
        No noise mixing or SpecAugment is applied; cuts are not shuffled.
        Returns:
          Return a DataLoader over the dev cuts.
        """
        logging.info("About to get dev cuts")
        cuts_valid = self.valid_cuts()
        transforms = []
        if self.args.concatenate_cuts:
            transforms = [
                CutConcatenate(
                    duration_factor=self.args.duration_factor, gap=self.args.gap
                )
            ] + transforms
        logging.info("About to create dev dataset")
        if self.args.on_the_fly_feats:
            validate = K2SpeechRecognitionDataset(
                cut_transforms=transforms,
                input_strategy=OnTheFlyFeatures(
                    Fbank(FbankConfig(num_mel_bins=80))
                ),
                return_cuts=self.args.return_cuts,
            )
        else:
            validate = K2SpeechRecognitionDataset(
                cut_transforms=transforms,
                return_cuts=self.args.return_cuts,
            )
        valid_sampler = SingleCutSampler(
            cuts_valid,
            max_duration=self.args.max_duration,
            shuffle=False,
        )
        logging.info("About to create dev dataloader")
        valid_dl = DataLoader(
            validate,
            sampler=valid_sampler,
            batch_size=None,
            num_workers=2,
            persistent_workers=False,
        )
        return valid_dl
    def test_dataloaders(self) -> Union[DataLoader, List[DataLoader]]:
        """Build one dataloader per test set.
        Returns:
          Return a list of DataLoaders if `test_cuts()` returns a list;
          otherwise return a single DataLoader.
        """
        cuts = self.test_cuts()
        is_list = isinstance(cuts, list)
        test_loaders = []
        if not is_list:
            cuts = [cuts]
        for cuts_test in cuts:
            logging.debug("About to create test dataset")
            test = K2SpeechRecognitionDataset(
                input_strategy=OnTheFlyFeatures(
                    Fbank(FbankConfig(num_mel_bins=80))
                )
                if self.args.on_the_fly_feats
                else PrecomputedFeatures(),
                return_cuts=self.args.return_cuts,
            )
            sampler = SingleCutSampler(
                cuts_test, max_duration=self.args.max_duration
            )
            logging.debug("About to create test dataloader")
            test_dl = DataLoader(
                test, batch_size=None, sampler=sampler, num_workers=1
            )
            test_loaders.append(test_dl)
        if is_list:
            return test_loaders
        else:
            return test_loaders[0]
    @lru_cache()
    def train_cuts(self) -> CutSet:
        """Load the training cuts.
        The train-clean-100 cuts are always loaded; train-clean-360 and
        train-other-500 are appended when `--full-libri` is enabled.
        The result is cached by `lru_cache`.
        """
        logging.info("About to get train cuts")
        cuts_train = load_manifest(
            self.args.feature_dir / "cuts_train-clean-100.json.gz"
        )
        if self.args.full_libri:
            cuts_train = (
                cuts_train
                + load_manifest(
                    self.args.feature_dir / "cuts_train-clean-360.json.gz"
                )
                + load_manifest(
                    self.args.feature_dir / "cuts_train-other-500.json.gz"
                )
            )
        return cuts_train
    @lru_cache()
    def valid_cuts(self) -> CutSet:
        """Load the dev-clean and dev-other cuts, combined into one CutSet.
        The result is cached by `lru_cache`.
        """
        logging.info("About to get dev cuts")
        cuts_valid = load_manifest(
            self.args.feature_dir / "cuts_dev-clean.json.gz"
        ) + load_manifest(self.args.feature_dir / "cuts_dev-other.json.gz")
        return cuts_valid
    @lru_cache()
    def test_cuts(self) -> List[CutSet]:
        """Load the test-clean and test-other cuts, in that order.
        The result is cached by `lru_cache`.
        """
        test_sets = ["test-clean", "test-other"]
        cuts = []
        for test_set in test_sets:
            logging.debug("About to get test cuts")
            cuts.append(
                load_manifest(
                    self.args.feature_dir / f"cuts_{test_set}.json.gz"
                )
            )
        return cuts
--- a/egs/librispeech/ASR/conformer_mmi/asr_datamodule.py
+++ b/egs/librispeech/ASR/conformer_mmi/asr_datamodule.py
@ -0,0 +1 @@
 ../conformer_ctc/asr_datamodule.py
--- a/egs/librispeech/ASR/prepare.sh
+++ b/egs/librispeech/ASR/prepare.sh
@ -60,8 +60,11 @@ log "dl_dir: $dl_dir"
 if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
  log "Stage -1: Download LM"
-  [ ! -e $dl_dir/lm ] && mkdir -p $dl_dir/lm
+  mkdir -p $dl_dir/lm
-  ./local/download_lm.py --out-dir=$dl_dir/lm
+  if [ ! -e $dl_dir/lm/.done ]; then
    ./local/download_lm.py --out-dir=$dl_dir/lm
    touch $dl_dir/lm/.done
  fi
 fi
 if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
@ -91,7 +94,10 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
  # We assume that you have downloaded the LibriSpeech corpus
  # to $dl_dir/LibriSpeech
  mkdir -p data/manifests
-  lhotse prepare librispeech -j $nj $dl_dir/LibriSpeech data/manifests
+  if [ ! -e data/manifests/.librispeech.done ]; then
    lhotse prepare librispeech -j $nj $dl_dir/LibriSpeech data/manifests
    touch data/manifests/.librispeech.done
  fi
 fi
 if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
@ -99,19 +105,28 @@ if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
  # We assume that you have downloaded the musan corpus
  # to data/musan
  mkdir -p data/manifests
-  lhotse prepare musan $dl_dir/musan data/manifests
+  if [ ! -e data/manifests/.musan.done ]; then
    lhotse prepare musan $dl_dir/musan data/manifests
    touch data/manifests/.musan.done
  fi
 fi
 if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
  log "Stage 3: Compute fbank for librispeech"
  mkdir -p data/fbank
-  ./local/compute_fbank_librispeech.py
+  if [ ! -e data/fbank/.librispeech.done ]; then
    ./local/compute_fbank_librispeech.py
    touch data/fbank/.librispeech.done
  fi
 fi
 if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
  log "Stage 4: Compute fbank for musan"
  mkdir -p data/fbank
-  ./local/compute_fbank_musan.py
+  if [ ! -e data/fbank/.musan.done ]; then
    ./local/compute_fbank_musan.py
    touch data/fbank/.musan.done
  fi
 fi
 if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
--- a/egs/librispeech/ASR/pruned_transducer_stateless/beam_search.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless/beam_search.py
@ -17,10 +17,91 @@
 from dataclasses import dataclass
 from typing import Dict, List, Optional
-import numpy as np
+import k2
 import torch
 from model import Transducer
 from icefall.decode import one_best_decoding
 from icefall.utils import get_texts
 def fast_beam_search(
    model: Transducer,
    decoding_graph: k2.Fsa,
    encoder_out: torch.Tensor,
    encoder_out_lens: torch.Tensor,
    beam: float,
    max_states: int,
    max_contexts: int,
 ) -> List[List[int]]:
    """It limits the maximum number of symbols per frame to 1.
    Args:
      model:
        An instance of `Transducer`.
      decoding_graph:
        Decoding graph used for decoding, may be a TrivialGraph or a HLG.
      encoder_out:
        A tensor of shape (N, T, C) from the encoder.
      encoder_out_lens:
        A tensor of shape (N,) containing the number of frames in `encoder_out`
        before padding.
      beam:
        Beam value, similar to the beam used in Kaldi.
      max_states:
        Max states per stream per frame.
      max_contexts:
        Max contexts per stream per frame.
    Returns:
      Return the decoded result.
    Raises:
      AssertionError: If `encoder_out` is not a 3-D tensor.
    """
    assert encoder_out.ndim == 3
    context_size = model.decoder.context_size
    vocab_size = model.decoder.vocab_size
    # The channel dimension is not needed here.
    B, T, _ = encoder_out.shape
    config = k2.RnntDecodingConfig(
        vocab_size=vocab_size,
        decoder_history_len=context_size,
        beam=beam,
        max_contexts=max_contexts,
        max_states=max_states,
    )
    # One decoding stream per utterance; all of them use the same graph.
    individual_streams = [
        k2.RnntDecodingStream(decoding_graph) for _ in range(B)
    ]
    decoding_streams = k2.RnntDecodingStreams(individual_streams, config)
    for t in range(T):
        # shape is a RaggedShape of shape (B, context)
        # contexts is a Tensor of shape (shape.NumElements(), context_size)
        shape, contexts = decoding_streams.get_contexts()
        # `nn.Embedding()` in torch below v1.7.1 supports only torch.int64
        contexts = contexts.to(torch.int64)
        # decoder_out is of shape (shape.NumElements(), 1, decoder_out_dim)
        decoder_out = model.decoder(contexts, need_pad=False)
        # current_encoder_out is of shape
        # (shape.NumElements(), 1, encoder_out_dim)
        # Frame t of each utterance is repeated once per active context of
        # that utterance, so that it lines up with decoder_out.
        # fmt: off
        current_encoder_out = torch.index_select(
            encoder_out[:, t:t + 1, :], 0, shape.row_ids(1)
        )
        # fmt: on
        logits = model.joiner(
            current_encoder_out.unsqueeze(2), decoder_out.unsqueeze(1)
        )
        # Drop the two singleton dims added for the joiner.
        logits = logits.squeeze(1).squeeze(1)
        log_probs = logits.log_softmax(dim=-1)
        decoding_streams.advance(log_probs)
    decoding_streams.terminate_and_flush_to_streams()
    lattice = decoding_streams.format_output(encoder_out_lens.tolist())
    best_path = one_best_decoding(lattice)
    hyps = get_texts(best_path)
    return hyps
 def greedy_search(
    model: Transducer, encoder_out: torch.Tensor, max_sym_per_frame: int
@ -48,7 +129,7 @@ def greedy_search(
    device = model.device
    decoder_input = torch.tensor(
-        [blank_id] * context_size, device=device
+        [blank_id] * context_size, device=device, dtype=torch.int64
    ).reshape(1, context_size)
    decoder_out = model.decoder(decoder_input, need_pad=False)
@ -103,8 +184,9 @@ class Hypothesis:
    # Newly predicted tokens are appended to `ys`.
    ys: List[int]
-    # The log prob of ys
+    # The log prob of ys.
-    log_prob: float
+    # It contains only one entry.
    log_prob: torch.Tensor
    @property
    def key(self) -> str:
@ -113,7 +195,7 @@ class Hypothesis:
 class HypothesisList(object):
-    def __init__(self, data: Optional[Dict[str, Hypothesis]] = None):
+    def __init__(self, data: Optional[Dict[str, Hypothesis]] = None) -> None:
        """
        Args:
          data:
@ -125,10 +207,10 @@ class HypothesisList(object):
            self._data = data
    @property
-    def data(self):
+    def data(self) -> Dict[str, Hypothesis]:
        return self._data
-    def add(self, hyp: Hypothesis):
+    def add(self, hyp: Hypothesis) -> None:
        """Add a Hypothesis to `self`.
        If `hyp` already exists in `self`, its probability is updated using
@ -140,8 +222,10 @@ class HypothesisList(object):
        """
        key = hyp.key
        if key in self:
-            old_hyp = self._data[key]
+            old_hyp = self._data[key]  # shallow copy
-            old_hyp.log_prob = np.logaddexp(old_hyp.log_prob, hyp.log_prob)
+            torch.logaddexp(
                old_hyp.log_prob, hyp.log_prob, out=old_hyp.log_prob
            )
        else:
            self._data[key] = hyp
@ -153,7 +237,8 @@ class HypothesisList(object):
          length_norm:
            If True, the `log_prob` of a hypothesis is normalized by the
            number of tokens in it.
-
+        Returns:
          Return the hypothesis that has the largest `log_prob`.
        """
        if length_norm:
            return max(
@ -165,6 +250,9 @@ class HypothesisList(object):
    def remove(self, hyp: Hypothesis) -> None:
        """Remove a given hypothesis.
        Caution:
          `self` is modified **in-place**.
        Args:
          hyp:
            The hypothesis to be removed from `self`.
@ -175,7 +263,7 @@ class HypothesisList(object):
        assert key in self, f"{key} does not exist"
        del self._data[key]
-    def filter(self, threshold: float) -> "HypothesisList":
+    def filter(self, threshold: torch.Tensor) -> "HypothesisList":
        """Remove all Hypotheses whose log_prob is less than threshold.
        Caution:
@ -183,10 +271,10 @@ class HypothesisList(object):
        Returns:
          Return a new HypothesisList containing all hypotheses from `self`
-          that have `log_prob` being greater than the given `threshold`.
+          with `log_prob` being greater than the given `threshold`.
        """
        ans = HypothesisList()
-        for key, hyp in self._data.items():
+        for _, hyp in self._data.items():
            if hyp.log_prob > threshold:
                ans.add(hyp)  # shallow copy
        return ans
@ -216,6 +304,106 @@ class HypothesisList(object):
        return ", ".join(s)
 def modified_beam_search(
    model: Transducer,
    encoder_out: torch.Tensor,
    beam: int = 4,
 ) -> List[int]:
    """Beam search that emits at most one symbol per frame.

    For every frame, all active hypotheses are run through the decoder and
    the joiner in a single batch, and the `beam` best (hypothesis, token)
    pairs are kept as the active hypotheses of the next frame.

    Args:
      model:
        An instance of `Transducer`.
      encoder_out:
        A tensor of shape (N, T, C) from the encoder. Support only N==1 for now.
      beam:
        Beam size.
    Returns:
      Return the decoded result.
    """
    assert encoder_out.ndim == 3
    # support only batch_size == 1 for now
    assert encoder_out.size(0) == 1, encoder_out.size(0)

    decoder = model.decoder
    blank_id = decoder.blank_id
    context_size = decoder.context_size
    device = model.device
    num_frames = encoder_out.size(1)

    # Start from a single hypothesis holding only the blank context.
    cur_hyps = HypothesisList()
    cur_hyps.add(
        Hypothesis(
            ys=[blank_id] * context_size,
            log_prob=torch.zeros(1, dtype=torch.float32, device=device),
        )
    )

    for t in range(num_frames):
        frame_end = t + 1
        # frame is of shape (1, 1, 1, encoder_out_dim)
        frame = encoder_out[:, t:frame_end, :].unsqueeze(2)

        prev_hyps = list(cur_hyps)
        cur_hyps = HypothesisList()

        # prev_scores is of shape (num_hyps, 1)
        prev_scores = torch.cat(
            [h.log_prob.reshape(1, 1) for h in prev_hyps]
        )

        # contexts is of shape (num_hyps, context_size)
        contexts = torch.tensor(
            [h.ys[-context_size:] for h in prev_hyps],
            device=device,
            dtype=torch.int64,
        )

        # decoder_out is of shape (num_hyps, 1, 1, decoder_output_dim)
        decoder_out = model.decoder(contexts, need_pad=False).unsqueeze(1)

        # Repeat the frame for every hypothesis:
        # (num_hyps, 1, 1, encoder_out_dim)
        frame = frame.expand(decoder_out.size(0), 1, 1, -1)

        # The joiner returns (num_hyps, 1, 1, vocab_size); drop the two
        # singleton dims to get (num_hyps, vocab_size).
        logits = model.joiner(frame, decoder_out)
        logits = logits.squeeze(1).squeeze(1)
        vocab_size = logits.size(-1)

        # Total score of extending each hypothesis with each token,
        # flattened so a single topk ranks all (hypothesis, token) pairs.
        scores = logits.log_softmax(dim=-1)
        scores.add_(prev_scores)
        scores = scores.reshape(-1)

        best_scores, flat_indexes = scores.topk(beam)

        # Decompose each flat index into (index into prev_hyps, token id).
        hyp_indexes = (flat_indexes // vocab_size).tolist()
        token_indexes = (flat_indexes % vocab_size).tolist()

        for k, (hyp_index, token) in enumerate(
            zip(hyp_indexes, token_indexes)
        ):
            # Copy the token history so the parent hypothesis is untouched.
            extended_ys = list(prev_hyps[hyp_index].ys)
            if token != blank_id:
                extended_ys.append(token)
            cur_hyps.add(Hypothesis(ys=extended_ys, log_prob=best_scores[k]))

    winner = cur_hyps.get_most_probable(length_norm=True)
    # [context_size:] to remove blanks
    return winner.ys[context_size:]
 def beam_search(
    model: Transducer,
    encoder_out: torch.Tensor,
@ -246,7 +434,9 @@ def beam_search(
    device = model.device
    decoder_input = torch.tensor(
-        [blank_id] * context_size, device=device
+        [blank_id] * context_size,
        device=device,
        dtype=torch.int64,
    ).reshape(1, context_size)
    decoder_out = model.decoder(decoder_input, need_pad=False)
@ -283,7 +473,9 @@ def beam_search(
            if cached_key not in decoder_cache:
                decoder_input = torch.tensor(
-                    [y_star.ys[-context_size:]], device=device
+                    [y_star.ys[-context_size:]],
                    device=device,
                    dtype=torch.int64,
                ).reshape(1, context_size)
                decoder_out = model.decoder(decoder_input, need_pad=False)
@ -297,7 +489,7 @@ def beam_search(
                    current_encoder_out, decoder_out.unsqueeze(1)
                )
-                # TODO(fangjun): Cache the blank posterior
+                # TODO(fangjun): Scale the blank posterior
                log_prob = logits.log_softmax(dim=-1)
                # log_prob is (1, 1, 1, vocab_size)
@ -309,7 +501,7 @@ def beam_search(
            # First, process the blank symbol
            skip_log_prob = log_prob[blank_id]
-            new_y_star_log_prob = y_star.log_prob + skip_log_prob.item()
+            new_y_star_log_prob = y_star.log_prob + skip_log_prob
            # ys[:] returns a copy of ys
            B.add(Hypothesis(ys=y_star.ys[:], log_prob=new_y_star_log_prob))
--- a/egs/librispeech/ASR/pruned_transducer_stateless/decode.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless/decode.py
@ -33,6 +33,26 @@ Usage:
        --max-duration 100 \
        --decoding-method beam_search \
        --beam-size 4
 (3) modified beam search
 ./pruned_transducer_stateless/decode.py \
        --epoch 28 \
        --avg 15 \
        --exp-dir ./pruned_transducer_stateless/exp \
        --max-duration 100 \
        --decoding-method modified_beam_search \
        --beam-size 4
 (4) fast beam search
 ./pruned_transducer_stateless/decode.py \
        --epoch 28 \
        --avg 15 \
        --exp-dir ./pruned_transducer_stateless/exp \
        --max-duration 1500 \
        --decoding-method fast_beam_search \
        --beam 4 \
        --max-contexts 4 \
        --max-states 8
 """
@ -40,20 +60,26 @@ import argparse
 import logging
 from collections import defaultdict
 from pathlib import Path
-from typing import Dict, List, Tuple
+from typing import Dict, List, Optional, Tuple
 import k2
 import sentencepiece as spm
 import torch
 import torch.nn as nn
 from asr_datamodule import LibriSpeechAsrDataModule
-from beam_search import beam_search, greedy_search
+from beam_search import (
-from conformer import Conformer
+    beam_search,
-from decoder import Decoder
+    fast_beam_search,
-from joiner import Joiner
+    greedy_search,
-from model import Transducer
+    modified_beam_search,
 )
 from train import get_params, get_transducer_model
-from icefall.checkpoint import average_checkpoints, load_checkpoint
+from icefall.checkpoint import (
-from icefall.env import get_env_info
+    average_checkpoints,
    find_checkpoints,
    load_checkpoint,
 )
 from icefall.utils import (
    AttributeDict,
    setup_logger,
@ -83,6 +109,17 @@ def get_parser():
        "'--epoch'. ",
    )
    parser.add_argument(
        "--avg-last-n",
        type=int,
        default=0,
        help="""If positive, --epoch and --avg are ignored and it
        will use the last n checkpoints exp_dir/checkpoint-xxx.pt
        where xxx is the number of processed batches while
        saving that checkpoint.
        """,
    )
    parser.add_argument(
        "--exp-dir",
        type=str,
@ -104,6 +141,8 @@ def get_parser():
        help="""Possible values are:
          - greedy_search
          - beam_search
          - modified_beam_search
          - fast_beam_search
        """,
    )
@ -111,7 +150,35 @@ def get_parser():
        "--beam-size",
        type=int,
        default=4,
-        help="Used only when --decoding-method is beam_search",
+        help="""An interger indicating how many candidates we will keep for each
        frame. Used only when --decoding-method is beam_search or
        modified_beam_search.""",
    )
    parser.add_argument(
        "--beam",
        type=float,
        default=4,
        help="""A floating point value to calculate the cutoff score during beam
        search (i.e., `cutoff = max-score - beam`), which is the same as the
        `beam` in Kaldi.
        Used only when --decoding-method is fast_beam_search""",
    )
    parser.add_argument(
        "--max-contexts",
        type=int,
        default=4,
        help="""Used only when --decoding-method is
        fast_beam_search""",
    )
    parser.add_argument(
        "--max-states",
        type=int,
        default=8,
        help="""Used only when --decoding-method is
        fast_beam_search""",
    )
    parser.add_argument(
@ -125,83 +192,19 @@ def get_parser():
        "--max-sym-per-frame",
        type=int,
        default=3,
-        help="Maximum number of symbols per frame",
+        help="""Maximum number of symbols per frame.
        Used only when --decoding_method is greedy_search""",
    )
    return parser
 def get_params() -> AttributeDict:
    """Return the fixed model hyper-parameters together with environment info."""
    # parameters for conformer
    conformer_config = {
        "feature_dim": 80,
        "subsampling_factor": 4,
        "attention_dim": 512,
        "nhead": 8,
        "dim_feedforward": 2048,
        "num_encoder_layers": 12,
        "vgg_frontend": False,
    }
    # parameters for decoder
    decoder_config = {"embedding_dim": 512}
    return AttributeDict(
        {
            **conformer_config,
            **decoder_config,
            "env_info": get_env_info(),
        }
    )
 def get_encoder_model(params: AttributeDict) -> nn.Module:
    """Build the `Conformer` encoder from the settings stored in `params`."""
    # TODO: We can add an option to switch between Conformer and Transformer
    return Conformer(
        num_features=params.feature_dim,
        output_dim=params.vocab_size,
        subsampling_factor=params.subsampling_factor,
        d_model=params.attention_dim,
        nhead=params.nhead,
        dim_feedforward=params.dim_feedforward,
        num_encoder_layers=params.num_encoder_layers,
        vgg_frontend=params.vgg_frontend,
    )
 def get_decoder_model(params: AttributeDict) -> nn.Module:
    """Build the `Decoder` from the settings stored in `params`."""
    return Decoder(
        vocab_size=params.vocab_size,
        embedding_dim=params.embedding_dim,
        blank_id=params.blank_id,
        context_size=params.context_size,
    )
 def get_joiner_model(params: AttributeDict) -> nn.Module:
    """Build the `Joiner` from the settings stored in `params`."""
    return Joiner(
        input_dim=params.vocab_size,
        inner_dim=params.embedding_dim,
        output_dim=params.vocab_size,
    )
 def get_transducer_model(params: AttributeDict) -> nn.Module:
    """Assemble a `Transducer` from a freshly built encoder, decoder and joiner."""
    # Keyword arguments are evaluated left to right, so the sub-modules are
    # still created in the order encoder -> decoder -> joiner.
    return Transducer(
        encoder=get_encoder_model(params),
        decoder=get_decoder_model(params),
        joiner=get_joiner_model(params),
    )
 def decode_one_batch(
    params: AttributeDict,
    model: nn.Module,
    sp: spm.SentencePieceProcessor,
    batch: dict,
    decoding_graph: Optional[k2.Fsa] = None,
 ) -> Dict[str, List[List[str]]]:
    """Decode one batch and return the result in a dict. The dict has the
    following format:
@ -224,6 +227,9 @@ def decode_one_batch(
        It is the return value from iterating
        `lhotse.dataset.K2SpeechRecognitionDataset`. See its documentation
        for the format of the `batch`.
      decoding_graph:
        The decoding graph. Can be either a `k2.trivial_graph` or HLG. Used
        only when --decoding_method is fast_beam_search.
    Returns:
      Return the decoding result. See above description for the format of
      the returned dict.
@ -242,32 +248,62 @@ def decode_one_batch(
        x=feature, x_lens=feature_lens
    )
    hyps = []
    batch_size = encoder_out.size(0)
-    for i in range(batch_size):
+    if params.decoding_method == "fast_beam_search":
-        # fmt: off
+        hyp_tokens = fast_beam_search(
-        encoder_out_i = encoder_out[i:i+1, :encoder_out_lens[i]]
+            model=model,
-        # fmt: on
+            decoding_graph=decoding_graph,
-        if params.decoding_method == "greedy_search":
+            encoder_out=encoder_out,
-            hyp = greedy_search(
+            encoder_out_lens=encoder_out_lens,
-                model=model,
+            beam=params.beam,
-                encoder_out=encoder_out_i,
+            max_contexts=params.max_contexts,
-                max_sym_per_frame=params.max_sym_per_frame,
+            max_states=params.max_states,
-            )
+        )
-        elif params.decoding_method == "beam_search":
+        for hyp in sp.decode(hyp_tokens):
-            hyp = beam_search(
+            hyps.append(hyp.split())
-                model=model, encoder_out=encoder_out_i, beam=params.beam_size
+    else:
-            )
+        batch_size = encoder_out.size(0)
-        else:
+
-            raise ValueError(
+        for i in range(batch_size):
-                f"Unsupported decoding method: {params.decoding_method}"
+            # fmt: off
-            )
+            encoder_out_i = encoder_out[i:i+1, :encoder_out_lens[i]]
-        hyps.append(sp.decode(hyp).split())
+            # fmt: on
            if params.decoding_method == "greedy_search":
                hyp = greedy_search(
                    model=model,
                    encoder_out=encoder_out_i,
                    max_sym_per_frame=params.max_sym_per_frame,
                )
            elif params.decoding_method == "beam_search":
                hyp = beam_search(
                    model=model,
                    encoder_out=encoder_out_i,
                    beam=params.beam_size,
                )
            elif params.decoding_method == "modified_beam_search":
                hyp = modified_beam_search(
                    model=model,
                    encoder_out=encoder_out_i,
                    beam=params.beam_size,
                )
            else:
                raise ValueError(
                    f"Unsupported decoding method: {params.decoding_method}"
                )
            hyps.append(sp.decode(hyp).split())
    if params.decoding_method == "greedy_search":
        return {"greedy_search": hyps}
    elif params.decoding_method == "fast_beam_search":
        return {
            (
                f"beam_{params.beam}_"
                f"max_contexts_{params.max_contexts}_"
                f"max_states_{params.max_states}"
            ): hyps
        }
    else:
-        return {f"beam_{params.beam_size}": hyps}
+        return {f"beam_size_{params.beam_size}": hyps}
 def decode_dataset(
@ -275,6 +311,7 @@ def decode_dataset(
    params: AttributeDict,
    model: nn.Module,
    sp: spm.SentencePieceProcessor,
    decoding_graph: Optional[k2.Fsa] = None,
 ) -> Dict[str, List[Tuple[List[str], List[str]]]]:
    """Decode dataset.
@ -287,6 +324,9 @@ def decode_dataset(
        The neural model.
      sp:
        The BPE model.
      decoding_graph:
        The decoding graph. Can be either a `k2.trivial_graph` or HLG. Used
        only when --decoding_method is fast_beam_search.
    Returns:
      Return a dict, whose key may be "greedy_search" if greedy search
      is used, or it may be "beam_size_7" if beam size of 7 is used.
@ -314,6 +354,7 @@ def decode_dataset(
            params=params,
            model=model,
            sp=sp,
            decoding_graph=decoding_graph,
            batch=batch,
        )
@ -391,11 +432,20 @@ def main():
    params = get_params()
    params.update(vars(args))
-    assert params.decoding_method in ("greedy_search", "beam_search")
+    assert params.decoding_method in (
        "greedy_search",
        "beam_search",
        "fast_beam_search",
        "modified_beam_search",
    )
    params.res_dir = params.exp_dir / params.decoding_method
    params.suffix = f"epoch-{params.epoch}-avg-{params.avg}"
-    if params.decoding_method == "beam_search":
+    if "fast_beam_search" in params.decoding_method:
        params.suffix += f"-beam-{params.beam}"
        params.suffix += f"-max-contexts-{params.max_contexts}"
        params.suffix += f"-max-states-{params.max_states}"
    elif "beam_search" in params.decoding_method:
        params.suffix += f"-beam-{params.beam_size}"
    else:
        params.suffix += f"-context-{params.context_size}"
@ -422,7 +472,12 @@ def main():
    logging.info("About to create model")
    model = get_transducer_model(params)
-    if params.avg == 1:
+    if params.avg_last_n > 0:
        filenames = find_checkpoints(params.exp_dir)[: params.avg_last_n]
        logging.info(f"averaging {filenames}")
        model.to(device)
        model.load_state_dict(average_checkpoints(filenames, device=device))
    elif params.avg == 1:
        load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
    else:
        start = params.epoch - params.avg + 1
@ -438,6 +493,11 @@ def main():
    model.eval()
    model.device = device
    if params.decoding_method == "fast_beam_search":
        decoding_graph = k2.trivial_graph(params.vocab_size - 1, device=device)
    else:
        decoding_graph = None
    num_param = sum([p.numel() for p in model.parameters()])
    logging.info(f"Number of model parameters: {num_param}")
@ -458,6 +518,7 @@ def main():
            params=params,
            model=model,
            sp=sp,
            decoding_graph=decoding_graph,
        )
        save_results(
@ -469,8 +530,5 @@ def main():
    logging.info("Done!")
 torch.set_num_threads(1)
 torch.set_num_interop_threads(1)
 if __name__ == "__main__":
    main()
--- a/egs/librispeech/ASR/pruned_transducer_stateless/decoder.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless/decoder.py
@ -61,6 +61,7 @@ class Decoder(nn.Module):
        assert context_size >= 1, context_size
        self.context_size = context_size
        self.vocab_size = vocab_size
        if context_size > 1:
            self.conv = nn.Conv1d(
                in_channels=embedding_dim,
--- a/egs/librispeech/ASR/pruned_transducer_stateless/export.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless/export.py
@ -39,7 +39,7 @@ you can do:
        --exp-dir ./pruned_transducer_stateless/exp \
        --epoch 9999 \
        --avg 1 \
-        --max-duration 1 \
+        --max-duration 100 \
        --bpe-model data/lang_bpe_500/bpe.model
 """
@ -49,15 +49,10 @@ from pathlib import Path
 import sentencepiece as spm
 import torch
-import torch.nn as nn
+from train import get_params, get_transducer_model
 from conformer import Conformer
 from decoder import Decoder
 from joiner import Joiner
 from model import Transducer
 from icefall.checkpoint import average_checkpoints, load_checkpoint
-from icefall.env import get_env_info
+from icefall.utils import str2bool
 from icefall.utils import AttributeDict, str2bool
 def get_parser():
@ -117,71 +112,6 @@ def get_parser():
    return parser
 def get_params() -> AttributeDict:
    """Return the fixed model hyper-parameters together with environment info."""
    # parameters for conformer
    conformer_config = {
        "feature_dim": 80,
        "subsampling_factor": 4,
        "attention_dim": 512,
        "nhead": 8,
        "dim_feedforward": 2048,
        "num_encoder_layers": 12,
        "vgg_frontend": False,
    }
    # parameters for decoder
    decoder_config = {"embedding_dim": 512}
    return AttributeDict(
        {
            **conformer_config,
            **decoder_config,
            "env_info": get_env_info(),
        }
    )
 def get_encoder_model(params: AttributeDict) -> nn.Module:
    """Build the `Conformer` encoder from the settings stored in `params`."""
    return Conformer(
        num_features=params.feature_dim,
        output_dim=params.vocab_size,
        subsampling_factor=params.subsampling_factor,
        d_model=params.attention_dim,
        nhead=params.nhead,
        dim_feedforward=params.dim_feedforward,
        num_encoder_layers=params.num_encoder_layers,
        vgg_frontend=params.vgg_frontend,
    )
 def get_decoder_model(params: AttributeDict) -> nn.Module:
    """Build the `Decoder` from the settings stored in `params`."""
    return Decoder(
        vocab_size=params.vocab_size,
        embedding_dim=params.embedding_dim,
        blank_id=params.blank_id,
        context_size=params.context_size,
    )
 def get_joiner_model(params: AttributeDict) -> nn.Module:
    """Build the `Joiner` from the settings stored in `params`."""
    return Joiner(
        input_dim=params.vocab_size,
        inner_dim=params.embedding_dim,
        output_dim=params.vocab_size,
    )
 def get_transducer_model(params: AttributeDict) -> nn.Module:
    """Assemble a `Transducer` from a freshly built encoder, decoder and joiner."""
    # Keyword arguments are evaluated left to right, so the sub-modules are
    # still created in the order encoder -> decoder -> joiner.
    return Transducer(
        encoder=get_encoder_model(params),
        decoder=get_decoder_model(params),
        joiner=get_joiner_model(params),
    )
 def main():
    args = get_parser().parse_args()
    args.exp_dir = Path(args.exp_dir)
--- a/egs/librispeech/ASR/pruned_transducer_stateless/pretrained.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless/pretrained.py
@ -49,17 +49,10 @@ from typing import List
 import kaldifeat
 import sentencepiece as spm
 import torch
 import torch.nn as nn
 import torchaudio
-from beam_search import beam_search, greedy_search
+from beam_search import beam_search, greedy_search, modified_beam_search
 from conformer import Conformer
 from decoder import Decoder
 from joiner import Joiner
 from model import Transducer
 from torch.nn.utils.rnn import pad_sequence
-
+from train import get_params, get_transducer_model
 from icefall.env import get_env_info
 from icefall.utils import AttributeDict
 def get_parser():
@ -91,6 +84,7 @@ def get_parser():
        help="""Possible values are:
          - greedy_search
          - beam_search
          - modified_beam_search
        """,
    )
@ -104,11 +98,18 @@ def get_parser():
        "The sample rate has to be 16kHz.",
    )
    parser.add_argument(
        "--sample-rate",
        type=int,
        default=16000,
        help="The sample rate of the input sound file",
    )
    parser.add_argument(
        "--beam-size",
        type=int,
        default=4,
-        help="Used only when --method is beam_search",
+        help="Used only when --method is beam_search and modified_beam_search",
    )
    parser.add_argument(
@ -130,72 +131,6 @@ def get_parser():
    return parser
 def get_params() -> AttributeDict:
    """Return the fixed audio/model hyper-parameters together with environment info."""
    audio_config = {"sample_rate": 16000}
    # parameters for conformer
    conformer_config = {
        "feature_dim": 80,
        "subsampling_factor": 4,
        "attention_dim": 512,
        "nhead": 8,
        "dim_feedforward": 2048,
        "num_encoder_layers": 12,
        "vgg_frontend": False,
    }
    # parameters for decoder
    decoder_config = {"embedding_dim": 512}
    return AttributeDict(
        {
            **audio_config,
            **conformer_config,
            **decoder_config,
            "env_info": get_env_info(),
        }
    )
 def get_encoder_model(params: AttributeDict) -> nn.Module:
    """Build the `Conformer` encoder from the settings stored in `params`."""
    return Conformer(
        num_features=params.feature_dim,
        output_dim=params.vocab_size,
        subsampling_factor=params.subsampling_factor,
        d_model=params.attention_dim,
        nhead=params.nhead,
        dim_feedforward=params.dim_feedforward,
        num_encoder_layers=params.num_encoder_layers,
        vgg_frontend=params.vgg_frontend,
    )
 def get_decoder_model(params: AttributeDict) -> nn.Module:
    """Build the `Decoder` from the settings stored in `params`."""
    return Decoder(
        vocab_size=params.vocab_size,
        embedding_dim=params.embedding_dim,
        blank_id=params.blank_id,
        context_size=params.context_size,
    )
 def get_joiner_model(params: AttributeDict) -> nn.Module:
    """Build the `Joiner` from the settings stored in `params`."""
    return Joiner(
        input_dim=params.vocab_size,
        inner_dim=params.embedding_dim,
        output_dim=params.vocab_size,
    )
 def get_transducer_model(params: AttributeDict) -> nn.Module:
    """Assemble a `Transducer` from a freshly built encoder, decoder and joiner."""
    # Keyword arguments are evaluated left to right, so the sub-modules are
    # still created in the order encoder -> decoder -> joiner.
    return Transducer(
        encoder=get_encoder_model(params),
        decoder=get_decoder_model(params),
        joiner=get_joiner_model(params),
    )
 def read_sound_files(
    filenames: List[str], expected_sample_rate: float
 ) -> List[torch.Tensor]:
@ -220,6 +155,7 @@ def read_sound_files(
    return ans
@torch.no_grad()
 def main():
    parser = get_parser()
    args = parser.parse_args()
@ -278,10 +214,9 @@ def main():
    feature_lengths = torch.tensor(feature_lengths, device=device)
-    with torch.no_grad():
+    encoder_out, encoder_out_lens = model.encoder(
-        encoder_out, encoder_out_lens = model.encoder(
+        x=features, x_lens=feature_lengths
-            x=features, x_lens=feature_lengths
+    )
        )
    num_waves = encoder_out.size(0)
    hyps = []
@ -303,6 +238,10 @@ def main():
            hyp = beam_search(
                model=model, encoder_out=encoder_out_i, beam=params.beam_size
            )
        elif params.method == "modified_beam_search":
            hyp = modified_beam_search(
                model=model, encoder_out=encoder_out_i, beam=params.beam_size
            )
        else:
            raise ValueError(f"Unsupported method: {params.method}")
--- a/egs/librispeech/ASR/pruned_transducer_stateless/train.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless/train.py
@ -35,7 +35,7 @@ import argparse
 import logging
 from pathlib import Path
 from shutil import copyfile
-from typing import Optional, Tuple
+from typing import Any, Dict, Optional, Tuple
 import k2
 import sentencepiece as spm
@ -47,6 +47,7 @@ from conformer import Conformer
 from decoder import Decoder
 from joiner import Joiner
 from lhotse.cut import Cut
 from lhotse.dataset.sampling.base import CutSampler
 from lhotse.utils import fix_random_seed
 from model import Transducer
 from torch import Tensor
@ -55,8 +56,9 @@ from torch.nn.utils import clip_grad_norm_
 from torch.utils.tensorboard import SummaryWriter
 from transformer import Noam
-from icefall.checkpoint import load_checkpoint
+from icefall.checkpoint import load_checkpoint, remove_checkpoints
 from icefall.checkpoint import save_checkpoint as save_checkpoint_impl
 from icefall.checkpoint import save_checkpoint_with_global_batch_idx
 from icefall.dist import cleanup_dist, setup_dist
 from icefall.env import get_env_info
 from icefall.utils import (
@ -113,6 +115,15 @@ def get_parser():
        """,
    )
    parser.add_argument(
        "--start-batch",
        type=int,
        default=0,
        help="""If positive, --start-epoch is ignored and
        it loads the checkpoint from exp-dir/checkpoint-{start_batch}.pt
        """,
    )
    parser.add_argument(
        "--exp-dir",
        type=str,
@ -186,6 +197,30 @@ def get_parser():
        help="The seed for random generators intended for reproducibility",
    )
    parser.add_argument(
        "--save-every-n",
        type=int,
        default=8000,
        help="""Save checkpoint after processing this number of batches"
        periodically. We save checkpoint to exp-dir/ whenever
        params.batch_idx_train % save_every_n == 0. The checkpoint filename
        has the form: f'exp-dir/checkpoint-{params.batch_idx_train}.pt'
        Note: It also saves checkpoint to `exp-dir/epoch-xxx.pt` at the
        end of each epoch where `xxx` is the epoch number counting from 0.
        """,
    )
    parser.add_argument(
        "--keep-last-k",
        type=int,
        default=20,
        help="""Only keep this number of checkpoints on disk.
        For instance, if it is 3, there are only 3 checkpoints
        in the exp-dir with filenames `checkpoint-xxx.pt`.
        It does not affect checkpoints with name `epoch-xxx.pt`.
        """,
    )
    return parser
@ -314,15 +349,16 @@ def load_checkpoint_if_available(
    params: AttributeDict,
    model: nn.Module,
    optimizer: Optional[torch.optim.Optimizer] = None,
-    scheduler: Optional[torch.optim.lr_scheduler._LRScheduler] = None,
+) -> Optional[Dict[str, Any]]:
 ) -> None:
    """Load checkpoint from file.
-    If params.start_epoch is positive, it will load the checkpoint from
+    If params.start_batch is positive, it will load the checkpoint from
-    `params.start_epoch - 1`. Otherwise, this function does nothing.
+    `params.exp_dir/checkpoint-{params.start_batch}.pt`. Otherwise, if
    params.start_epoch is positive, it will load the checkpoint from
    `params.start_epoch - 1`.
-    Apart from loading state dict for `model`, `optimizer` and `scheduler`,
+    Apart from loading state dict for `model` and `optimizer` it also updates
-    it also updates `best_train_epoch`, `best_train_loss`, `best_valid_epoch`,
+    `best_train_epoch`, `best_train_loss`, `best_valid_epoch`,
    and `best_valid_loss` in `params`.
    Args:
@ -332,20 +368,22 @@ def load_checkpoint_if_available(
        The training model.
      optimizer:
        The optimizer that we are using.
      scheduler:
        The learning rate scheduler we are using.
    Returns:
-      Return None.
+      Return a dict containing previously saved training info.
    """
-    if params.start_epoch <= 0:
+    if params.start_batch > 0:
-        return
+        filename = params.exp_dir / f"checkpoint-{params.start_batch}.pt"
    elif params.start_epoch > 0:
        filename = params.exp_dir / f"epoch-{params.start_epoch-1}.pt"
    else:
        return None
    assert filename.is_file(), f"{filename} does not exist!"
    filename = params.exp_dir / f"epoch-{params.start_epoch-1}.pt"
    saved_params = load_checkpoint(
        filename,
        model=model,
        optimizer=optimizer,
        scheduler=scheduler,
    )
    keys = [
@ -354,10 +392,13 @@ def load_checkpoint_if_available(
        "batch_idx_train",
        "best_train_loss",
        "best_valid_loss",
        "cur_batch_idx",
    ]
    for k in keys:
        params[k] = saved_params[k]
    params["start_epoch"] = saved_params["cur_epoch"]
    return saved_params
@ -365,7 +406,7 @@ def save_checkpoint(
    params: AttributeDict,
    model: nn.Module,
    optimizer: Optional[torch.optim.Optimizer] = None,
-    scheduler: Optional[torch.optim.lr_scheduler._LRScheduler] = None,
+    sampler: Optional[CutSampler] = None,
    rank: int = 0,
 ) -> None:
    """Save model, optimizer, scheduler and training stats to file.
@ -375,6 +416,10 @@ def save_checkpoint(
        It is returned by :func:`get_params`.
      model:
        The training model.
      optimizer:
        The optimizer used in the training.
      sampler:
       The sampler for the training dataset.
    """
    if rank != 0:
        return
@ -384,7 +429,7 @@ def save_checkpoint(
        model=model,
        params=params,
        optimizer=optimizer,
-        scheduler=scheduler,
+        sampler=sampler,
        rank=rank,
    )
@ -500,6 +545,7 @@ def train_one_epoch(
    valid_dl: torch.utils.data.DataLoader,
    tb_writer: Optional[SummaryWriter] = None,
    world_size: int = 1,
    rank: int = 0,
 ) -> None:
    """Train the model for one epoch.
@ -522,6 +568,9 @@ def train_one_epoch(
        Writer to write log messages to tensorboard.
      world_size:
        Number of nodes in DDP training. If it is 1, DDP is disabled.
      rank:
        The rank of the node in DDP training. If no DDP is used, it should
        be set to 0.
    """
    model.train()
@ -566,7 +615,13 @@ def train_one_epoch(
        else:
            optimizer.step()
    cur_batch_idx = params.get("cur_batch_idx", 0)
    for batch_idx, batch in enumerate(train_dl):
        if batch_idx < cur_batch_idx:
            continue
        cur_batch_idx = batch_idx
        params.batch_idx_train += 1
        batch_size = len(batch["supervisions"]["text"])
@ -591,6 +646,27 @@ def train_one_epoch(
        optimizer.zero_grad()
        if (
            params.batch_idx_train > 0
            and params.batch_idx_train % params.save_every_n == 0
        ):
            params.cur_batch_idx = batch_idx
            save_checkpoint_with_global_batch_idx(
                out_dir=params.exp_dir,
                global_batch_idx=params.batch_idx_train,
                model=model,
                params=params,
                optimizer=optimizer,
                sampler=train_dl.sampler,
                rank=rank,
            )
            del params.cur_batch_idx
            remove_checkpoints(
                out_dir=params.exp_dir,
                topk=params.keep_last_k,
                rank=rank,
            )
        if batch_idx % params.log_interval == 0:
            logging.info(
                f"Epoch {params.cur_epoch}, "
@ -598,8 +674,6 @@ def train_one_epoch(
                f"tot_loss[{tot_loss}], batch size: {batch_size}"
            )
        if batch_idx % params.log_interval == 0:
            if tb_writer is not None:
                loss_info.write_summary(
                    tb_writer, "train/current_", params.batch_idx_train
@ -723,7 +797,14 @@ def run(rank, world_size, args):
    logging.info(f"After removing short and long utterances: {num_left}")
    logging.info(f"Removed {num_removed} utterances ({removed_percent:.5f}%)")
-    train_dl = librispeech.train_dataloaders(train_cuts)
+    if checkpoints and "sampler" in checkpoints:
        sampler_state_dict = checkpoints["sampler"]
    else:
        sampler_state_dict = None
    train_dl = librispeech.train_dataloaders(
        train_cuts, sampler_state_dict=sampler_state_dict
    )
    valid_cuts = librispeech.dev_clean_cuts()
    valid_cuts += librispeech.dev_other_cuts()
@ -762,12 +843,14 @@ def run(rank, world_size, args):
            valid_dl=valid_dl,
            tb_writer=tb_writer,
            world_size=world_size,
            rank=rank,
        )
        save_checkpoint(
            params=params,
            model=model,
            optimizer=optimizer,
            sampler=train_dl.sampler,
            rank=rank,
        )
--- a/egs/librispeech/ASR/tdnn_lstm_ctc/asr_datamodule.py
+++ b/egs/librispeech/ASR/tdnn_lstm_ctc/asr_datamodule.py
@ -1,4 +1,5 @@
 # Copyright      2021  Piotr Żelasko
 # Copyright      2022  Xiaomi Corporation     (Author: Mingshuang Luo)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
@ -16,9 +17,11 @@
 import argparse
 import inspect
 import logging
 from functools import lru_cache
 from pathlib import Path
 from typing import Any, Dict, Optional
 from lhotse import CutSet, Fbank, FbankConfig, load_manifest
 from lhotse.dataset import (
@ -179,15 +182,25 @@ class LibriSpeechAsrDataModule:
            "with training dataset. ",
        )
-    def train_dataloaders(self, cuts_train: CutSet) -> DataLoader:
+    def train_dataloaders(
-        logging.info("About to get Musan cuts")
+        self,
-        cuts_musan = load_manifest(
+        cuts_train: CutSet,
-            self.args.manifest_dir / "cuts_musan.json.gz"
+        sampler_state_dict: Optional[Dict[str, Any]] = None,
-        )
+    ) -> DataLoader:
-
+        """
        Args:
          cuts_train:
            CutSet for training.
          sampler_state_dict:
            The state dict for the training sampler.
        """
        transforms = []
        if self.args.enable_musan:
            logging.info("Enable MUSAN")
            logging.info("About to get Musan cuts")
            cuts_musan = load_manifest(
                self.args.manifest_dir / "cuts_musan.json.gz"
            )
            transforms.append(
                CutMix(
                    cuts=cuts_musan, prob=0.5, snr=(10, 20), preserve_id=True
@ -216,10 +229,20 @@ class LibriSpeechAsrDataModule:
            logging.info(
                f"Time warp factor: {self.args.spec_aug_time_warp_factor}"
            )
            # Set the value of num_frame_masks according to Lhotse's version.
            # In different Lhotse's versions, the default of num_frame_masks is
            # different.
            num_frame_masks = 10
            num_frame_masks_parameter = inspect.signature(
                SpecAugment.__init__
            ).parameters["num_frame_masks"]
            if num_frame_masks_parameter.default == 1:
                num_frame_masks = 2
            logging.info(f"Num frame mask: {num_frame_masks}")
            input_transforms.append(
                SpecAugment(
                    time_warp_factor=self.args.spec_aug_time_warp_factor,
-                    num_frame_masks=2,
+                    num_frame_masks=num_frame_masks,
                    features_mask_size=27,
                    num_feature_masks=2,
                    frames_mask_size=100,
@ -274,6 +297,10 @@ class LibriSpeechAsrDataModule:
            )
        logging.info("About to create train dataloader")
        if sampler_state_dict is not None:
            logging.info("Loading sampler state dict")
            train_sampler.load_state_dict(sampler_state_dict)
        train_dl = DataLoader(
            train,
            sampler=train_sampler,
--- a/egs/librispeech/ASR/transducer_stateless/README.md
+++ b/egs/librispeech/ASR/transducer_stateless/README.md
@ -20,3 +20,120 @@ export CUDA_VISIBLE_DEVICES="0,1,2,3"
  --max-duration 250 \
  --lr-factor 2.5
 ```
 ## How to get framewise token alignment
 Assume that you already have a trained model. If not, you can either
 train one by yourself or download a pre-trained model from hugging face:
 <https://huggingface.co/csukuangfj/icefall-asr-librispeech-transducer-stateless-multi-datasets-bpe-500-2022-03-01>
 **Caution**: If you are going to use your own trained model, remember
 to set `--modified-transducer-prob` to a nonzero value since the
 force alignment code assumes that `--max-sym-per-frame` is 1.
 The following shows how to get framewise token alignment using the above
 pre-trained model.
 ```bash
 git clone https://github.com/k2-fsa/icefall
 cd icefall/egs/librispeech/ASR
 mkdir tmp
 sudo apt-get install git-lfs
 git lfs install
 git clone https://huggingface.co/csukuangfj/icefall-asr-librispeech-transducer-stateless-multi-datasets-bpe-500-2022-03-01 ./tmp/
 ln -s $PWD/tmp/exp/pretrained.pt $PWD/tmp/exp/epoch-999.pt
 ./transducer_stateless/compute_ali.py \
        --exp-dir ./tmp/exp \
        --bpe-model ./tmp/data/lang_bpe_500/bpe.model \
        --epoch 999 \
        --avg 1 \
        --max-duration 100 \
        --dataset dev-clean \
        --out-dir data/ali
 ```
 After running the above commands, you will find the following two files
 in the folder `./data/ali`:
 ```
 -rw-r--r-- 1 xxx xxx 412K Mar  7 15:45 cuts_dev-clean.json.gz
 -rw-r--r-- 1 xxx xxx 2.9M Mar  7 15:45 token_ali_dev-clean.h5
 ```
 See `./test_compute_ali.py` for usage examples that show how to
 extract framewise token alignment information from the above
 two files.
 ## How to get word starting time from framewise token alignment
 Assume you have run the above commands to get framewise token alignment
 using a pre-trained model from `tmp/exp/epoch-999.pt`. You can use the following
 commands to obtain word starting time.
 ```bash
 ./transducer_stateless/test_compute_ali.py \
        --bpe-model ./tmp/data/lang_bpe_500/bpe.model \
        --ali-dir data/ali \
        --dataset dev-clean
 ```
 **Caution**: Since the frame shift is 10 ms and the subsampling factor
 of the model is 4, the time resolution is 0.04 seconds.
 **Note**: The script `test_compute_ali.py` is for illustration only
 and it processes only one batch and then exits.
 You will get the following output:
 ```
 5694-64029-0022-1998-0
 [('THE', '0.20'), ('LEADEN', '0.36'), ('HAIL', '0.72'), ('STORM', '1.00'), ('SWEPT', '1.48'), ('THEM', '1.88'), ('OFF', '2.00'), ('THE', '2.24'), ('FIELD', '2.36'), ('THEY', '3.20'), ('FELL', '3.36'), ('BACK', '3.64'), ('AND', '3.92'), ('RE', '4.04'), ('FORMED', '4.20')]
 3081-166546-0040-308-0
 [('IN', '0.32'), ('OLDEN', '0.60'), ('DAYS', '1.00'), ('THEY', '1.40'), ('WOULD', '1.56'), ('HAVE', '1.76'), ('SAID', '1.92'), ('STRUCK', '2.60'), ('BY', '3.16'), ('A', '3.36'), ('BOLT', '3.44'), ('FROM', '3.84'), ('HEAVEN', '4.04')]
 2035-147960-0016-1283-0
 [('A', '0.44'), ('SNAKE', '0.52'), ('OF', '0.84'), ('HIS', '0.96'), ('SIZE', '1.12'), ('IN', '1.60'), ('FIGHTING', '1.72'), ('TRIM', '2.12'), ('WOULD', '2.56'), ('BE', '2.76'), ('MORE', '2.88'), ('THAN', '3.08'), ('ANY', '3.28'), ('BOY', '3.56'), ('COULD', '3.88'), ('HANDLE', '4.04')]
 2428-83699-0020-1734-0
 [('WHEN', '0.28'), ('THE', '0.48'), ('TRAP', '0.60'), ('DID', '0.88'), ('APPEAR', '1.08'), ('IT', '1.80'), ('LOOKED', '1.96'), ('TO', '2.24'), ('ME', '2.36'), ('UNCOMMONLY', '2.52'), ('LIKE', '3.16'), ('AN', '3.40'), ('OPEN', '3.56'), ('SPRING', '3.92'), ('CART', '4.28')]
 8297-275154-0026-2108-0
 [('LET', '0.44'), ('ME', '0.72'), ('REST', '0.92'), ('A', '1.32'), ('LITTLE', '1.40'), ('HE', '1.80'), ('PLEADED', '2.00'), ('IF', '3.04'), ("I'M", '3.28'), ('NOT', '3.52'), ('IN', '3.76'), ('THE', '3.88'), ('WAY', '4.00')]
 652-129742-0007-1002-0
 [('SURROUND', '0.28'), ('WITH', '0.80'), ('A', '0.92'), ('GARNISH', '1.00'), ('OF', '1.44'), ('COOKED', '1.56'), ('AND', '1.88'), ('DICED', '4.16'), ('CARROTS', '4.28'), ('TURNIPS', '4.44'), ('GREEN', '4.60'), ('PEAS', '4.72')]
 ```
 For the row:
 ```
 5694-64029-0022-1998-0
 [('THE', '0.20'), ('LEADEN', '0.36'), ('HAIL', '0.72'), ('STORM', '1.00'), ('SWEPT', '1.48'),
 ('THEM', '1.88'), ('OFF', '2.00'), ('THE', '2.24'), ('FIELD', '2.36'), ('THEY', '3.20'), ('FELL', '3.36'),
 ('BACK', '3.64'), ('AND', '3.92'), ('RE', '4.04'), ('FORMED', '4.20')]
 ```
 - `5694-64029-0022-1998-0` is the cut ID.
 - `('THE', '0.20')` means the word `THE` starts at 0.20 seconds.
 - `('LEADEN', '0.36')` means the word `LEADEN` starts at 0.36 seconds.
 You can compare the above word starting time with the one
 from <https://github.com/CorentinJ/librispeech-alignments>
 ```
 5694-64029-0022 ",THE,LEADEN,HAIL,STORM,SWEPT,THEM,OFF,THE,FIELD,,THEY,FELL,BACK,AND,RE,FORMED," "0.230,0.360,0.670,1.010,1.440,1.860,1.990,2.230,2.350,2.870,3.230,3.390,3.660,3.960,4.060,4.160,4.850,4.9"
 ```
 We reformat it below for readability:
 ```
 5694-64029-0022 ",THE,LEADEN,HAIL,STORM,SWEPT,THEM,OFF,THE,FIELD,,THEY,FELL,BACK,AND,RE,FORMED,"
 "0.230,0.360,0.670,1.010,1.440,1.860,1.990,2.230,2.350,2.870,3.230,3.390,3.660,3.960,4.060,4.160,4.850,4.9"
  the  leaden hail storm swept them  off   the   field  sil   they  fell  back  and   re   formed  sil
 ```
--- a/egs/librispeech/ASR/transducer_stateless/alignment.py
+++ b/egs/librispeech/ASR/transducer_stateless/alignment.py
@ -0,0 +1,268 @@
 # Copyright    2022  Xiaomi Corp.        (authors: Fangjun Kuang)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from dataclasses import dataclass
 from typing import Iterator, List, Optional
 import sentencepiece as spm
 import torch
 from model import Transducer
 # The force alignment problem can be formulated as finding
 # a path in a rectangular lattice, where the path starts
 # from the lower left corner and ends at the upper right
 # corner. The horizontal axis of the lattice is `t` (representing
 # acoustic frame indexes) and the vertical axis is `u` (representing
 # BPE tokens of the transcript).
 #
 # The notations `t` and `u` are from the paper
 # https://arxiv.org/pdf/1211.3711.pdf
 #
 # Beam search is used to find the path with the
 # highest log probabilities.
 #
 # It assumes the maximum number of symbols that can be
 # emitted per frame is 1. You can use `--modified-transducer-prob`
 # from `./train.py` to train a model that satisfies this assumption.
 # An AlignItem is the end node of a partial path that starts at the
 # starting node of the lattice. len(ys) equals `t` (the number of frames
 # consumed so far) and pos_u is the `u` coordinate in the lattice.
@dataclass
 class AlignItem:
    """One partial alignment path (beam-search hypothesis) in the lattice.

    Instances are created by :func:`force_alignment`, which extends a path
    by one frame at a time with either a blank or the next transcript token.
    """
    # total log prob of the path that ends at this item.
    # The path is originated from the starting node.
    log_prob: float
    # It contains framewise token alignment: one entry per frame consumed
    # so far, holding either the blank ID or the token emitted at that frame.
    ys: List[int]
    # Number of transcript tokens consumed so far, i.e. the number of
    # non-blank entries in ys. It is also the index of the next token
    # of the transcript to emit.
    pos_u: int
 class AlignItemList:
    """A small list-like container of :class:`AlignItem` hypotheses."""
    def __init__(self, items: Optional[List[AlignItem]] = None):
        """
        Args:
          items:
            The initial hypotheses. The list is stored as-is (it is not
            copied). When it is None, a new empty list is used.
        """
        self.data = [] if items is None else items
    def __iter__(self) -> Iterator:
        return iter(self.data)
    def __len__(self) -> int:
        """Return how many hypotheses are currently stored."""
        return len(self.data)
    def __getitem__(self, i: int) -> AlignItem:
        """Return the hypothesis stored at index `i`."""
        return self.data[i]
    def append(self, item: AlignItem) -> None:
        """Add `item` after the last stored hypothesis."""
        self.data.append(item)
    def get_decoder_input(
        self,
        ys: List[int],
        context_size: int,
        blank_id: int,
    ) -> List[List[int]]:
        """Build the decoder context window for every stored hypothesis.
        Args:
          ys:
            The transcript of the utterance in BPE tokens.
          context_size:
            Context size of the NN decoder model.
          blank_id:
            The ID of the blank symbol.
        Returns:
          Return a list with one entry per hypothesis, in storage order.
          Entry `i` is the window of length `context_size` that starts at
          `pos_u` of the i-th hypothesis in the blank-padded transcript.
        """
        # Left-pad with blanks so that a hypothesis that has consumed fewer
        # than `context_size` tokens still gets a full-length window.
        padded = [blank_id] * context_size + ys
        return [
            padded[hyp.pos_u : hyp.pos_u + context_size] for hyp in self.data
        ]
    def topk(self, k: int) -> "AlignItemList":
        """Select the `k` hypotheses with the largest log probs.
        Args:
          k:
            Maximum number of hypotheses to keep.
        Returns:
          Return a new AlignItemList whose items are sorted by log prob
          in descending order. Caution: the items themselves are shared
          with this object (shallow copy).
        """
        ranked = list(self.data)
        ranked.sort(key=lambda hyp: hyp.log_prob, reverse=True)
        return AlignItemList(ranked[:k])
 def force_alignment(
    model: Transducer,
    encoder_out: torch.Tensor,
    ys: List[int],
    beam_size: int = 4,
 ) -> List[int]:
    """Align a transcript (in BPE tokens) to the encoder output of an
    utterance using beam search over the (t, u) lattice.
    Caution:
      We assume that the maximum number of symbols per frame is 1.
      That is, the model should be trained using a nonzero value
      for the option `--modified-transducer-prob` in train.py.
    Args:
      model:
        The transducer model. Its `decoder`, `joiner` and `device`
        attributes are used.
      encoder_out:
        A tensor of shape (N, T, C). Support only for N==1 at present.
      ys:
        A list of BPE token IDs. We require that 0 < len(ys) <= T.
      beam_size:
        Maximum number of partial paths kept after each frame.
    Returns:
      Return a list of int such that
        - len(ans) == T
        - After removing blanks from ans, we have ans == ys.
    """
    assert encoder_out.ndim == 3, encoder_out.ndim
    assert encoder_out.size(0) == 1, encoder_out.size(0)
    assert 0 < len(ys) <= encoder_out.size(1), (len(ys), encoder_out.size(1))
    decoder = model.decoder
    blank_id = decoder.blank_id
    context_size = decoder.context_size
    device = model.device
    num_frames = encoder_out.size(1)
    num_tokens = len(ys)
    # Every joiner call sees exactly one encoder frame and one decoder frame.
    unit_len = torch.tensor([1])
    beam = AlignItemList([AlignItem(log_prob=0.0, ys=[], pos_u=0)])
    for t in range(num_frames):
        # frame is of shape (1, 1, encoder_out_dim)
        frame = encoder_out[:, t : t + 1, :]
        # Hypotheses from the previous frame are expanded into a new beam.
        prev_beam, beam = beam, AlignItemList()
        contexts = prev_beam.get_decoder_input(
            ys=ys, context_size=context_size, blank_id=blank_id
        )
        # decoder_input is of shape (num_active_items, context_size)
        decoder_input = torch.tensor(contexts, device=device)
        # decoder_out is of shape (num_active_items, 1, decoder_output_dim)
        decoder_out = model.decoder(decoder_input, need_pad=False)
        num_active = decoder_out.size(0)
        # logits is of shape (num_active_items, vocab_size)
        logits = model.joiner(
            frame.expand(num_active, 1, -1),
            decoder_out,
            unit_len.expand(num_active),
            unit_len.expand(num_active),
        )
        log_probs = logits.log_softmax(dim=-1).tolist()
        frames_left = num_frames - 1 - t
        for i, hyp in enumerate(prev_beam):
            scores = log_probs[i]
            tokens_left = num_tokens - hyp.pos_u
            if tokens_left <= frames_left:
                # horizontal transition (left -> right): emit a blank.
                # Allowed only if the remaining frames can still hold
                # all the remaining tokens.
                beam.append(
                    AlignItem(
                        log_prob=hyp.log_prob + scores[blank_id],
                        ys=hyp.ys + [blank_id],
                        pos_u=hyp.pos_u,
                    )
                )
            if hyp.pos_u < num_tokens:
                # diagonal transition (lower left -> upper right): emit
                # the next token of the transcript.
                token = ys[hyp.pos_u]
                beam.append(
                    AlignItem(
                        log_prob=hyp.log_prob + scores[token],
                        ys=hyp.ys + [token],
                        pos_u=hyp.pos_u + 1,
                    )
                )
        if len(beam) > beam_size:
            beam = beam.topk(beam_size)
    ans = beam.topk(1)[0].ys
    assert len(ans) == num_frames
    assert [tok for tok in ans if tok != blank_id] == ys
    return ans
 def get_word_starting_frames(
    ali: List[int], sp: spm.SentencePieceProcessor
 ) -> List[int]:
    """Find the frame at which each word begins in a token alignment.
    The first BPE piece of a word starts with the marker U+2581 (rendered
    like an underscore), so every frame whose piece starts with that
    marker is the starting frame of a word.
    Args:
      ali:
        Framewise token alignment. It can be the return value of
        :func:`force_alignment`.
      sp:
        The sentencepiece model used to map token IDs to pieces.
    Returns:
      Return the indexes into `ali`, in increasing order, of the frames
      where a word starts.
      Caution:
        You have to take into account the model subsampling factor when
        converting the starting frame into time.
    """
    word_start_marker = b"\xe2\x96\x81".decode()  # U+2581
    return [
        frame
        for frame, token in enumerate(ali)
        if sp.id_to_piece(token).startswith(word_start_marker)
    ]
--- a/egs/librispeech/ASR/transducer_stateless/compute_ali.py
+++ b/egs/librispeech/ASR/transducer_stateless/compute_ali.py
@ -0,0 +1,326 @@
 #!/usr/bin/env python3
 # Copyright    2022  Xiaomi Corp.        (authors: Fangjun Kuang)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
 Usage:
    ./transducer_stateless/compute_ali.py \
            --exp-dir ./transducer_stateless/exp \
            --bpe-model ./data/lang_bpe_500/bpe.model \
            --epoch 20 \
            --avg 10 \
            --max-duration 300 \
            --dataset train-clean-100 \
            --out-dir data/ali
 """
 import argparse
 import logging
 from pathlib import Path
 from typing import List
 import numpy as np
 import sentencepiece as spm
 import torch
 from alignment import force_alignment
 from asr_datamodule import LibriSpeechAsrDataModule
 from lhotse import CutSet
 from lhotse.features.io import FeaturesWriter, NumpyHdf5Writer
 from train import get_params, get_transducer_model
 from icefall.checkpoint import average_checkpoints, load_checkpoint
 from icefall.utils import AttributeDict, setup_logger
 def get_parser():
    """Create the command-line parser for the alignment script.
    Returns:
      Return an `argparse.ArgumentParser` that defines `--epoch`, `--avg`,
      `--bpe-model`, `--exp-dir`, `--out-dir` (required), `--dataset`
      (required), `--beam-size` and `--context-size`.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        "--epoch",
        type=int,
        default=34,
        # The trailing space keeps the two sentences apart once the
        # adjacent string literals are concatenated.
        help="It specifies the checkpoint to use for decoding. "
        "Note: Epoch counts from 0.",
    )
    parser.add_argument(
        "--avg",
        type=int,
        default=20,
        help="Number of checkpoints to average. Automatically select "
        "consecutive checkpoints before the checkpoint specified by "
        "'--epoch'. ",
    )
    parser.add_argument(
        "--bpe-model",
        type=str,
        default="data/lang_bpe_500/bpe.model",
        help="Path to the BPE model",
    )
    parser.add_argument(
        "--exp-dir",
        type=str,
        default="transducer_stateless/exp",
        help="The experiment dir",
    )
    parser.add_argument(
        "--out-dir",
        type=str,
        required=True,
        help="""Output directory.
        It contains 2 generated files:
        - token_ali_xxx.h5
        - cuts_xxx.json.gz
        where xxx is the value of `--dataset`. For instance, if
        `--dataset` is `train-clean-100`, it will contain 2 files:
        - `token_ali_train-clean-100.h5`
        - `cuts_train-clean-100.json.gz`
        """,
    )
    parser.add_argument(
        "--dataset",
        type=str,
        required=True,
        help="""The name of the dataset to compute alignments for.
        Possible values are:
            - test-clean
            - test-other
            - train-clean-100
            - train-clean-360
            - train-other-500
            - dev-clean
            - dev-other
        """,
    )
    parser.add_argument(
        "--beam-size",
        type=int,
        default=4,
        # Without a help string, ArgumentDefaultsHelpFormatter does not
        # show the default value of this option.
        help="Size of the beam used in beam search during force alignment.",
    )
    parser.add_argument(
        "--context-size",
        type=int,
        default=2,
        help="The context size in the decoder. 1 means bigram; "
        "2 means tri-gram",
    )
    return parser
 def compute_alignments(
    model: torch.nn.Module,
    dl: torch.utils.data.DataLoader,
    ali_writer: FeaturesWriter,
    params: AttributeDict,
    sp: spm.SentencePieceProcessor,
 ) -> CutSet:
    """Compute framewise token alignments for every cut yielded by `dl`.
    Args:
      model:
        The transducer model. Its `encoder` and `device` attributes are
        used here; the model is also passed to :func:`force_alignment`.
      dl:
        The dataloader to process. Each batch must provide `inputs` of
        shape (N, T, C) and `supervisions` with the entries `cut`,
        `num_frames` and `text`.
      ali_writer:
        Writer used to store the alignment of each cut, keyed by cut ID.
      params:
        Parameters of the script. Only `params.beam_size` is read here.
      sp:
        The BPE model used to encode the transcripts into token IDs.
    Returns:
      Return a CutSet containing all processed cuts. Each cut has an
      attribute `token_alignment` set to the value returned by
      `ali_writer.store_array`.
    """
    try:
        num_batches = len(dl)
    except TypeError:
        # The dataloader does not define a length.
        num_batches = "?"
    device = model.device
    cuts = []
    for batch_idx, batch in enumerate(dl):
        feature = batch["inputs"]
        # at entry, feature is [N, T, C]
        assert feature.ndim == 3
        feature = feature.to(device)
        supervisions = batch["supervisions"]
        cut_list = supervisions["cut"]
        for cut in cut_list:
            # One transcript per cut is required for the alignment.
            assert len(cut.supervisions) == 1, f"{len(cut.supervisions)}"
        feature_lens = supervisions["num_frames"].to(device)
        encoder_out, encoder_out_lens = model.encoder(
            x=feature, x_lens=feature_lens
        )
        batch_size = encoder_out.size(0)
        texts = supervisions["text"]
        ys_list: List[List[int]] = sp.encode(texts, out_type=int)
        ali_list = []
        for i in range(batch_size):
            # Drop the padding frames; force_alignment handles N == 1 only.
            # fmt: off
            encoder_out_i = encoder_out[i:i+1, :encoder_out_lens[i]]
            # fmt: on
            ali = force_alignment(
                model=model,
                encoder_out=encoder_out_i,
                ys=ys_list[i],
                beam_size=params.beam_size,
            )
            ali_list.append(ali)
        assert len(ali_list) == len(cut_list)
        for cut, ali in zip(cut_list, ali_list):
            cut.token_alignment = ali_writer.store_array(
                key=cut.id,
                value=np.asarray(ali, dtype=np.int32),
                # frame shift is 0.01s, subsampling_factor is 4
                frame_shift=0.04,
                temporal_dim=0,
                start=0,
            )
        cuts += cut_list
        if batch_idx % 2 == 0:
            batch_str = f"{batch_idx}/{num_batches}"
            logging.info(
                f"batch {batch_str}, cuts processed until now is {len(cuts)}"
            )
    return CutSet.from_cuts(cuts)
@torch.no_grad()
 def main():
    parser = get_parser()
    LibriSpeechAsrDataModule.add_arguments(parser)
    args = parser.parse_args()
    args.enable_spec_aug = False
    args.enable_musan = False
    args.return_cuts = True
    args.concatenate_cuts = False
    params = get_params()
    params.update(vars(args))
    setup_logger(f"{params.exp_dir}/log-ali")
    sp = spm.SentencePieceProcessor()
    sp.load(params.bpe_model)
    # <blk> is defined in local/train_bpe_model.py
    params.blank_id = sp.piece_to_id("<blk>")
    params.vocab_size = sp.get_piece_size()
    logging.info(f"Computing alignments for {params.dataset} - started")
    logging.info(params)
    device = torch.device("cpu")
    if torch.cuda.is_available():
        device = torch.device("cuda", 0)
    logging.info(f"Device: {device}")
    out_dir = Path(params.out_dir)
    out_dir.mkdir(exist_ok=True)
    out_ali_filename = out_dir / f"token_ali_{params.dataset}.h5"
    out_manifest_filename = out_dir / f"cuts_{params.dataset}.json.gz"
    done_file = out_dir / f".{params.dataset}.done"
    if done_file.is_file():
        logging.info(f"{done_file} exists - skipping")
        exit()
    logging.info("About to create model")
    model = get_transducer_model(params)
    if params.avg == 1:
        load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
    else:
        start = params.epoch - params.avg + 1
        filenames = []
        for i in range(start, params.epoch + 1):
            if start >= 0:
                filenames.append(f"{params.exp_dir}/epoch-{i}.pt")
        logging.info(f"averaging {filenames}")
        model.to(device)
        model.load_state_dict(
            average_checkpoints(filenames, device=device), strict=False
        )
    model.to(device)
    model.eval()
    model.device = device
    num_param = sum([p.numel() for p in model.parameters()])
    logging.info(f"Number of model parameters: {num_param}")
    librispeech = LibriSpeechAsrDataModule(args)
    if params.dataset == "test-clean":
        test_clean_cuts = librispeech.test_clean_cuts()
        dl = librispeech.test_dataloaders(test_clean_cuts)
    elif params.dataset == "test-other":
        test_other_cuts = librispeech.test_other_cuts()
        dl = librispeech.test_dataloaders(test_other_cuts)
    elif params.dataset == "train-clean-100":
        train_clean_100_cuts = librispeech.train_clean_100_cuts()
        dl = librispeech.train_dataloaders(train_clean_100_cuts)
    elif params.dataset == "train-clean-360":
        train_clean_360_cuts = librispeech.train_clean_360_cuts()
        dl = librispeech.train_dataloaders(train_clean_360_cuts)
    elif params.dataset == "train-other-500":
        train_other_500_cuts = librispeech.train_other_500_cuts()
        dl = librispeech.train_dataloaders(train_other_500_cuts)
    elif params.dataset == "dev-clean":
        dev_clean_cuts = librispeech.dev_clean_cuts()
        dl = librispeech.valid_dataloaders(dev_clean_cuts)
    else:
        assert params.dataset == "dev-other", f"{params.dataset}"
        dev_other_cuts = librispeech.dev_other_cuts()
        dl = librispeech.valid_dataloaders(dev_other_cuts)
    logging.info(f"Processing {params.dataset}")
    with NumpyHdf5Writer(out_ali_filename) as ali_writer:
        cut_set = compute_alignments(
            model=model,
            dl=dl,
            ali_writer=ali_writer,
            params=params,
            sp=sp,
        )
    cut_set.to_file(out_manifest_filename)
    logging.info(
        f"For dataset {params.dataset}, its framewise token alignments are "
        f"saved to {out_ali_filename} and the cut manifest "
        f"file is {out_manifest_filename}. Number of cuts: {len(cut_set)}"
    )
    done_file.touch()
 # Run the alignment pipeline only when this file is executed as a script.
 if __name__ == "__main__":
    main()
--- a/egs/librispeech/ASR/transducer_stateless/test_compute_ali.py
+++ b/egs/librispeech/ASR/transducer_stateless/test_compute_ali.py
@ -0,0 +1,167 @@
 #!/usr/bin/env python3
 # Copyright    2022  Xiaomi Corp.        (authors: Fangjun Kuang)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
 This script shows how to get word starting time
 from framewise token alignment.
 Usage:
    ./transducer_stateless/compute_ali.py \
            --exp-dir ./transducer_stateless/exp \
            --bpe-model ./data/lang_bpe_500/bpe.model \
            --epoch 20 \
            --avg 10 \
            --max-duration 300 \
            --dataset train-clean-100 \
            --out-dir data/ali
 And then you can run:
    ./transducer_stateless/test_compute_ali.py \
            --bpe-model ./data/lang_bpe_500/bpe.model \
            --ali-dir data/ali \
            --dataset train-clean-100
 """
 import argparse
 import logging
 from pathlib import Path
 import sentencepiece as spm
 import torch
 from alignment import get_word_starting_frames
 from lhotse import CutSet, load_manifest
 from lhotse.dataset import K2SpeechRecognitionDataset, SingleCutSampler
 from lhotse.dataset.collation import collate_custom_field
 def get_parser():
    """Create the argument parser of this demo script.

    Returns:
      An argparse.ArgumentParser with the options --bpe-model, --ali-dir
      and the mandatory --dataset. Default values are appended to the
      help text because ArgumentDefaultsHelpFormatter is used.
    """
    # (flag, keyword arguments) pairs, registered in the listed order.
    option_specs = [
        (
            "--bpe-model",
            dict(
                type=str,
                default="data/lang_bpe_500/bpe.model",
                help="Path to the BPE model",
            ),
        ),
        (
            "--ali-dir",
            dict(
                type=Path,
                default="./data/ali",
                help="It specifies the directory where alignments "
                "can be found.",
            ),
        ),
        (
            "--dataset",
            dict(
                type=str,
                required=True,
                help="""The name of the dataset:
        Possible values are:
            - test-clean.
            - test-other
            - train-clean-100
            - train-clean-360
            - train-other-500
            - dev-clean
            - dev-other
        """,
            ),
        ),
    ]

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    for flag, kwargs in option_specs:
        parser.add_argument(flag, **kwargs)
    return parser
 def main():
    """Print the starting time of every word in the first batch.

    The cut manifest cuts_<dataset>.json.gz is loaded from --ali-dir and a
    single batch is drawn from it. For each cut of that batch, the
    framewise token alignment stored in its "token_alignment" custom field
    is converted into (word, starting time in seconds) pairs, which are
    printed to stdout.
    """
    args = get_parser().parse_args()

    sp = spm.SentencePieceProcessor()
    sp.load(args.bpe_model)

    manifest_path = args.ali_dir / f"cuts_{args.dataset}.json.gz"
    logging.info(f"Loading {manifest_path}")
    all_cuts = load_manifest(manifest_path)

    sampler = SingleCutSampler(all_cuts, max_duration=30, shuffle=False)
    dataset = K2SpeechRecognitionDataset(return_cuts=True)
    dl = torch.utils.data.DataLoader(
        dataset,
        sampler=sampler,
        batch_size=None,
        num_workers=1,
        persistent_workers=False,
    )

    # Duration covered by one alignment entry:
    # frame shift (10 ms) times the subsampling factor (4), in seconds.
    frame_shift_ms = 10
    subsampling_factor = 4
    seconds_per_frame = frame_shift_ms * subsampling_factor / 1000.0

    # Maps cut.id to a list of (word, time_in_second) pairs.
    word_starting_time_dict = {}
    for batch in dl:
        supervisions = batch["supervisions"]
        batch_cuts = supervisions["cut"]
        alignments, alignment_lens = collate_custom_field(
            CutSet.from_cuts(batch_cuts), "token_alignment"
        )

        for idx in range(len(batch_cuts)):
            cut = batch_cuts[idx]
            num_ali_frames = alignment_lens[idx]
            # The stored alignment length has to equal the number of
            # feature frames reduced twice by n -> (n - 1) // 2.
            expected_len = ((cut.features.num_frames - 1) // 2 - 1) // 2
            assert expected_len == num_ali_frames

            start_frames = get_word_starting_frames(
                alignments[idx, :num_ali_frames].tolist(), sp=sp
            )
            start_times = [
                f"{frame * seconds_per_frame:.2f}" for frame in start_frames
            ]

            words = supervisions["text"][idx].split()
            assert len(start_frames) == len(words)
            word_starting_time_dict[cut.id] = list(zip(words, start_times))

        # This is a demo script: report the first batch and stop.
        # The word starting times are in "word_starting_time_dict".
        for cut_id, word_time in word_starting_time_dict.items():
            print(f"{cut_id}\n{word_time}\n")
        break
 if __name__ == "__main__":
    # Log records look like: <time> <level> [<file>:<line>] <message>
    logging.basicConfig(
        format=(
            "%(asctime)s %(levelname)s "
            "[%(filename)s:%(lineno)d] %(message)s"
        ),
        level=logging.INFO,
    )
    main()
--- a/egs/librispeech/ASR/transducer_stateless_multi_datasets/asr_datamodule.py
+++ b/egs/librispeech/ASR/transducer_stateless_multi_datasets/asr_datamodule.py
@ -1,5 +1,6 @@
 # Copyright      2021  Piotr Żelasko
-#                2022  Xiaomi Corp.        (authors: Fangjun Kuang)
+#                2022  Xiaomi Corp.        (authors: Fangjun Kuang
 # 						     Mingshuang Luo)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
@ -16,6 +17,7 @@
 # limitations under the License.
 import argparse
 import inspect
 import logging
 from pathlib import Path
 from typing import Optional
@ -180,10 +182,20 @@ class AsrDataModule:
            logging.info(
                f"Time warp factor: {self.args.spec_aug_time_warp_factor}"
            )
            # Set the value of num_frame_masks according to Lhotse's version.
            # In different Lhotse's versions, the default of num_frame_masks is
            # different.
            num_frame_masks = 10
            num_frame_masks_parameter = inspect.signature(
                SpecAugment.__init__
            ).parameters["num_frame_masks"]
            if num_frame_masks_parameter.default == 1:
                num_frame_masks = 2
            logging.info(f"Num frame mask: {num_frame_masks}")
            input_transforms.append(
                SpecAugment(
                    time_warp_factor=self.args.spec_aug_time_warp_factor,
-                    num_frame_masks=2,
+                    num_frame_masks=num_frame_masks,
                    features_mask_size=27,
                    num_feature_masks=2,
                    frames_mask_size=100,
--- a/egs/tedlium3/ASR/README.md
+++ b/egs/tedlium3/ASR/README.md
@ -0,0 +1,18 @@
 # Introduction
 This recipe includes several ASR models trained on the TED-LIUM 3 corpus.
 # Transducers
 There are various folders containing the name `transducer` in this folder.
 The following table lists the differences among them.
 |                        | Encoder   | Decoder            |
 |------------------------|-----------|--------------------|
 | `transducer_stateless` | Conformer | Embedding + Conv1d |
 The decoder in `transducer_stateless` is modified from the paper
 [Rnn-Transducer with Stateless Prediction Network](https://ieeexplore.ieee.org/document/9054419/).
 We place an additional Conv1d layer right after the input embedding layer.
--- a/egs/tedlium3/ASR/RESULTS.md
+++ b/egs/tedlium3/ASR/RESULTS.md
@ -0,0 +1,71 @@
 ## Results
 ### TedLium3 BPE training results (Transducer)
 #### Conformer encoder + embedding decoder
 Using the code from this PR: https://github.com/k2-fsa/icefall/pull/233
 and the SpecAugment code from this PR: https://github.com/lhotse-speech/lhotse/pull/604
 Conformer encoder + non-recurrent decoder. The decoder
 contains only an embedding layer and a Conv1d (with kernel size 2).
 The WERs are
 |                                    |     dev    |    test    | comment                                  |
 |------------------------------------|------------|------------|------------------------------------------|
 |          greedy search             | 7.19       | 6.57       | --epoch 29, --avg 16, --max-duration 100 |
 |      beam search (beam size 4)     | 7.12       | 6.37       | --epoch 29, --avg 16, --max-duration 100 |
 | modified beam search (beam size 4) | 7.00       | 6.19       | --epoch 29, --avg 16, --max-duration 100 |
 The training command for reproducing is given below:
 ```
 export CUDA_VISIBLE_DEVICES="0,1,2,3"
 ./transducer_stateless/train.py \
  --world-size 4 \
  --num-epochs 30 \
  --start-epoch 0 \
  --exp-dir transducer_stateless/exp \
  --max-duration 200
 ```
 The tensorboard training log can be found at
 https://tensorboard.dev/experiment/zrfXeJO3Q5GmJpP2KRd2VA/#scalars
 The decoding command is:
 ```
 epoch=29
 avg=16
 ## greedy search
 ./transducer_stateless/decode.py \
  --epoch $epoch \
  --avg $avg \
  --exp-dir transducer_stateless/exp \
  --bpe-model ./data/lang_bpe_500/bpe.model \
  --max-duration 100
 ## beam search
 ./transducer_stateless/decode.py \
  --epoch $epoch \
  --avg $avg \
  --exp-dir transducer_stateless/exp \
  --bpe-model ./data/lang_bpe_500/bpe.model \
  --max-duration 100 \
  --decoding-method beam_search \
  --beam-size 4
 ## modified beam search
 ./transducer_stateless/decode.py \
  --epoch $epoch \
  --avg $avg \
  --exp-dir transducer_stateless/exp \
  --bpe-model ./data/lang_bpe_500/bpe.model \
  --max-duration 100 \
  --decoding-method modified_beam_search \
  --beam-size 4
 ```
 A pre-trained model and decoding logs can be found at <https://huggingface.co/luomingshuang/icefall_asr_tedlium3_transducer_stateless>
--- a/egs/tedlium3/ASR/local/init.py
+++ b/egs/tedlium3/ASR/local/init.py
--- a/egs/tedlium3/ASR/local/compile_hlg.py
+++ b/egs/tedlium3/ASR/local/compile_hlg.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/local/compile_hlg.py
--- a/egs/tedlium3/ASR/local/compute_fbank_musan.py
+++ b/egs/tedlium3/ASR/local/compute_fbank_musan.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/local/compute_fbank_musan.py
--- a/egs/tedlium3/ASR/local/compute_fbank_tedlium.py
+++ b/egs/tedlium3/ASR/local/compute_fbank_tedlium.py
@ -0,0 +1,101 @@
 #!/usr/bin/env python3
 # Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang)
 #              2022  Xiaomi Corp.        (authors: Mingshuang Luo)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
 This file computes fbank features of the TedLium3 dataset.
 It looks for manifests in the directory data/manifests.
 The generated fbank features are saved in data/fbank.
 """
 import logging
 import os
 from pathlib import Path
 import torch
 from lhotse import ChunkedLilcomHdf5Writer, CutSet, Fbank, FbankConfig
 from lhotse.recipes.utils import read_manifests_if_cached
 from icefall.utils import get_executor
 # Torch's multithreaded behavior needs to be disabled or
 # it wastes a lot of CPU and slows things down.
 # Do this outside of main() in case it needs to take effect
 # even when we are not invoking the main (e.g. when spawning subprocesses).
 torch.set_num_threads(1)
 torch.set_num_interop_threads(1)
 def compute_fbank_tedlium():
    """Compute 80-bin fbank features for the TedLium3 train/dev/test sets.

    Manifests are looked up in data/manifests. For every partition the
    features are stored under data/fbank/feats_<partition>, and the cuts,
    trimmed to their supervisions, are saved to
    data/fbank/cuts_<partition>.json.gz. A partition whose cuts file
    already exists is skipped. Partitions whose name contains "train" are
    extended with 0.9x and 1.1x speed-perturbed copies before feature
    extraction.
    """
    src_dir = Path("data/manifests")
    output_dir = Path("data/fbank")
    # os.cpu_count() returns None when the number of CPUs cannot be
    # determined; fall back to one job instead of letting min() raise
    # a TypeError.
    num_jobs = min(15, os.cpu_count() or 1)
    num_mel_bins = 80
    dataset_parts = (
        "train",
        "dev",
        "test",
    )
    manifests = read_manifests_if_cached(
        dataset_parts=dataset_parts, output_dir=src_dir
    )
    assert manifests is not None
    extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins))
    with get_executor() as ex:  # Initialize the executor only once.
        for partition, m in manifests.items():
            if (output_dir / f"cuts_{partition}.json.gz").is_file():
                logging.info(f"{partition} already exists - skipping.")
                continue
            logging.info(f"Processing {partition}")
            cut_set = CutSet.from_manifests(
                recordings=m["recordings"],
                supervisions=m["supervisions"],
            )
            if "train" in partition:
                cut_set = (
                    cut_set
                    + cut_set.perturb_speed(0.9)
                    + cut_set.perturb_speed(1.1)
                )
            # Never request more jobs than there are cuts to process.
            cur_num_jobs = num_jobs if ex is None else 80
            cur_num_jobs = min(cur_num_jobs, len(cut_set))
            cut_set = cut_set.compute_and_store_features(
                extractor=extractor,
                storage_path=f"{output_dir}/feats_{partition}",
                # when an executor is specified, make more partitions
                num_jobs=cur_num_jobs,
                executor=ex,
                storage_type=ChunkedLilcomHdf5Writer,
            )
            # Split long cuts into many short and un-overlapping cuts
            cut_set = cut_set.trim_to_supervisions(keep_overlapping=False)
            cut_set.to_json(output_dir / f"cuts_{partition}.json.gz")
 if __name__ == "__main__":
    # Log records look like: <time> <level> [<file>:<line>] <message>
    logging.basicConfig(
        format=(
            "%(asctime)s %(levelname)s "
            "[%(filename)s:%(lineno)d] %(message)s"
        ),
        level=logging.INFO,
    )
    compute_fbank_tedlium()
--- a/egs/tedlium3/ASR/local/convert_transcript_words_to_bpe_ids.py
+++ b/egs/tedlium3/ASR/local/convert_transcript_words_to_bpe_ids.py
@ -0,0 +1,92 @@
 #!/usr/bin/env python3
 # Copyright    2022 Xiaomi Corporation  (Author: Mingshuang Luo)
 """
 Convert a transcript based on words to a list of BPE ids.
 For example, if we use 2 as the encoding id of <unk>:
 texts = ['this is a <unk> day']
 spm_ids = [[38, 33, 6, 2, 316]]
 texts = ['<unk> this is a sunny day']
 spm_ids = [[2, 38, 33, 6, 118, 11, 11, 21, 316]]
 texts = ['<unk>']
 spm_ids = [[2]]
 """
 import argparse
 import logging
 from typing import List
 import sentencepiece as spm
 def get_args():
    """Parse the command-line arguments of this script.

    Returns:
      An argparse.Namespace with the attributes:
        - texts: the list of transcripts given after --texts (required,
          one or more strings).
        - bpe_model: the path to the BPE model.
    """
    parser = argparse.ArgumentParser()
    # argparse calls `type` on every single command-line string, so
    # typing.List[str] cannot serve as a converter: calling it raises a
    # TypeError and each value is rejected. Collect one or more plain
    # strings into a list with nargs="+" instead.
    parser.add_argument(
        "--texts",
        type=str,
        nargs="+",
        required=True,
        help="The input transcripts list.",
    )
    parser.add_argument(
        "--bpe-model",
        type=str,
        default="data/lang_bpe_500/bpe.model",
        help="Path to the BPE model",
    )
    return parser.parse_args()
 def convert_texts_into_ids(
    texts: List[str],
    unk_id: int,
    sp: spm.SentencePieceProcessor,
 ) -> List[List[int]]:
    """Encode word-level transcripts into BPE ids, mapping '<unk>' to unk_id.

    Every literal '<unk>' inside a transcript is replaced by `unk_id`
    directly; the text around it is encoded with `sp`.

    Args:
      texts:
        A string list of transcripts, such as ['Today is Monday', 'It's sunny'].
      unk_id:
        A number id for the token '<unk>'.
      sp:
        The BPE model. Its `encode(..., out_type=int)` method is called
        with a single string or with a list of strings.
    Returns:
      Return a list containing one list of bpe ids per input transcript.
    """
    unk_token = "<unk>"
    encoded = []
    for transcript in texts:
        if unk_token not in transcript:
            encoded.append(sp.encode(transcript, out_type=int))
            continue

        # Encode the pieces between the '<unk>' occurrences in one call
        # and join them again with unk_id as the separator.
        pieces = sp.encode(transcript.split(unk_token), out_type=int)
        ids = []
        for position, piece_ids in enumerate(pieces):
            if position > 0:
                ids.append(unk_id)
            ids.extend(piece_ids)
        encoded.append(ids)
    return encoded
 def main():
    """Encode the transcripts given on the command line and log the ids."""
    args = get_args()
    texts = args.texts

    sp = spm.SentencePieceProcessor()
    sp.load(args.bpe_model)

    # '<unk>' is looked up in the loaded model rather than hard-coded.
    y = convert_texts_into_ids(
        texts=texts,
        unk_id=sp.piece_to_id("<unk>"),
        sp=sp,
    )

    logging.info(f"The input texts: {texts}")
    logging.info(f"The encoding ids: {y}")
 if __name__ == "__main__":
    # main() reports its result only through logging.info(). Unless
    # logging is configured, the root logger stays at WARNING and the
    # script prints nothing.
    formatter = (
        "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    )
    logging.basicConfig(format=formatter, level=logging.INFO)
    main()
--- a/egs/tedlium3/ASR/local/convert_transcript_words_to_tokens.py
+++ b/egs/tedlium3/ASR/local/convert_transcript_words_to_tokens.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/local/convert_transcript_words_to_tokens.py
--- a/egs/tedlium3/ASR/local/display_manifest_statistics.py
+++ b/egs/tedlium3/ASR/local/display_manifest_statistics.py
@ -0,0 +1,93 @@
 #!/usr/bin/env python3
 # Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang
 # 						   Mingshuang Luo)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
 This file displays duration statistics of utterances in a manifest.
 You can use the displayed value to choose minimum/maximum duration
 to remove short and long utterances during the training.
 See the function `remove_short_and_long_utt()`
 in ../../../librispeech/ASR/transducer/train.py
 for usage.
 """
 from lhotse import load_manifest
 def main():
    """Print duration statistics of one fbank cut manifest.

    Three manifests are listed, but only the last entry is loaded.
    Reorder the entries to inspect a different subset.
    """
    candidates = (
        "./data/fbank/cuts_train.json.gz",
        "./data/fbank/cuts_dev.json.gz",
        "./data/fbank/cuts_test.json.gz",
    )
    path = candidates[-1]

    load_manifest(path).describe()
 # Entry point when this file is run as a script.
 if __name__ == "__main__":
    main()
 """
 ## train
 Cuts count: 804789
 Total duration (hours): 1370.6
 Speech duration (hours): 1370.6 (100.0%)
 ***
 Duration statistics (seconds):
 mean    6.1
 std     3.1
 min     0.5
 25%     3.7
 50%     6.0
 75%     8.3
 99.5%   14.9
 99.9%   16.6
 max     33.3
 ## dev
 Cuts count: 507
 Total duration (hours): 1.6
 Speech duration (hours): 1.6 (100.0%)
 ***
 Duration statistics (seconds):
 mean    11.3
 std     5.7
 min     0.5
 25%     7.5
 50%     10.6
 75%     14.4
 99.5%   29.8
 99.9%   37.7
 max     39.9
 ## test
 Cuts count: 1155
 Total duration (hours): 2.6
 Speech duration (hours): 2.6 (100.0%)
 ***
 Duration statistics (seconds):
 mean    8.2
 std     4.3
 min     0.3
 25%     4.6
 50%     8.2
 75%     10.9
 99.5%   22.1
 99.9%   26.7
 max     32.5
 """
--- a/egs/tedlium3/ASR/local/generate_unique_lexicon.py
+++ b/egs/tedlium3/ASR/local/generate_unique_lexicon.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/local/generate_unique_lexicon.py
--- a/egs/tedlium3/ASR/local/prepare_lang.py
+++ b/egs/tedlium3/ASR/local/prepare_lang.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/local/prepare_lang.py
--- a/egs/tedlium3/ASR/local/prepare_lang_bpe.py
+++ b/egs/tedlium3/ASR/local/prepare_lang_bpe.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/local/prepare_lang_bpe.py
--- a/egs/tedlium3/ASR/local/prepare_lexicon.py
+++ b/egs/tedlium3/ASR/local/prepare_lexicon.py
@ -0,0 +1,100 @@
 #!/usr/bin/env python3
 # Copyright    2022  Xiaomi Corp.        (authors: Mingshuang Luo)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
 This script takes as input supervisions json dir "data/manifests"
 consisting of supervisions_train.json and does the following:
 1. Generate lexicon_words.txt.
 """
 import argparse
 import json
 import logging
 from pathlib import Path
 def get_args():
    """Parse the command-line arguments of this script.

    Returns:
      An argparse.Namespace with the string attributes `manifests_dir`
      (input directory) and `lang_dir` (output directory). Both are None
      when the corresponding option is not given.
    """
    # Maps each option flag to its help text; registered in this order.
    directory_options = {
        "--manifests-dir": """Input directory.
        """,
        "--lang-dir": """Output directory.
        """,
    }

    parser = argparse.ArgumentParser()
    for flag, description in directory_options.items():
        parser.add_argument(flag, type=str, help=description)
    return parser.parse_args()
 def prepare_lexicon(manifests_dir: str, lang_dir: str):
    """Generate lexicon_words.txt from the training supervisions.

    Every distinct word found in the "text" fields of
    <manifests_dir>/supervisions_train.json, except "<unk>", is written
    to <lang_dir>/lexicon_words.txt in sorted order, one entry per line
    in the form "word  word".

    Args:
      manifests_dir:
        The manifests directory, e.g., data/manifests.
      lang_dir:
        The language directory, e.g., data/lang_phone.
    Returns:
      None. The result is the file lexicon_words.txt inside lang_dir.
    """
    supervisions_train = Path(manifests_dir) / "supervisions_train.json"
    lexicon = Path(lang_dir) / "lexicon_words.txt"

    logging.info(f"Loading {supervisions_train}!")
    # Read and write with an explicit UTF-8 encoding so that the result
    # does not depend on the locale's preferred encoding.
    with open(supervisions_train, "r", encoding="utf-8") as load_f:
        load_dicts = json.load(load_f)

    # str.split() without arguments never yields empty strings, and the
    # set removes duplicates, so no further filtering is needed.
    words = {
        word
        for load_dict in load_dicts
        for word in load_dict["text"].split()
        if word != "<unk>"
    }

    with open(lexicon, "w", encoding="utf-8") as f:
        for word in sorted(words):
            f.write(word + "  " + word)
            f.write("\n")
 def main():
    """Read the command-line options and generate lexicon_words.txt."""
    args = get_args()
    manifests_dir, lang_dir = Path(args.manifests_dir), Path(args.lang_dir)

    logging.info("Generating lexicon_words.txt")
    prepare_lexicon(manifests_dir, lang_dir)
 if __name__ == "__main__":
    # Log records look like: <time> <level> [<file>:<line>] <message>
    logging.basicConfig(
        format=(
            "%(asctime)s %(levelname)s "
            "[%(filename)s:%(lineno)d] %(message)s"
        ),
        level=logging.INFO,
    )
    main()
--- a/egs/tedlium3/ASR/local/prepare_transcripts.py
+++ b/egs/tedlium3/ASR/local/prepare_transcripts.py
@ -0,0 +1,95 @@
 #!/usr/bin/env python3
 # Copyright    2021  Xiaomi Corp.        (authors: Mingshuang Luo)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
 This script takes as input supervisions json dir "data/manifests"
 consisting of supervisions_train.json and does the following:
 1. Generate train.text.
 """
 import argparse
 import json
 import logging
 from pathlib import Path
 def get_args():
    """Parse the command-line arguments of this script.

    Returns:
      An argparse.Namespace with the string attributes `manifests_dir`
      (input directory) and `lang_dir` (output directory). Both are None
      when the corresponding option is not given.
    """
    # Maps each option flag to its help text; registered in this order.
    directory_options = {
        "--manifests-dir": """Input directory.
        """,
        "--lang-dir": """Output directory.
        """,
    }

    parser = argparse.ArgumentParser()
    for flag, description in directory_options.items():
        parser.add_argument(flag, type=str, help=description)
    return parser.parse_args()
 def prepare_transcripts(manifests_dir: str, lang_dir: str):
    """Generate train.text from the training supervisions.

    The "text" field of every entry in
    <manifests_dir>/supervisions_train.json is written to
    <lang_dir>/train.text, one transcript per line and in the original
    order. No utterance id is written in front of the transcript.

    Args:
      manifests_dir:
        The manifests directory, e.g., data/manifests.
      lang_dir:
        The language directory, e.g., data/lang_phone.
    Returns:
      None. The result is the file train.text inside lang_dir.
    """
    supervisions_train = Path(manifests_dir) / "supervisions_train.json"
    train_text = Path(lang_dir) / "train.text"

    logging.info(f"Loading {supervisions_train}!")
    # Read and write with an explicit UTF-8 encoding so that the result
    # does not depend on the locale's preferred encoding.
    with open(supervisions_train, "r", encoding="utf-8") as load_f:
        texts = [load_dict["text"] for load_dict in json.load(load_f)]

    with open(train_text, "w", encoding="utf-8") as f:
        for text in texts:
            f.write(text)
            f.write("\n")
 def main():
    """Read the command-line options and generate train.text."""
    args = get_args()
    manifests_dir, lang_dir = Path(args.manifests_dir), Path(args.lang_dir)

    logging.info("Generating train.text")
    prepare_transcripts(manifests_dir, lang_dir)
 if __name__ == "__main__":
    # Log records look like: <time> <level> [<file>:<line>] <message>
    logging.basicConfig(
        format=(
            "%(asctime)s %(levelname)s "
            "[%(filename)s:%(lineno)d] %(message)s"
        ),
        level=logging.INFO,
    )
    main()
--- a/egs/tedlium3/ASR/local/test_prepare_lang.py
+++ b/egs/tedlium3/ASR/local/test_prepare_lang.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/local/test_prepare_lang.py
--- a/egs/tedlium3/ASR/local/train_bpe_model.py
+++ b/egs/tedlium3/ASR/local/train_bpe_model.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/local/train_bpe_model.py
--- a/egs/tedlium3/ASR/prepare.sh
+++ b/egs/tedlium3/ASR/prepare.sh
@ -0,0 +1,169 @@
 #!/usr/bin/env bash
 set -eou pipefail
 nj=15
 stage=0
 stop_stage=100
 # We assume dl_dir (download dir) contains the following
 # directories and files. If not, they will be downloaded
 # by this script automatically.
 #
 #  - $dl_dir/tedlium3
 #      You can find data, doc, legacy, LM, etc, inside it.
 #      You can download them from https://www.openslr.org/51
 #
 #  - $dl_dir/musan
 #      This directory contains the following directories downloaded from
 #       http://www.openslr.org/17/
 #
 #     - music
 #     - noise
 #     - speech
 dl_dir=$PWD/download
 . shared/parse_options.sh || exit 1
 # vocab size for sentence piece models.
 # It will generate data/lang_bpe_xxx,
 # data/lang_bpe_yyy if the array contains xxx, yyy
 vocab_sizes=(
  5000
  2000
  1000
  500
 )
 # All files generated by this script are saved in "data".
 # You can safely remove "data" and rerun this script to regenerate it.
 mkdir -p data
 # Print a timestamped message prefixed with the location of the caller:
 #   <date> <time> (<file>:<line>:<function>) <message>
 log() {
  # This function is from espnet
  # Base name (everything up to the last "/" removed) of the caller's file.
  local fname=${BASH_SOURCE[1]##*/}
  # BASH_LINENO[0] is the line from which log was called and FUNCNAME[1]
  # is the name of the calling function.
  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
 }
 log "dl_dir: $dl_dir"
 if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
  log "Stage 0: Download data"
  # If you have pre-downloaded it to /path/to/tedlium3,
  # you can create a symlink
  #
  # ln -sfv /path/to/tedlium3 $dl_dir/tedlium3
  #
  if [ ! -d $dl_dir/tedlium3 ]; then
    lhotse download tedlium $dl_dir
    mv $dl_dir/TEDLIUM_release-3 $dl_dir/tedlium3
  fi
  # If you have pre-downloaded it to /path/to/musan,
  # you can create a symlink
  #
  #ln -sfv /path/to/musan $dl_dir/musan
  if [ ! -d $dl_dir/musan ]; then
    lhotse download musan $dl_dir
  fi
 fi
 if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
  log "Stage 1: Prepare tedlium3 manifests"
  if [ ! -f data/manifests/.tedlium3.done ]; then
    # We assume that you have downloaded the tedlium3 corpus
    # to $dl_dir/tedlium3
    mkdir -p data/manifests
    lhotse prepare tedlium $dl_dir/tedlium3 data/manifests
    touch data/manifests/.tedlium3.done
  fi
 fi
 if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
  log "Stage 2: Prepare musan manifests"
  # We assume that you have downloaded the musan corpus
  # to $dl_dir/musan
  if [ ! -e data/manifests/.musan.done ]; then
    mkdir -p data/manifests
    lhotse prepare musan $dl_dir/musan data/manifests
    touch data/manifests/.musan.done
  fi
 fi
 if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
  log "Stage 3: Compute fbank for tedlium3"
  if [ ! -e data/fbank/.tedlium3.done ]; then
    mkdir -p data/fbank
    python3 ./local/compute_fbank_tedlium.py
    touch data/fbank/.tedlium3.done
  fi
 fi
 if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
  log "Stage 4: Compute fbank for musan"
  if [ ! -e data/fbank/.musan.done ]; then
    mkdir -p data/fbank
    python3 ./local/compute_fbank_musan.py
    touch data/fbank/.musan.done
  fi
 fi
 if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
  log "Stage 5: Prepare phone based lang"
  lang_dir=data/lang_phone
  mkdir -p $lang_dir
  if [ ! -f $lang_dir/train.text ]; then
    ./local/prepare_transcripts.py \
      --lang-dir $lang_dir \
      --manifests-dir data/manifests
  fi
  if [ ! -f $lang_dir/lexicon_words.txt ]; then
    ./local/prepare_lexicon.py \
      --lang-dir $lang_dir \
      --manifests-dir data/manifests
  fi
  (echo '!SIL SIL'; echo '<UNK> <UNK>'; ) |
    cat - $lang_dir/lexicon_words.txt |
    sort | uniq > $lang_dir/lexicon.txt
  if [ ! -f $lang_dir/L_disambig.pt ]; then
    ./local/prepare_lang.py --lang-dir $lang_dir
  fi
 fi
 # Stage 6: for every vocabulary size, create data/lang_bpe_<size> holding
 # the BPE training text, the trained BPE model and the BPE based lang files.
 if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
  log "Stage 6: Prepare BPE based lang"
  for vocab_size in "${vocab_sizes[@]}"; do
    lang_dir=data/lang_bpe_${vocab_size}
    mkdir -p $lang_dir
    # We reuse words.txt from phone based lexicon
    # so that the two can share G.pt later.
    cp data/lang_phone/words.txt $lang_dir
    if [ ! -f $lang_dir/transcript_words.txt ]; then
      log "Generate data for BPE training"
      # train.text is written by local/prepare_transcripts.py and holds one
      # transcript per line without a leading utterance id, so the whole
      # line is kept. Dropping the first field with `cut -d " " -f 2-`
      # would remove the first word of every multi-word transcript.
      cp data/lang_phone/train.text $lang_dir/transcript_words.txt
      # remove the <unk> for transcript_words.txt
      sed -i 's/ <unk>//g' $lang_dir/transcript_words.txt
      sed -i 's/<unk> //g' $lang_dir/transcript_words.txt
      sed -i 's/<unk>//g' $lang_dir/transcript_words.txt
    fi
    ./local/train_bpe_model.py \
      --lang-dir $lang_dir \
      --vocab-size $vocab_size \
      --transcript $lang_dir/transcript_words.txt
    if [ ! -f $lang_dir/L_disambig.pt ]; then
      ./local/prepare_lang_bpe.py --lang-dir $lang_dir
    fi
  done
 fi
--- a/egs/tedlium3/ASR/shared
+++ b/egs/tedlium3/ASR/shared
@ -0,0 +1 @@
 ../../../icefall/shared/
--- a/egs/tedlium3/ASR/transducer_stateless/README.md
+++ b/egs/tedlium3/ASR/transducer_stateless/README.md
@ -0,0 +1,20 @@
 ## Introduction
 The decoder, i.e., the prediction network, is from
 https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=9054419
 (Rnn-Transducer with Stateless Prediction Network)
 You can use the following command to start the training:
 ```bash
 cd egs/tedlium3/ASR
 export CUDA_VISIBLE_DEVICES="0,1,2,3"
 ./transducer_stateless/train.py \
  --world-size 4 \
  --num-epochs 30 \
  --start-epoch 0 \
  --exp-dir transducer_stateless/exp \
  --max-duration 200
 ```
--- a/egs/tedlium3/ASR/transducer_stateless/init.py
+++ b/egs/tedlium3/ASR/transducer_stateless/init.py
--- a/egs/tedlium3/ASR/transducer_stateless/asr_datamodule.py
+++ b/egs/tedlium3/ASR/transducer_stateless/asr_datamodule.py
@ -0,0 +1,363 @@
 # Copyright      2021  Piotr Żelasko
 # Copyright      2021  Xiaomi Corporation (Author: Mingshuang Luo)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import argparse
 import inspect
 import logging
 from functools import lru_cache
 from pathlib import Path
 from lhotse import CutSet, Fbank, FbankConfig, load_manifest
 from lhotse.dataset import (
    BucketingSampler,
    CutConcatenate,
    CutMix,
    K2SpeechRecognitionDataset,
    PrecomputedFeatures,
    SingleCutSampler,
    SpecAugment,
 )
 from lhotse.dataset.input_strategies import OnTheFlyFeatures
 from torch.utils.data import DataLoader
 from icefall.utils import str2bool
 class TedLiumAsrDataModule:
    """
    DataModule for k2 ASR experiments.
    It assumes there is always one train and valid dataloader,
    but there can be multiple test dataloaders (e.g. TEDLium3 dev
    and test).

    It contains all the common data pipeline modules used in ASR
    experiments, e.g.:
    - dynamic batch size,
    - bucketing samplers,
    - cut concatenation,
    - augmentation,
    - on-the-fly feature extraction

    This class should be derived for specific corpora used in ASR tasks.
    """

    def __init__(self, args: argparse.Namespace):
        """Keep a reference to the parsed command-line options.

        Args:
          args:
            The parsed options. Every method of this class reads its
            configuration from this namespace (see `add_arguments`).
        """
        self.args = args

    @classmethod
    def add_arguments(cls, parser: argparse.ArgumentParser):
        """Register the data related command-line options on `parser`.

        Args:
          parser:
            The parser to extend. The options are added to a new argument
            group titled "ASR data related options".
        """
        group = parser.add_argument_group(
            title="ASR data related options",
            description="These options are used for the preparation of "
            "PyTorch DataLoaders from Lhotse CutSet's -- they control the "
            "effective batch sizes, sampling strategies, applied data "
            "augmentations, etc.",
        )
        group.add_argument(
            "--manifest-dir",
            type=Path,
            default=Path("data/fbank"),
            help="Path to directory with train/valid/test cuts.",
        )
        # The default is a float, so parse user supplied values as float as
        # well. With `type=int` the attribute was a float when the option was
        # omitted and an int when it was given, and fractional durations were
        # rejected.
        group.add_argument(
            "--max-duration",
            type=float,
            default=200.0,
            help="Maximum pooled recordings duration (seconds) in a "
            "single batch. You can reduce it if it causes CUDA OOM.",
        )
        group.add_argument(
            "--bucketing-sampler",
            type=str2bool,
            default=True,
            help="When enabled, the batches will come from buckets of "
            "similar duration (saves padding frames).",
        )
        group.add_argument(
            "--num-buckets",
            type=int,
            default=30,
            help="The number of buckets for the BucketingSampler "
            "(you might want to increase it for larger datasets).",
        )
        group.add_argument(
            "--concatenate-cuts",
            type=str2bool,
            default=False,
            help="When enabled, utterances (cuts) will be concatenated "
            "to minimize the amount of padding.",
        )
        group.add_argument(
            "--duration-factor",
            type=float,
            default=1.0,
            help="Determines the maximum duration of a concatenated cut "
            "relative to the duration of the longest cut in a batch.",
        )
        group.add_argument(
            "--gap",
            type=float,
            default=1.0,
            help="The amount of padding (in seconds) inserted between "
            "concatenated cuts. This padding is filled with noise when "
            "noise augmentation is used.",
        )
        group.add_argument(
            "--on-the-fly-feats",
            type=str2bool,
            default=False,
            help="When enabled, use on-the-fly cut mixing and feature "
            "extraction. Will drop existing precomputed feature manifests "
            "if available.",
        )
        group.add_argument(
            "--shuffle",
            type=str2bool,
            default=True,
            help="When enabled (=default), the examples will be "
            "shuffled for each epoch.",
        )
        group.add_argument(
            "--return-cuts",
            type=str2bool,
            default=True,
            help="When enabled, each batch will have the "
            "field: batch['supervisions']['cut'] with the cuts that "
            "were used to construct it.",
        )
        group.add_argument(
            "--num-workers",
            type=int,
            default=2,
            help="The number of training dataloader workers that "
            "collect the batches.",
        )
        group.add_argument(
            "--enable-spec-aug",
            type=str2bool,
            default=True,
            help="When enabled, use SpecAugment for training dataset.",
        )
        group.add_argument(
            "--spec-aug-time-warp-factor",
            type=int,
            default=80,
            help="Used only when --enable-spec-aug is True. "
            "It specifies the factor for time warping in SpecAugment. "
            "Larger values mean more warping. "
            "A value less than 1 means to disable time warp.",
        )
        group.add_argument(
            "--enable-musan",
            type=str2bool,
            default=True,
            help="When enabled, select noise from MUSAN and mix it "
            "with training dataset.",
        )

    def train_dataloaders(self, cuts_train: CutSet) -> DataLoader:
        """Build the training dataloader.

        Args:
          cuts_train:
            The cuts to sample training batches from.
        Returns:
          Return a DataLoader whose sampler already yields complete batches
          (hence `batch_size=None`).
        """
        transforms = []
        if self.args.enable_musan:
            logging.info("Enable MUSAN")
            # Only announce/load the noise manifest when it is really used.
            logging.info("About to get Musan cuts")
            cuts_musan = load_manifest(
                self.args.manifest_dir / "cuts_musan.json.gz"
            )
            transforms.append(
                CutMix(
                    cuts=cuts_musan, prob=0.5, snr=(10, 20), preserve_id=True
                )
            )
        else:
            logging.info("Disable MUSAN")

        if self.args.concatenate_cuts:
            logging.info(
                f"Using cut concatenation with duration factor "
                f"{self.args.duration_factor} and gap {self.args.gap}."
            )
            # Cut concatenation should be the first transform in the list,
            # so that if we e.g. mix noise in, it will fill the gaps between
            # different utterances.
            transforms = [
                CutConcatenate(
                    duration_factor=self.args.duration_factor, gap=self.args.gap
                )
            ] + transforms

        input_transforms = []
        if self.args.enable_spec_aug:
            logging.info("Enable SpecAugment")
            logging.info(
                f"Time warp factor: {self.args.spec_aug_time_warp_factor}"
            )
            # Set the value of num_frame_masks according to Lhotse's version.
            # In different Lhotse's versions, the default of num_frame_masks is
            # different.
            num_frame_masks = 10
            num_frame_masks_parameter = inspect.signature(
                SpecAugment.__init__
            ).parameters["num_frame_masks"]
            if num_frame_masks_parameter.default == 1:
                num_frame_masks = 2
            logging.info(f"Num frame mask: {num_frame_masks}")
            input_transforms.append(
                SpecAugment(
                    time_warp_factor=self.args.spec_aug_time_warp_factor,
                    num_frame_masks=num_frame_masks,
                    features_mask_size=27,
                    num_feature_masks=2,
                    frames_mask_size=100,
                    max_frames_mask_fraction=0.15,
                    p=0.9,
                )
            )
        else:
            logging.info("Disable SpecAugment")

        logging.info("About to create train dataset")
        # Build the dataset exactly once (it used to be built and then
        # thrown away again when --on-the-fly-feats was enabled).
        if self.args.on_the_fly_feats:
            # NOTE: the PerturbSpeed transform should be added only if we
            # remove it from data prep stage.
            # Add on-the-fly speed perturbation; since originally it would
            # have increased epoch size by 3, we will apply prob 2/3 and use
            # 3x more epochs.
            # Speed perturbation probably should come first before
            # concatenation, but in principle the transforms order doesn't have
            # to be strict (e.g. could be randomized)
            # transforms = [PerturbSpeed(factors=[0.9, 1.1], p=2/3)] + transforms   # noqa
            # Drop feats to be on the safe side.
            train = K2SpeechRecognitionDataset(
                cut_transforms=transforms,
                input_strategy=OnTheFlyFeatures(
                    Fbank(FbankConfig(num_mel_bins=80))
                ),
                input_transforms=input_transforms,
                return_cuts=self.args.return_cuts,
            )
        else:
            train = K2SpeechRecognitionDataset(
                cut_transforms=transforms,
                input_transforms=input_transforms,
                return_cuts=self.args.return_cuts,
            )

        if self.args.bucketing_sampler:
            logging.info("Using BucketingSampler.")
            train_sampler = BucketingSampler(
                cuts_train,
                max_duration=self.args.max_duration,
                shuffle=self.args.shuffle,
                num_buckets=self.args.num_buckets,
                bucket_method="equal_duration",
                drop_last=True,
            )
        else:
            logging.info("Using SingleCutSampler.")
            train_sampler = SingleCutSampler(
                cuts_train,
                max_duration=self.args.max_duration,
                shuffle=self.args.shuffle,
            )

        logging.info("About to create train dataloader")
        train_dl = DataLoader(
            train,
            sampler=train_sampler,
            batch_size=None,
            num_workers=self.args.num_workers,
            persistent_workers=False,
        )
        return train_dl

    def valid_dataloaders(self, cuts_valid: CutSet) -> DataLoader:
        """Build the validation dataloader.

        No noise mixing or SpecAugment is applied here; only the optional
        cut concatenation is shared with training.

        Args:
          cuts_valid:
            The cuts to sample validation batches from.
        Returns:
          Return a DataLoader over `cuts_valid` that is not shuffled.
        """
        transforms = []
        if self.args.concatenate_cuts:
            transforms = [
                CutConcatenate(
                    duration_factor=self.args.duration_factor, gap=self.args.gap
                )
            ] + transforms

        logging.info("About to create dev dataset")
        if self.args.on_the_fly_feats:
            validate = K2SpeechRecognitionDataset(
                cut_transforms=transforms,
                input_strategy=OnTheFlyFeatures(
                    Fbank(FbankConfig(num_mel_bins=80))
                ),
                return_cuts=self.args.return_cuts,
            )
        else:
            validate = K2SpeechRecognitionDataset(
                cut_transforms=transforms,
                return_cuts=self.args.return_cuts,
            )
        valid_sampler = BucketingSampler(
            cuts_valid,
            max_duration=self.args.max_duration,
            shuffle=False,
        )
        logging.info("About to create dev dataloader")
        # The number of workers is fixed here; --num-workers is documented
        # as applying to the training dataloader.
        valid_dl = DataLoader(
            validate,
            sampler=valid_sampler,
            batch_size=None,
            num_workers=2,
            persistent_workers=False,
        )
        return valid_dl

    def test_dataloaders(self, cuts: CutSet) -> DataLoader:
        """Build a dataloader for one test set.

        Args:
          cuts:
            The cuts of the test set.
        Returns:
          Return a DataLoader over `cuts` that is not shuffled and applies
          no cut or input transforms.
        """
        logging.debug("About to create test dataset")
        test = K2SpeechRecognitionDataset(
            input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80)))
            if self.args.on_the_fly_feats
            else PrecomputedFeatures(),
            return_cuts=self.args.return_cuts,
        )
        sampler = BucketingSampler(
            cuts, max_duration=self.args.max_duration, shuffle=False
        )
        logging.debug("About to create test dataloader")
        test_dl = DataLoader(
            test,
            batch_size=None,
            sampler=sampler,
            num_workers=self.args.num_workers,
        )
        return test_dl

    # The three loaders below are memoized with `lru_cache`, so each manifest
    # is read at most once per instance. Note that the cache is keyed on
    # `self` and therefore keeps the instance alive.
    @lru_cache()
    def train_cuts(self) -> CutSet:
        """Load `cuts_train.json.gz` from the manifest directory."""
        logging.info("About to get train cuts")
        return load_manifest(self.args.manifest_dir / "cuts_train.json.gz")

    @lru_cache()
    def dev_cuts(self) -> CutSet:
        """Load `cuts_dev.json.gz` from the manifest directory."""
        logging.info("About to get dev cuts")
        return load_manifest(self.args.manifest_dir / "cuts_dev.json.gz")

    @lru_cache()
    def test_cuts(self) -> CutSet:
        """Load `cuts_test.json.gz` from the manifest directory."""
        logging.info("About to get test cuts")
        return load_manifest(self.args.manifest_dir / "cuts_test.json.gz")
--- a/egs/tedlium3/ASR/transducer_stateless/beam_search.py
+++ b/egs/tedlium3/ASR/transducer_stateless/beam_search.py
@ -0,0 +1,545 @@
 # Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang
 #                                                  Mingshuang Luo)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from dataclasses import dataclass
 from typing import Dict, List, Optional
 import torch
 from model import Transducer
 def greedy_search(
    model: Transducer, encoder_out: torch.Tensor, max_sym_per_frame: int
 ) -> List[int]:
    """Decode one utterance by always taking the highest scoring symbol.

    Args:
      model:
        An instance of `Transducer`.
      encoder_out:
        A tensor of shape (N, T, C) from the encoder. Support only N==1 for now.
      max_sym_per_frame:
        Maximum number of symbols per frame. If it is set to 0, the WER
        would be 100%.
    Returns:
      Return the decoded token IDs, without the leading blank context.
    """
    assert encoder_out.ndim == 3
    # Only a single utterance can be decoded at a time for now
    assert encoder_out.size(0) == 1, encoder_out.size(0)

    blank_id = model.decoder.blank_id
    unk_id = model.decoder.unk_id
    context_size = model.decoder.context_size
    device = model.device

    # The hypothesis starts with `context_size` blanks so that the decoder
    # always has a full context window to look at.
    hyp = [blank_id] * context_size
    decoder_input = torch.tensor(
        hyp, device=device, dtype=torch.int64
    ).reshape(1, context_size)
    decoder_out = model.decoder(decoder_input, need_pad=False)

    num_frames = encoder_out.size(1)
    # Upper bound on the number of symbols emitted for the whole utterance
    max_sym_per_utt = 1000

    encoder_out_len = torch.tensor([1])
    decoder_out_len = torch.tensor([1])

    frame = 0
    emitted_in_frame = 0
    emitted_in_utt = 0
    while frame < num_frames and emitted_in_utt < max_sym_per_utt:
        if emitted_in_frame < max_sym_per_frame:
            logits = model.joiner(
                encoder_out[:, frame : frame + 1, :],
                decoder_out,
                encoder_out_len,
                decoder_out_len,
            )
            # logits is (1, 1, 1, vocab_size)
            token = logits.argmax().item()
            if token not in (blank_id, unk_id):
                # A real symbol: extend the hypothesis and stay on this frame
                hyp.append(token)
                decoder_input = torch.tensor(
                    [hyp[-context_size:]], device=device
                ).reshape(1, context_size)
                decoder_out = model.decoder(decoder_input, need_pad=False)
                emitted_in_utt += 1
                emitted_in_frame += 1
                continue

        # Either the per-frame budget is used up or blank/unk was predicted:
        # move on to the next frame.
        emitted_in_frame = 0
        frame += 1

    return hyp[context_size:]  # remove blanks
@dataclass
class Hypothesis:
    """A (partial) decoding result together with its log-probability."""

    # The predicted tokens so far.
    # Newly predicted tokens are appended to `ys`.
    ys: List[int]

    # The log prob of ys.
    # It contains only one entry.
    log_prob: torch.Tensor

    @property
    def key(self) -> str:
        """Return a string representation of self.ys"""
        return "_".join(map(str, self.ys))


class HypothesisList(object):
    """A collection of hypotheses indexed by `Hypothesis.key`.

    Iterating over an instance yields the `Hypothesis` objects, while
    `in` tests for a key (a string).
    """

    def __init__(self, data: Optional[Dict[str, Hypothesis]] = None) -> None:
        """
        Args:
          data:
            A dict of Hypotheses. Its key is its `value.key`.
        """
        if data is None:
            self._data = {}
        else:
            self._data = data

    @property
    def data(self) -> Dict[str, Hypothesis]:
        """Return the underlying dict (not a copy)."""
        return self._data

    def add(self, hyp: Hypothesis) -> None:
        """Add a Hypothesis to `self`.

        If `hyp` already exists in `self`, its probability is updated using
        `log-sum-exp` with the existed one.

        Args:
          hyp:
            The hypothesis to be added.
        """
        key = hyp.key
        if key in self:
            old_hyp = self._data[key]  # shallow copy
            # The stored tensor is updated in-place
            torch.logaddexp(
                old_hyp.log_prob, hyp.log_prob, out=old_hyp.log_prob
            )
        else:
            self._data[key] = hyp

    def get_most_probable(self, length_norm: bool = False) -> Hypothesis:
        """Get the most probable hypothesis, i.e., the one with
        the largest `log_prob`.

        Args:
          length_norm:
            If True, the `log_prob` of a hypothesis is normalized by the
            number of tokens in it.
        Returns:
          Return the hypothesis that has the largest `log_prob`.
        """
        if length_norm:
            return max(
                self._data.values(), key=lambda hyp: hyp.log_prob / len(hyp.ys)
            )
        else:
            return max(self._data.values(), key=lambda hyp: hyp.log_prob)

    def remove(self, hyp: Hypothesis) -> None:
        """Remove a given hypothesis.

        Caution:
          `self` is modified **in-place**.

        Args:
          hyp:
            The hypothesis to be removed from `self`.
            Note: It must be contained in `self`. Otherwise,
            an exception is raised.
        """
        key = hyp.key
        assert key in self, f"{key} does not exist"
        del self._data[key]

    def filter(self, threshold: torch.Tensor) -> "HypothesisList":
        """Keep only the hypotheses whose log_prob is strictly greater
        than `threshold`.

        Caution:
          `self` is not modified. Instead, a new HypothesisList is returned.

        Returns:
          Return a new HypothesisList containing all hypotheses from `self`
          with `log_prob` being greater than the given `threshold`.
        """
        ans = HypothesisList()
        for _, hyp in self._data.items():
            if hyp.log_prob > threshold:
                ans.add(hyp)  # shallow copy
        return ans

    def topk(self, k: int) -> "HypothesisList":
        """Return the top-k hypothesis."""
        hyps = list(self._data.items())
        hyps = sorted(hyps, key=lambda h: h[1].log_prob, reverse=True)[:k]
        ans = HypothesisList(dict(hyps))
        return ans

    def __contains__(self, key: str):
        return key in self._data

    def __iter__(self):
        return iter(self._data.values())

    def __len__(self) -> int:
        return len(self._data)

    def __str__(self) -> str:
        # `__iter__` yields Hypothesis objects, which `str.join` rejects, so
        # join the keys of the underlying dict instead.
        return ", ".join(self._data)
 def run_decoder(
    ys: List[int],
    model: Transducer,
    decoder_cache: Dict[str, torch.Tensor],
 ) -> torch.Tensor:
    """Compute (or look up) the decoder output for a hypothesis.

    Only the last `context_size` tokens of `ys` are fed to the decoder, so
    they alone form the cache key.

    Args:
      ys:
        The current hypothesis.
      model:
        The transducer model.
      decoder_cache:
        Cache to save computations. It is updated in-place on a miss.
    Returns:
      Return the output of `model.decoder` for the context of `ys`.
      NOTE(review): the previous docstring stated a 1-D shape of
      (decoder_out_dim,) -- confirm against the decoder.
    """
    context_size = model.decoder.context_size
    context = ys[-context_size:]
    cache_key = "_".join(str(token) for token in context)

    if cache_key not in decoder_cache:
        decoder_input = torch.tensor([context], device=model.device).reshape(
            1, context_size
        )
        decoder_cache[cache_key] = model.decoder(decoder_input, need_pad=False)

    return decoder_cache[cache_key]
 def run_joiner(
    key: str,
    model: Transducer,
    encoder_out: torch.Tensor,
    decoder_out: torch.Tensor,
    encoder_out_len: torch.Tensor,
    decoder_out_len: torch.Tensor,
    joint_cache: Dict[str, torch.Tensor],
 ):
    """Compute (or look up) the log-probabilities produced by the joiner.

    Args:
      key:
        A key into the `joint_cache`.
      model:
        The transducer model.
      encoder_out:
        A tensor of shape (1, 1, encoder_out_dim).
      decoder_out:
        A tensor of shape (1, 1, decoder_out_dim).
      encoder_out_len:
        A tensor with value [1].
      decoder_out_len:
        A tensor with value [1].
      joint_cache:
        A dict to save computations. It is updated in-place on a miss.
    Returns:
      Return a tensor from the output of log-softmax.
      Its shape is (vocab_size,).
    """
    if key not in joint_cache:
        logits = model.joiner(
            encoder_out,
            decoder_out,
            encoder_out_len,
            decoder_out_len,
        )
        # TODO(fangjun): Scale the blank posterior
        # The log-softmax output is (1, 1, 1, vocab_size); squeezing drops
        # the singleton dimensions and leaves (vocab_size,).
        joint_cache[key] = logits.log_softmax(dim=-1).squeeze()

    return joint_cache[key]
 def modified_beam_search(
    model: Transducer,
    encoder_out: torch.Tensor,
    beam: int = 4,
 ) -> List[int]:
    """It limits the maximum number of symbols per frame to 1.

    For every frame, all active hypotheses are run through the decoder and
    the joiner in one batch, and the `beam` best (hypothesis, token) pairs
    form the hypotheses of the next frame.

    Args:
      model:
        An instance of `Transducer`.
      encoder_out:
        A tensor of shape (N, T, C) from the encoder. Support only N==1 for now.
      beam:
        Beam size.
    Returns:
      Return the decoded result.
    """
    assert encoder_out.ndim == 3
    # support only batch_size == 1 for now
    assert encoder_out.size(0) == 1, encoder_out.size(0)
    blank_id = model.decoder.blank_id
    unk_id = model.decoder.unk_id
    context_size = model.decoder.context_size
    device = model.device

    # NOTE: no decoder call is needed before the loop; the decoder output is
    # recomputed for all active hypotheses at the start of every frame.
    T = encoder_out.size(1)
    B = HypothesisList()
    B.add(
        Hypothesis(
            ys=[blank_id] * context_size,
            log_prob=torch.zeros(1, dtype=torch.float32, device=device),
        )
    )
    encoder_out_len = torch.tensor([1])
    decoder_out_len = torch.tensor([1])
    for t in range(T):
        # fmt: off
        current_encoder_out = encoder_out[:, t:t+1, :]
        # current_encoder_out is of shape (1, 1, encoder_out_dim)
        # fmt: on
        A = list(B)
        B = HypothesisList()
        ys_log_probs = torch.cat([hyp.log_prob.reshape(1, 1) for hyp in A])
        # ys_log_probs is of shape (num_hyps, 1)
        decoder_input = torch.tensor(
            [hyp.ys[-context_size:] for hyp in A],
            device=device,
        )
        # decoder_input is of shape (num_hyps, context_size)
        decoder_out = model.decoder(decoder_input, need_pad=False)
        # decoder_output is of shape (num_hyps, 1, decoder_output_dim)
        current_encoder_out = current_encoder_out.expand(
            decoder_out.size(0), 1, -1
        )
        logits = model.joiner(
            current_encoder_out,
            decoder_out,
            encoder_out_len.expand(decoder_out.size(0)),
            decoder_out_len.expand(decoder_out.size(0)),
        )
        # logits is of shape (num_hyps, vocab_size)
        log_probs = logits.log_softmax(dim=-1)
        # Add the score of each hypothesis to the scores of its extensions
        log_probs.add_(ys_log_probs)
        # Flatten so that a single topk ranks all (hypothesis, token) pairs
        log_probs = log_probs.reshape(-1)
        topk_log_probs, topk_indexes = log_probs.topk(beam)
        # topk_hyp_indexes are indexes into `A`
        topk_hyp_indexes = topk_indexes // logits.size(-1)
        topk_token_indexes = topk_indexes % logits.size(-1)
        topk_hyp_indexes = topk_hyp_indexes.tolist()
        topk_token_indexes = topk_token_indexes.tolist()
        for i in range(len(topk_hyp_indexes)):
            hyp = A[topk_hyp_indexes[i]]
            new_ys = hyp.ys[:]
            new_token = topk_token_indexes[i]
            # blank and unk do not extend the token sequence; their
            # probability mass still goes to the unchanged hypothesis
            if new_token != blank_id and new_token != unk_id:
                new_ys.append(new_token)
            new_log_prob = topk_log_probs[i]
            new_hyp = Hypothesis(ys=new_ys, log_prob=new_log_prob)
            B.add(new_hyp)
    best_hyp = B.get_most_probable(length_norm=True)
    ys = best_hyp.ys[context_size:]  # [context_size:] to remove blanks
    return ys
 def beam_search(
    model: Transducer,
    encoder_out: torch.Tensor,
    beam: int = 4,
 ) -> List[int]:
    """
    It implements Algorithm 1 in https://arxiv.org/pdf/1211.3711.pdf
    espnet/nets/beam_search_transducer.py#L247 is used as a reference.

    Args:
      model:
        An instance of `Transducer`.
      encoder_out:
        A tensor of shape (N, T, C) from the encoder. Support only N==1 for now.
      beam:
        Beam size.
    Returns:
      Return the decoded result.
    """
    assert encoder_out.ndim == 3
    # support only batch_size == 1 for now
    assert encoder_out.size(0) == 1, encoder_out.size(0)
    blank_id = model.decoder.blank_id
    unk_id = model.decoder.unk_id
    context_size = model.decoder.context_size
    device = model.device

    # NOTE: no decoder call is needed before the loop; `run_decoder`
    # computes (and caches) the decoder output for every expanded hypothesis.
    T = encoder_out.size(1)
    t = 0
    B = HypothesisList()
    B.add(
        Hypothesis(
            ys=[blank_id] * context_size,
            log_prob=torch.zeros(1, dtype=torch.float32, device=device),
        )
    )
    # NOTE(review): `sym_per_utt` is never incremented below, so the
    # `max_sym_per_utt` limit in the loop condition is never reached.
    max_sym_per_utt = 20000
    sym_per_utt = 0
    encoder_out_len = torch.tensor([1])
    decoder_out_len = torch.tensor([1])
    # The decoder output depends only on the token context, so this cache is
    # shared by all frames.
    decoder_cache: Dict[str, torch.Tensor] = {}
    while t < T and sym_per_utt < max_sym_per_utt:
        # fmt: off
        current_encoder_out = encoder_out[:, t:t+1, :]
        # fmt: on
        A = B
        B = HypothesisList()
        # The joiner output also depends on the frame, so this cache is
        # created anew for every frame.
        joint_cache: Dict[str, torch.Tensor] = {}
        while True:
            # Expand the currently most probable hypothesis of A
            y_star = A.get_most_probable()
            A.remove(y_star)
            decoder_out = run_decoder(
                ys=y_star.ys, model=model, decoder_cache=decoder_cache
            )
            key = "_".join(map(str, y_star.ys[-context_size:]))
            key += f"-t-{t}"
            log_prob = run_joiner(
                key=key,
                model=model,
                encoder_out=current_encoder_out,
                decoder_out=decoder_out,
                encoder_out_len=encoder_out_len,
                decoder_out_len=decoder_out_len,
                joint_cache=joint_cache,
            )
            # First, process the blank symbol
            skip_log_prob = log_prob[blank_id]
            new_y_star_log_prob = y_star.log_prob + skip_log_prob
            # ys[:] returns a copy of ys
            B.add(Hypothesis(ys=y_star.ys[:], log_prob=new_y_star_log_prob))
            # Second, process other non-blank labels
            values, indices = log_prob.topk(beam + 1)
            for idx in range(values.size(0)):
                i = indices[idx].item()
                if i == blank_id or i == unk_id:
                    continue
                new_ys = y_star.ys + [i]
                new_log_prob = y_star.log_prob + values[idx]
                A.add(Hypothesis(ys=new_ys, log_prob=new_log_prob))
            # Check whether B contains more than "beam" elements more probable
            # than the most probable in A
            A_most_probable = A.get_most_probable()
            kept_B = B.filter(A_most_probable.log_prob)
            if len(kept_B) >= beam:
                B = kept_B.topk(beam)
                break
        t += 1
    best_hyp = B.get_most_probable(length_norm=True)
    ys = best_hyp.ys[context_size:]  # [context_size:] to remove blanks
    return ys
--- a/egs/tedlium3/ASR/transducer_stateless/conformer.py
+++ b/egs/tedlium3/ASR/transducer_stateless/conformer.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/transducer_stateless/conformer.py
--- a/egs/tedlium3/ASR/transducer_stateless/decode.py
+++ b/egs/tedlium3/ASR/transducer_stateless/decode.py
@ -0,0 +1,496 @@
 #!/usr/bin/env python3
 #
 # Copyright 2021 Xiaomi Corporation (Author: Fangjun Kuang
 #                                            Mingshuang Luo)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
 Usage:
 (1) greedy search
 ./transducer_stateless/decode.py \
        --epoch 29 \
        --avg 16 \
        --exp-dir ./transducer_stateless/exp \
        --max-duration 100 \
        --decoding-method greedy_search
 (2) beam search
 ./transducer_stateless/decode.py \
        --epoch 29 \
        --avg 16 \
        --exp-dir ./transducer_stateless/exp \
        --max-duration 100 \
        --decoding-method beam_search \
        --beam-size 4
 (3) modified beam search
 ./transducer_stateless/decode.py \
        --epoch 29 \
        --avg 16 \
        --exp-dir ./transducer_stateless/exp \
        --max-duration 100 \
        --decoding-method modified_beam_search \
        --beam-size 4
 """
 import argparse
 import logging
 from collections import defaultdict
 from pathlib import Path
 from typing import Dict, List, Tuple
 import sentencepiece as spm
 import torch
 import torch.nn as nn
 from asr_datamodule import TedLiumAsrDataModule
 from beam_search import beam_search, greedy_search, modified_beam_search
 from conformer import Conformer
 from decoder import Decoder
 from joiner import Joiner
 from model import Transducer
 from icefall.checkpoint import average_checkpoints, load_checkpoint
 from icefall.env import get_env_info
 from icefall.utils import (
    AttributeDict,
    setup_logger,
    store_transcripts,
    write_error_stats,
 )
 def get_parser():
    """Build the command-line parser for the decoding script.

    Returns:
      An :class:`argparse.ArgumentParser` with options selecting the
      checkpoint(s) to load (``--epoch``, ``--avg``, ``--exp-dir``), the
      BPE model (``--bpe-model``) and the search configuration
      (``--decoding-method``, ``--beam-size``, ``--context-size``,
      ``--max-sym-per-frame``). Defaults are appended to each help string
      by :class:`argparse.ArgumentDefaultsHelpFormatter`.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        "--epoch",
        type=int,
        default=29,
        # Note the trailing space: adjacent literals are concatenated as-is.
        help="It specifies the checkpoint to use for decoding. "
        "Note: Epoch counts from 0.",
    )
    parser.add_argument(
        "--avg",
        type=int,
        default=13,
        help="Number of checkpoints to average. Automatically select "
        "consecutive checkpoints before the checkpoint specified by "
        "'--epoch'. ",
    )
    parser.add_argument(
        "--exp-dir",
        type=str,
        default="transducer_stateless/exp",
        help="The experiment dir",
    )
    parser.add_argument(
        "--bpe-model",
        type=str,
        default="data/lang_bpe_500/bpe.model",
        help="Path to the BPE model",
    )
    parser.add_argument(
        "--decoding-method",
        type=str,
        default="greedy_search",
        help="""Possible values are:
          - greedy_search
          - beam_search
          - modified_beam_search
        """,
    )
    parser.add_argument(
        "--beam-size",
        type=int,
        default=4,
        help="""Used only when --decoding-method is
        beam_search or modified_beam_search""",
    )
    parser.add_argument(
        "--context-size",
        type=int,
        default=2,
        help="The context size in the decoder. 1 means bigram; "
        "2 means tri-gram",
    )
    parser.add_argument(
        "--max-sym-per-frame",
        type=int,
        default=3,
        help="""Maximum number of symbols per frame.
        Used only when --decoding-method is greedy_search""",
    )
    return parser
 def get_params() -> AttributeDict:
    """Return the fixed hyper-parameters used to rebuild the model.

    The conformer settings are hard-coded here; ``env_info`` holds whatever
    :func:`get_env_info` reports.
    """
    conformer_config = dict(
        feature_dim=80,
        encoder_out_dim=512,
        subsampling_factor=4,
        attention_dim=512,
        nhead=8,
        dim_feedforward=2048,
        num_encoder_layers=12,
        vgg_frontend=False,
    )
    return AttributeDict({**conformer_config, "env_info": get_env_info()})
 def get_encoder_model(params: AttributeDict):
    """Instantiate the Conformer encoder from the settings in ``params``."""
    # TODO: We can add an option to switch between Conformer and Transformer
    encoder_kwargs = dict(
        num_features=params.feature_dim,
        output_dim=params.encoder_out_dim,
        subsampling_factor=params.subsampling_factor,
        d_model=params.attention_dim,
        nhead=params.nhead,
        dim_feedforward=params.dim_feedforward,
        num_encoder_layers=params.num_encoder_layers,
        vgg_frontend=params.vgg_frontend,
    )
    return Conformer(**encoder_kwargs)
 def get_decoder_model(params: AttributeDict):
    """Instantiate the stateless decoder (prediction network).

    The embedding dimension is set to ``params.encoder_out_dim``.
    """
    return Decoder(
        vocab_size=params.vocab_size,
        embedding_dim=params.encoder_out_dim,
        blank_id=params.blank_id,
        unk_id=params.unk_id,
        context_size=params.context_size,
    )
 def get_joiner_model(params: AttributeDict):
    """Instantiate the joiner mapping ``encoder_out_dim`` to ``vocab_size``."""
    return Joiner(
        input_dim=params.encoder_out_dim,
        output_dim=params.vocab_size,
    )
 def get_transducer_model(params: AttributeDict):
    """Assemble encoder, decoder and joiner into a :class:`Transducer`."""
    # Keyword arguments are evaluated left to right, so the sub-modules are
    # still created in the order encoder, decoder, joiner.
    return Transducer(
        encoder=get_encoder_model(params),
        decoder=get_decoder_model(params),
        joiner=get_joiner_model(params),
    )
 def decode_one_batch(
    params: AttributeDict,
    model: nn.Module,
    sp: spm.SentencePieceProcessor,
    batch: dict,
 ) -> Dict[str, List[List[str]]]:
    """Decode every utterance of one batch, one utterance at a time.

    Args:
      params:
        It's the return value of :func:`get_params`, updated with the
        command-line arguments (``decoding_method``, ``beam_size``,
        ``max_sym_per_frame`` are read here).
      model:
        The neural model. Its ``device`` attribute and ``encoder`` are used.
      sp:
        The BPE model used to turn token IDs into text.
      batch:
        It is the return value from iterating
        `lhotse.dataset.K2SpeechRecognitionDataset`. See its documentation
        for the format of the `batch`.
    Returns:
      A dict with a single entry. The key names the decoding setting:
      "greedy_search" for greedy search, otherwise "beam_<beam_size>"
      (e.g. "beam_7"). The value is a list with one item per utterance;
      item ``i`` is the list of decoded words of the i-th utterance.
    """
    device = model.device
    feature = batch["inputs"]
    assert feature.ndim == 3
    # feature is (N, T, C)
    feature = feature.to(device)
    feature_lens = batch["supervisions"]["num_frames"].to(device)
    encoder_out, encoder_out_lens = model.encoder(
        x=feature, x_lens=feature_lens
    )
    hyps = []
    for utt_idx in range(encoder_out.size(0)):
        # Keep the batch dimension and drop the padded frames.
        num_valid = encoder_out_lens[utt_idx]
        utt_encoder_out = encoder_out[utt_idx : utt_idx + 1, :num_valid]
        if params.decoding_method == "greedy_search":
            token_ids = greedy_search(
                model=model,
                encoder_out=utt_encoder_out,
                max_sym_per_frame=params.max_sym_per_frame,
            )
        elif params.decoding_method == "beam_search":
            token_ids = beam_search(
                model=model,
                encoder_out=utt_encoder_out,
                beam=params.beam_size,
            )
        elif params.decoding_method == "modified_beam_search":
            token_ids = modified_beam_search(
                model=model,
                encoder_out=utt_encoder_out,
                beam=params.beam_size,
            )
        else:
            raise ValueError(
                f"Unsupported decoding method: {params.decoding_method}"
            )
        hyps.append(sp.decode(token_ids).split())
    if params.decoding_method == "greedy_search":
        result_key = "greedy_search"
    else:
        result_key = f"beam_{params.beam_size}"
    return {result_key: hyps}
 def decode_dataset(
    dl: torch.utils.data.DataLoader,
    params: AttributeDict,
    model: nn.Module,
    sp: spm.SentencePieceProcessor,
 ) -> Dict[str, List[Tuple[List[str], List[str]]]]:
    """Decode all batches produced by a dataloader.

    Args:
      dl:
        PyTorch's dataloader containing the dataset to decode.
      params:
        It is returned by :func:`get_params`.
      model:
        The neural model.
      sp:
        The BPE model.
    Returns:
      A dict keyed by decoding setting ("greedy_search", or e.g. "beam_7"
      when a beam size of 7 is used). Each value is a list of
      ``(reference_words, hypothesis_words)`` tuples, one per utterance.
    """
    # Some dataloaders have no length; fall back to a placeholder for logs.
    try:
        num_batches = len(dl)
    except TypeError:
        num_batches = "?"
    # Greedy search logs every 100 batches; the beam searches every 2.
    log_interval = 100 if params.decoding_method == "greedy_search" else 2
    results = defaultdict(list)
    num_cuts = 0
    for batch_idx, batch in enumerate(dl):
        texts = batch["supervisions"]["text"]
        hyps_dict = decode_one_batch(
            params=params, model=model, sp=sp, batch=batch
        )
        for name, hyps in hyps_dict.items():
            assert len(hyps) == len(texts)
            pairs = [
                (ref_text.split(), hyp_words)
                for hyp_words, ref_text in zip(hyps, texts)
            ]
            results[name].extend(pairs)
        num_cuts += len(texts)
        if batch_idx % log_interval == 0:
            logging.info(
                f"batch {batch_idx}/{num_batches}, "
                f"cuts processed until now is {num_cuts}"
            )
    return results
 def save_results(
    params: AttributeDict,
    test_set_name: str,
    results_dict: Dict[str, List[Tuple[List[str], List[str]]]],
 ):
    """Write transcripts, error statistics and a WER summary for one test set.

    Args:
      params:
        Must provide ``res_dir`` (output directory) and ``suffix`` (appended
        to every output file name).
      test_set_name:
        Name of the test set; used in file names and log messages.
      results_dict:
        Maps a decoding setting (e.g. "greedy_search" or "beam_4") to a list
        of ``(reference_words, hypothesis_words)`` tuples.

    If ``results_dict`` is empty, a warning is logged and nothing is written.
    """
    if not results_dict:
        # Without this guard the summary file name below would reference a
        # loop variable that was never bound.
        logging.warning(
            f"No decoding results for {test_set_name}; nothing to save"
        )
        return
    test_set_wers = dict()
    for key, results in results_dict.items():
        recog_path = (
            params.res_dir / f"recogs-{test_set_name}-{key}-{params.suffix}.txt"
        )
        store_transcripts(filename=recog_path, texts=results)
        logging.info(f"The transcripts are stored in {recog_path}")
        # The following prints out WERs, per-word error statistics and aligned
        # ref/hyp pairs.
        errs_filename = (
            params.res_dir / f"errs-{test_set_name}-{key}-{params.suffix}.txt"
        )
        with open(errs_filename, "w") as f:
            wer = write_error_stats(
                f, f"{test_set_name}-{key}", results, enable_log=True
            )
            test_set_wers[key] = wer
        logging.info("Wrote detailed error stats to {}".format(errs_filename))
    # The summary file name embeds the last setting iterated above; this is
    # kept unchanged so the output file names stay the same as before.
    summary_key = key
    sorted_wers = sorted(test_set_wers.items(), key=lambda x: x[1])
    errs_info = (
        params.res_dir
        / f"wer-summary-{test_set_name}-{summary_key}-{params.suffix}.txt"
    )
    with open(errs_info, "w") as f:
        print("settings\tWER", file=f)
        for setting, val in sorted_wers:
            print("{}\t{}".format(setting, val), file=f)
    s = "\nFor {}, WER of different settings are:\n".format(test_set_name)
    # Only the first (lowest-WER) line is tagged as the best setting.
    note = "\tbest for {}".format(test_set_name)
    for setting, val in sorted_wers:
        s += "{}\t{}{}\n".format(setting, val, note)
        note = ""
    logging.info(s)
@torch.no_grad()
 def main():
    parser = get_parser()
    TedLiumAsrDataModule.add_arguments(parser)
    args = parser.parse_args()
    args.exp_dir = Path(args.exp_dir)
    params = get_params()
    params.update(vars(args))
    assert params.decoding_method in (
        "greedy_search",
        "beam_search",
        "modified_beam_search",
    )
    params.res_dir = params.exp_dir / params.decoding_method
    params.suffix = f"epoch-{params.epoch}-avg-{params.avg}"
    if "beam_search" in params.decoding_method:
        params.suffix += f"-beam-{params.beam_size}"
    else:
        params.suffix += f"-context-{params.context_size}"
        params.suffix += f"-max-sym-per-frame-{params.max_sym_per_frame}"
    setup_logger(f"{params.res_dir}/log-decode-{params.suffix}")
    logging.info("Decoding started")
    device = torch.device("cpu")
    if torch.cuda.is_available():
        device = torch.device("cuda", 0)
    logging.info(f"Device: {device}")
    sp = spm.SentencePieceProcessor()
    sp.load(params.bpe_model)
    # <blk> and <unk> are defined in local/train_bpe_model.py
    params.blank_id = sp.piece_to_id("<blk>")
    params.unk_id = sp.piece_to_id("<unk>")
    params.vocab_size = sp.get_piece_size()
    logging.info(params)
    logging.info("About to create model")
    model = get_transducer_model(params)
    if params.avg == 1:
        load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
    else:
        start = params.epoch - params.avg + 1
        filenames = []
        for i in range(start, params.epoch + 1):
            if start >= 0:
                filenames.append(f"{params.exp_dir}/epoch-{i}.pt")
        logging.info(f"averaging {filenames}")
        model.to(device)
        model.load_state_dict(average_checkpoints(filenames, device=device))
    model.to(device)
    model.eval()
    model.device = device
    num_param = sum([p.numel() for p in model.parameters()])
    logging.info(f"Number of model parameters: {num_param}")
    tedlium = TedLiumAsrDataModule(args)
    dev_cuts = tedlium.dev_cuts()
    test_cuts = tedlium.test_cuts()
    dev_dl = tedlium.valid_dataloaders(dev_cuts)
    test_dl = tedlium.test_dataloaders(test_cuts)
    test_sets = ["dev", "test"]
    test_dl = [dev_dl, test_dl]
    for test_set, test_dl in zip(test_sets, test_dl):
        results_dict = decode_dataset(
            dl=test_dl,
            params=params,
            model=model,
            sp=sp,
        )
        save_results(
            params=params,
            test_set_name=test_set,
            results_dict=results_dict,
        )
    logging.info("Done!")
 # Limit PyTorch to a single intra-op thread and a single inter-op thread.
 # These calls run at import time, before main() is invoked.
 torch.set_num_threads(1)
 torch.set_num_interop_threads(1)
 # Run the decoding entry point only when executed as a script.
 if __name__ == "__main__":
    main()
--- a/egs/tedlium3/ASR/transducer_stateless/decoder.py
+++ b/egs/tedlium3/ASR/transducer_stateless/decoder.py
@ -0,0 +1,102 @@
 # Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang
 #                                                  Mingshuang Luo)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 class Decoder(nn.Module):
    """Stateless prediction network for a transducer.

    It is a modified version of the stateless decoder from
        RNN-transducer with stateless prediction network
        https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=9054419
    The recurrent connection of the prediction network is removed. Unlike
    the paper, an extra depthwise Conv1d is applied right after the
    embedding layer whenever ``context_size`` is larger than 1.
    TODO: Implement https://arxiv.org/pdf/2109.07513.pdf
    """
    def __init__(
        self,
        vocab_size: int,
        embedding_dim: int,
        blank_id: int,
        unk_id: int,
        context_size: int,
    ):
        """
        Args:
          vocab_size:
            Number of tokens of the modeling unit including blank.
          embedding_dim:
            Dimension of the input embedding.
          blank_id:
            The ID of the blank symbol. It is used as the padding index of
            the embedding table.
          unk_id:
            The ID of the unk symbol.
          context_size:
            Number of previous words to use to predict the next word.
            1 means bigram; 2 means trigram. n means (n+1)-gram.
            Must be at least 1.
        """
        super().__init__()
        self.embedding = nn.Embedding(
            vocab_size, embedding_dim, padding_idx=blank_id
        )
        self.blank_id, self.unk_id = blank_id, unk_id
        assert context_size >= 1, context_size
        self.context_size = context_size
        if context_size > 1:
            # One filter per channel (groups == channels), no bias.
            self.conv = nn.Conv1d(
                embedding_dim,
                embedding_dim,
                kernel_size=context_size,
                padding=0,
                groups=embedding_dim,
                bias=False,
            )
    def forward(self, y: torch.Tensor, need_pad: bool = True) -> torch.Tensor:
        """
        Args:
          y:
            A 2-D tensor of shape (N, U).
          need_pad:
            True to left pad the input. Should be True during training.
            False to not pad the input. Should be False during inference,
            in which case U must equal ``context_size``.
        Returns:
          Return a tensor of shape (N, U, embedding_dim) when padding is
          applied or when ``context_size`` is 1.
        """
        embedded = self.embedding(y)
        if self.context_size <= 1:
            # No convolution was created; the embedding is the output.
            return embedded
        # Conv1d wants channels first: (N, U, C) -> (N, C, U)
        channels_first = embedded.permute(0, 2, 1)
        if need_pad is True:
            left_pad = self.context_size - 1
            channels_first = F.pad(channels_first, pad=(left_pad, 0))
        else:
            # During inference time, there is no need to do extra padding
            # as we only need one output
            assert channels_first.size(-1) == self.context_size
        convolved = self.conv(channels_first)
        # Back to (N, U, C)
        return convolved.permute(0, 2, 1)
--- a/egs/tedlium3/ASR/transducer_stateless/encoder_interface.py
+++ b/egs/tedlium3/ASR/transducer_stateless/encoder_interface.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/transducer_stateless/encoder_interface.py
--- a/egs/tedlium3/ASR/transducer_stateless/export.py
+++ b/egs/tedlium3/ASR/transducer_stateless/export.py
@ -0,0 +1,252 @@
 #!/usr/bin/env python3
 #
 # Copyright 2021 Xiaomi Corporation (Author: Fangjun Kuang
 # 					     Mingshuang Luo)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # This script converts several saved checkpoints
 # to a single one using model averaging.
 """
 Usage:
 ./transducer_stateless/export.py \
  --exp-dir ./transducer_stateless/exp \
  --bpe-model data/lang_bpe_500/bpe.model \
  --epoch 29 \
  --avg 16
 It will generate a file exp_dir/pretrained.pt
 To use the generated file with `transducer_stateless/decode.py`, you can do:
    cd /path/to/exp_dir
    ln -s pretrained.pt epoch-9999.pt
    cd /path/to/egs/tedlium3/ASR
    ./transducer_stateless/decode.py \
        --exp-dir ./transducer_stateless/exp \
        --epoch 9999 \
        --avg 1 \
        --max-duration 100 \
        --bpe-model data/lang_bpe_500/bpe.model
 """
 import argparse
 import logging
 from pathlib import Path
 import sentencepiece as spm
 import torch
 import torch.nn as nn
 from conformer import Conformer
 from decoder import Decoder
 from joiner import Joiner
 from model import Transducer
 from icefall.checkpoint import average_checkpoints, load_checkpoint
 from icefall.env import get_env_info
 from icefall.utils import AttributeDict, str2bool
 def get_parser():
    """Build the command-line parser for the export script.

    Returns:
      An :class:`argparse.ArgumentParser` with options selecting the
      checkpoint(s) to average (``--epoch``, ``--avg``, ``--exp-dir``), the
      BPE model, the decoder context size and whether to apply
      ``torch.jit.script`` (``--jit``).
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        "--epoch",
        type=int,
        default=20,
        # Note the trailing space: adjacent literals are concatenated as-is.
        help="It specifies the checkpoint to use for decoding. "
        "Note: Epoch counts from 0.",
    )
    parser.add_argument(
        "--avg",
        type=int,
        default=10,
        help="Number of checkpoints to average. Automatically select "
        "consecutive checkpoints before the checkpoint specified by "
        "'--epoch'. ",
    )
    parser.add_argument(
        "--exp-dir",
        type=str,
        default="transducer_stateless/exp",
        help="""It specifies the directory where all training related
        files, e.g., checkpoints, log, etc, are saved
        """,
    )
    parser.add_argument(
        "--bpe-model",
        type=str,
        default="data/lang_bpe_500/bpe.model",
        help="Path to the BPE model",
    )
    parser.add_argument(
        "--jit",
        type=str2bool,
        default=False,
        help="""True to save a model after applying torch.jit.script.
        """,
    )
    parser.add_argument(
        "--context-size",
        type=int,
        default=2,
        help="The context size in the decoder. 1 means bigram; "
        "2 means tri-gram",
    )
    return parser
 def get_params() -> AttributeDict:
    """Return the fixed hyper-parameters needed to rebuild the model.

    Contains the hard-coded conformer settings plus ``env_info`` as
    reported by :func:`get_env_info`.
    """
    settings = {
        # parameters for conformer
        "feature_dim": 80,
        "encoder_out_dim": 512,
        "subsampling_factor": 4,
        "attention_dim": 512,
        "nhead": 8,
        "dim_feedforward": 2048,
        "num_encoder_layers": 12,
        "vgg_frontend": False,
    }
    settings["env_info"] = get_env_info()
    return AttributeDict(settings)
 def get_encoder_model(params: AttributeDict) -> nn.Module:
    """Instantiate the Conformer encoder from the settings in ``params``."""
    return Conformer(
        num_features=params.feature_dim,
        output_dim=params.encoder_out_dim,
        subsampling_factor=params.subsampling_factor,
        d_model=params.attention_dim,
        nhead=params.nhead,
        dim_feedforward=params.dim_feedforward,
        num_encoder_layers=params.num_encoder_layers,
        vgg_frontend=params.vgg_frontend,
    )
 def get_decoder_model(params: AttributeDict) -> nn.Module:
    """Instantiate the stateless decoder (prediction network).

    The embedding dimension is set to ``params.encoder_out_dim``.
    """
    decoder_kwargs = dict(
        vocab_size=params.vocab_size,
        embedding_dim=params.encoder_out_dim,
        blank_id=params.blank_id,
        unk_id=params.unk_id,
        context_size=params.context_size,
    )
    return Decoder(**decoder_kwargs)
 def get_joiner_model(params: AttributeDict) -> nn.Module:
    """Instantiate the joiner mapping ``encoder_out_dim`` to ``vocab_size``."""
    return Joiner(
        input_dim=params.encoder_out_dim,
        output_dim=params.vocab_size,
    )
 def get_transducer_model(params: AttributeDict) -> nn.Module:
    """Assemble encoder, decoder and joiner into a :class:`Transducer`."""
    # Keyword arguments are evaluated left to right, so the sub-modules are
    # still created in the order encoder, decoder, joiner.
    return Transducer(
        encoder=get_encoder_model(params),
        decoder=get_decoder_model(params),
        joiner=get_joiner_model(params),
    )
 def main():
    """Entry point: average checkpoints and save them as a single file.

    Loads either one checkpoint (``--avg 1``) or the average of the
    ``--avg`` checkpoints ending at ``--epoch`` from ``--exp-dir``, moves
    the model to the CPU and writes ``pretrained.pt`` (a dict with a
    ``"model"`` state dict) into the experiment directory.
    """
    args = get_parser().parse_args()
    args.exp_dir = Path(args.exp_dir)
    # TorchScript export is not supported yet, so the jit branch further
    # down cannot currently be reached.
    assert args.jit is False, "Support torchscript will be added later"
    params = get_params()
    params.update(vars(args))
    device = torch.device("cpu")
    if torch.cuda.is_available():
        device = torch.device("cuda", 0)
    logging.info(f"device: {device}")
    sp = spm.SentencePieceProcessor()
    sp.load(params.bpe_model)
    # <blk> and <unk> are defined in local/train_bpe_model.py
    params.blank_id = sp.piece_to_id("<blk>")
    params.unk_id = sp.piece_to_id("<unk>")
    params.vocab_size = sp.get_piece_size()
    logging.info(params)
    logging.info("About to create model")
    model = get_transducer_model(params)
    model.to(device)
    if params.avg == 1:
        load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
    else:
        start = params.epoch - params.avg + 1
        # Epochs count from 0. Skip the (non-existent) negative indices that
        # appear when --avg is larger than epoch + 1, instead of ending up
        # with an empty list of checkpoints.
        filenames = [
            f"{params.exp_dir}/epoch-{i}.pt"
            for i in range(max(start, 0), params.epoch + 1)
        ]
        logging.info(f"averaging {filenames}")
        model.load_state_dict(average_checkpoints(filenames, device=device))
    # Export from the CPU, in evaluation mode.
    model.to("cpu")
    model.eval()
    if params.jit:
        logging.info("Using torch.jit.script")
        model = torch.jit.script(model)
        filename = params.exp_dir / "cpu_jit.pt"
        model.save(str(filename))
        logging.info(f"Saved to {filename}")
    else:
        logging.info("Not using torch.jit.script")
        # Save it using a format so that it can be loaded
        # by :func:`load_checkpoint`
        filename = params.exp_dir / "pretrained.pt"
        torch.save({"model": model.state_dict()}, str(filename))
        logging.info(f"Saved to {filename}")
 # Script entry point: configure INFO-level logging with timestamp, level
 # and source location in each record, then run the export.
 if __name__ == "__main__":
    formatter = (
        "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    )
    logging.basicConfig(format=formatter, level=logging.INFO)
    main()
--- a/egs/tedlium3/ASR/transducer_stateless/joiner.py
+++ b/egs/tedlium3/ASR/transducer_stateless/joiner.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/transducer_stateless/joiner.py
--- a/egs/tedlium3/ASR/transducer_stateless/model.py
+++ b/egs/tedlium3/ASR/transducer_stateless/model.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/transducer_stateless/model.py
--- a/egs/tedlium3/ASR/transducer_stateless/pretrained.py
+++ b/egs/tedlium3/ASR/transducer_stateless/pretrained.py
@ -0,0 +1,343 @@
 #!/usr/bin/env python3
 # Copyright      2021  Xiaomi Corp.        (authors: Fangjun Kuang)
 # 		 2022  Xiaomi Corp.        (authors: Mingshuang Luo)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
 Usage:
 (1) greedy search
 ./transducer_stateless/pretrained.py \
        --checkpoint ./transducer_stateless/exp/pretrained.pt \
        --bpe-model ./data/lang_bpe_500/bpe.model \
        --method greedy_search \
        --max-sym-per-frame 1 \
        /path/to/foo.wav \
        /path/to/bar.wav
 (2) beam search
 ./transducer_stateless/pretrained.py \
        --checkpoint ./transducer_stateless/exp/pretrained.pt \
        --bpe-model ./data/lang_bpe_500/bpe.model \
        --method beam_search \
        --beam-size 4 \
        /path/to/foo.wav \
        /path/to/bar.wav
 (3) modified beam search
 ./transducer_stateless/pretrained.py \
        --checkpoint ./transducer_stateless/exp/pretrained.pt \
        --bpe-model ./data/lang_bpe_500/bpe.model \
        --method modified_beam_search \
        --beam-size 4 \
        /path/to/foo.wav \
        /path/to/bar.wav
 You can also use `./transducer_stateless/exp/epoch-xx.pt`.
 Note: ./transducer_stateless/exp/pretrained.pt is generated by
 ./transducer_stateless/export.py
 """
 import argparse
 import logging
 import math
 from typing import List
 import kaldifeat
 import sentencepiece as spm
 import torch
 import torch.nn as nn
 import torchaudio
 from beam_search import beam_search, greedy_search, modified_beam_search
 from conformer import Conformer
 from decoder import Decoder
 from joiner import Joiner
 from model import Transducer
 from torch.nn.utils.rnn import pad_sequence
 from icefall.env import get_env_info
 from icefall.utils import AttributeDict
 def get_parser():
    """Build the command-line parser for decoding sound files.

    Returns:
      An :class:`argparse.ArgumentParser` taking the checkpoint and BPE
      model paths (both required), the decoding method and its options, and
      one or more positional sound files.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        "--checkpoint",
        type=str,
        required=True,
        help="Path to the checkpoint. "
        "The checkpoint is assumed to be saved by "
        "icefall.checkpoint.save_checkpoint().",
    )
    parser.add_argument(
        "--bpe-model",
        type=str,
        # The script always loads the BPE model, so fail early with a clear
        # argparse error instead of passing None to the loader.
        required=True,
        help="Path to bpe.model.",
    )
    parser.add_argument(
        "--method",
        type=str,
        default="greedy_search",
        help="""Possible values are:
          - greedy_search
          - beam_search
          - modified_beam_search
        """,
    )
    parser.add_argument(
        "sound_files",
        type=str,
        nargs="+",
        help="The input sound file(s) to transcribe. "
        "Supported formats are those supported by torchaudio.load(). "
        "For example, wav and flac are supported. "
        "The sample rate has to be 16kHz.",
    )
    parser.add_argument(
        "--beam-size",
        type=int,
        default=4,
        help="Used only when --method is beam_search or modified_beam_search",
    )
    parser.add_argument(
        "--context-size",
        type=int,
        default=2,
        help="The context size in the decoder. 1 means bigram; "
        "2 means tri-gram",
    )
    parser.add_argument(
        "--max-sym-per-frame",
        type=int,
        default=3,
        help="""Maximum number of symbols per frame. Used only when
        --method is greedy_search.
        """,
    )
    return parser
 def get_params() -> AttributeDict:
    """Return the fixed parameters used to rebuild the model.

    Contains the expected audio sample rate, the hard-coded conformer
    settings, and ``env_info`` as reported by :func:`get_env_info`.
    """
    settings = dict(
        sample_rate=16000,
        # parameters for conformer
        feature_dim=80,
        encoder_out_dim=512,
        subsampling_factor=4,
        attention_dim=512,
        nhead=8,
        dim_feedforward=2048,
        num_encoder_layers=12,
        vgg_frontend=False,
    )
    settings["env_info"] = get_env_info()
    return AttributeDict(settings)
 def get_encoder_model(params: AttributeDict) -> nn.Module:
    """Instantiate the Conformer encoder from the settings in ``params``."""
    encoder_kwargs = {
        "num_features": params.feature_dim,
        "output_dim": params.encoder_out_dim,
        "subsampling_factor": params.subsampling_factor,
        "d_model": params.attention_dim,
        "nhead": params.nhead,
        "dim_feedforward": params.dim_feedforward,
        "num_encoder_layers": params.num_encoder_layers,
        "vgg_frontend": params.vgg_frontend,
    }
    return Conformer(**encoder_kwargs)
 def get_decoder_model(params: AttributeDict) -> nn.Module:
    """Instantiate the stateless decoder (prediction network).

    The embedding dimension is set to ``params.encoder_out_dim``.
    """
    return Decoder(
        vocab_size=params.vocab_size,
        embedding_dim=params.encoder_out_dim,
        blank_id=params.blank_id,
        unk_id=params.unk_id,
        context_size=params.context_size,
    )
 def get_joiner_model(params: AttributeDict) -> nn.Module:
    """Instantiate the joiner mapping ``encoder_out_dim`` to ``vocab_size``."""
    joiner_kwargs = dict(
        input_dim=params.encoder_out_dim,
        output_dim=params.vocab_size,
    )
    return Joiner(**joiner_kwargs)
 def get_transducer_model(params: AttributeDict) -> nn.Module:
    """Assemble encoder, decoder and joiner into a :class:`Transducer`."""
    # Keyword arguments are evaluated left to right, so the sub-modules are
    # still created in the order encoder, decoder, joiner.
    return Transducer(
        encoder=get_encoder_model(params),
        decoder=get_decoder_model(params),
        joiner=get_joiner_model(params),
    )
 def read_sound_files(
    filenames: List[str], expected_sample_rate: float
 ) -> List[torch.Tensor]:
    """Load sound files and return the first channel of each.

    Args:
      filenames:
        A list of sound filenames.
      expected_sample_rate:
        The sample rate every file must have; a mismatch triggers an
        assertion failure.
    Returns:
      A list with one tensor per input file, in the same order. Each tensor
      is the first channel (``wave[0]``) of what ``torchaudio.load``
      returned for that file.
    """
    def _first_channel(path: str) -> torch.Tensor:
        # Load one file, verify its sample rate and keep only channel 0.
        wave, sample_rate = torchaudio.load(path)
        assert sample_rate == expected_sample_rate, (
            f"expected sample rate: {expected_sample_rate}. "
            f"Given: {sample_rate}"
        )
        return wave[0]

    return [_first_channel(name) for name in filenames]
@torch.no_grad()
 def main():
    parser = get_parser()
    args = parser.parse_args()
    params = get_params()
    params.update(vars(args))
    sp = spm.SentencePieceProcessor()
    sp.load(params.bpe_model)
    # <blk> and <unk> are defined in local/train_bpe_model.py
    params.blank_id = sp.piece_to_id("<blk>")
    params.unk_id = sp.piece_to_id("<unk>")
    params.vocab_size = sp.get_piece_size()
    logging.info(f"{params}")
    device = torch.device("cpu")
    if torch.cuda.is_available():
        device = torch.device("cuda", 0)
    logging.info(f"device: {device}")
    logging.info("Creating model")
    model = get_transducer_model(params)
    checkpoint = torch.load(args.checkpoint, map_location="cpu")
    model.load_state_dict(checkpoint["model"], strict=False)
    model.to(device)
    model.eval()
    model.device = device
    logging.info("Constructing Fbank computer")
    opts = kaldifeat.FbankOptions()
    opts.device = device
    opts.frame_opts.dither = 0
    opts.frame_opts.snip_edges = False
    opts.frame_opts.samp_freq = params.sample_rate
    opts.mel_opts.num_bins = params.feature_dim
    fbank = kaldifeat.Fbank(opts)
    logging.info(f"Reading sound files: {params.sound_files}")
    waves = read_sound_files(
        filenames=params.sound_files, expected_sample_rate=params.sample_rate
    )
    waves = [w.to(device) for w in waves]
    logging.info("Decoding started")
    features = fbank(waves)
    feature_lengths = [f.size(0) for f in features]
    features = pad_sequence(
        features, batch_first=True, padding_value=math.log(1e-10)
    )
    feature_lengths = torch.tensor(feature_lengths, device=device)
    with torch.no_grad():
        encoder_out, encoder_out_lens = model.encoder(
            x=features, x_lens=feature_lengths
        )
    num_waves = encoder_out.size(0)
    hyps = []
    msg = f"Using {params.method}"
    if params.method == "beam_search":
        msg += f" with beam size {params.beam_size}"
    logging.info(msg)
    for i in range(num_waves):
        # fmt: off
        encoder_out_i = encoder_out[i:i+1, :encoder_out_lens[i]]
        # fmt: on
        if params.method == "greedy_search":
            hyp = greedy_search(
                model=model,
                encoder_out=encoder_out_i,
                max_sym_per_frame=params.max_sym_per_frame,
            )
        elif params.method == "beam_search":
            hyp = beam_search(
                model=model, encoder_out=encoder_out_i, beam=params.beam_size
            )
        elif params.method == "modified_beam_search":
            hyp = modified_beam_search(
                model=model, encoder_out=encoder_out_i, beam=params.beam_size
            )
        else:
            raise ValueError(f"Unsupported method: {params.method}")
        hyps.append(sp.decode(hyp).split())
    s = "\n"
    for filename, hyp in zip(params.sound_files, hyps):
        words = " ".join(hyp)
        s += f"{filename}:\n{words}\n\n"
    logging.info(s)
    logging.info("Decoding Done")
 if __name__ == "__main__":
    # Script entry point: configure root logging (timestamp, level and
    # file:line prefix, INFO and above) and then run main().
    formatter = (
        "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    )
    logging.basicConfig(format=formatter, level=logging.INFO)
    main()
--- a/egs/tedlium3/ASR/transducer_stateless/subsampling.py
+++ b/egs/tedlium3/ASR/transducer_stateless/subsampling.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/transducer_stateless/subsampling.py
--- a/egs/tedlium3/ASR/transducer_stateless/test_decoder.py
+++ b/egs/tedlium3/ASR/transducer_stateless/test_decoder.py
@ -0,0 +1,61 @@
 #!/usr/bin/env python3
 # Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang,
 #                                                  Mingshuang Luo)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
 To run this file, do:
    cd icefall/egs/tedlium3/ASR
    python ./transducer_stateless/test_decoder.py
 """
 import torch
 from decoder import Decoder
 def test_decoder():
    """Check the Decoder output shape for padded and unpadded input."""
    vocab_size, embedding_dim, context_size = 3, 128, 4
    decoder = Decoder(
        vocab_size=vocab_size,
        embedding_dim=embedding_dim,
        blank_id=0,
        unk_id=2,
        context_size=context_size,
    )
    batch, num_tokens = 100, 20

    # Default call: one output vector per input token.
    tokens = torch.randint(low=0, high=vocab_size, size=(batch, num_tokens))
    out = decoder(tokens)
    assert out.shape == (batch, num_tokens, embedding_dim)

    # for inference: exactly context_size tokens in, a single step out
    tokens = torch.randint(
        low=0, high=vocab_size, size=(batch, context_size)
    )
    out = decoder(tokens, need_pad=False)
    assert out.shape == (batch, 1, embedding_dim)
 def main():
    """Run the decoder shape test."""
    test_decoder()
 # Allow running this file directly as a script.
 if __name__ == "__main__":
    main()
--- a/egs/tedlium3/ASR/transducer_stateless/train.py
+++ b/egs/tedlium3/ASR/transducer_stateless/train.py
@ -0,0 +1,752 @@
 #!/usr/bin/env python3
 # Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang,
 #                                                  Wei Kang
 #                                                  Mingshuang Luo)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
 Usage:
 export CUDA_VISIBLE_DEVICES="0,1,2,3"
 ./transducer_stateless/train.py \
  --world-size 4 \
  --num-epochs 30 \
  --start-epoch 0 \
  --exp-dir transducer_stateless/exp \
  --max-duration 200
 """
 import argparse
 import logging
 from pathlib import Path
 from shutil import copyfile
 from typing import Optional, Tuple
 import k2
 import sentencepiece as spm
 import torch
 import torch.multiprocessing as mp
 import torch.nn as nn
 from asr_datamodule import TedLiumAsrDataModule
 from conformer import Conformer
 from decoder import Decoder
 from joiner import Joiner
 from lhotse.cut import Cut
 from lhotse.utils import fix_random_seed
 from local.convert_transcript_words_to_bpe_ids import convert_texts_into_ids
 from model import Transducer
 from torch import Tensor
 from torch.nn.parallel import DistributedDataParallel as DDP
 from torch.nn.utils import clip_grad_norm_
 from torch.utils.tensorboard import SummaryWriter
 from transformer import Noam
 from icefall.checkpoint import load_checkpoint
 from icefall.checkpoint import save_checkpoint as save_checkpoint_impl
 from icefall.dist import cleanup_dist, setup_dist
 from icefall.env import get_env_info
 from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool
 def get_parser():
    """Create the command-line parser for the training script.

    The parser covers the DDP setup (--world-size, --master-port), logging
    (--tensorboard), the epoch range (--num-epochs, --start-epoch), paths
    (--exp-dir, --bpe-model) and model/optimizer settings (--lr-factor,
    --context-size, --modified-transducer-prob, --seed). Defaults are shown
    in --help via ArgumentDefaultsHelpFormatter.

    Returns:
      Return the configured argparse.ArgumentParser.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        "--world-size",
        type=int,
        default=1,
        help="Number of GPUs for DDP training.",
    )
    parser.add_argument(
        "--master-port",
        type=int,
        default=12354,
        help="Master port to use for DDP training.",
    )
    parser.add_argument(
        "--tensorboard",
        type=str2bool,
        default=True,
        help="Should various information be logged in tensorboard.",
    )
    parser.add_argument(
        "--num-epochs",
        type=int,
        default=30,
        help="Number of epochs to train.",
    )
    parser.add_argument(
        "--start-epoch",
        type=int,
        default=0,
        help="""Resume training from this epoch.
        If it is positive, it will load checkpoint from
        transducer_stateless/exp/epoch-{start_epoch-1}.pt
        """,
    )
    parser.add_argument(
        "--exp-dir",
        type=str,
        default="transducer_stateless/exp",
        help="""The experiment dir.
        It specifies the directory where all training related
        files, e.g., checkpoints, log, etc, are saved
        """,
    )
    parser.add_argument(
        "--bpe-model",
        type=str,
        default="data/lang_bpe_500/bpe.model",
        help="Path to the BPE model",
    )
    parser.add_argument(
        "--lr-factor",
        type=float,
        default=5.0,
        help="The lr_factor for Noam optimizer",
    )
    parser.add_argument(
        "--context-size",
        type=int,
        default=2,
        help="The context size in the decoder. 1 means bigram; "
        "2 means tri-gram",
    )
    parser.add_argument(
        "--modified-transducer-prob",
        type=float,
        default=0.25,
        help="""The probability to use modified transducer loss.
        In modified transducer, it limits the maximum number of symbols
        per frame to 1. See also the option --max-sym-per-frame in
        transducer_stateless/decode.py
        """,
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=42,
        help="The seed for random generators intended for reproducibility",
    )
    return parser
 def get_params() -> AttributeDict:
    """Return the default training parameters.

    Everything that is not supplied on the command line lives here.
    Command-line options are merged into the returned object after parsing,
    so they can be read from it as well.

    Entries:
        - best_train_loss / best_valid_loss: Lowest training / validation
          loss seen so far; both start at +inf and are updated during
          training.
        - best_train_epoch / best_valid_epoch: Epoch at which the
          corresponding best loss was observed; both start at -1.
        - batch_idx_train: Number of batches trained so far across epochs;
          used as the step when writing statistics to tensorboard.
        - log_interval: Print training loss if batch_idx % log_interval
          is 0.
        - reset_interval: Controls how quickly the running training
          statistics decay.
        - valid_interval: Run validation if batch_idx % valid_interval
          is 0.
        - feature_dim: The model input dim. It has to match the one used
          in computing features.
        - encoder_out_dim: Output dim of the encoder; also used for the
          decoder embedding and the joiner input.
        - subsampling_factor: The subsampling factor for the model.
        - attention_dim: Hidden dim for multi-head attention model.
        - nhead, dim_feedforward, num_encoder_layers, vgg_frontend:
          Passed straight to the Conformer encoder.
        - warm_step: The warm_step for Noam optimizer.
        - env_info: Whatever get_env_info() reports.
    """
    bookkeeping = {
        "best_train_loss": float("inf"),
        "best_valid_loss": float("inf"),
        "best_train_epoch": -1,
        "best_valid_epoch": -1,
        "batch_idx_train": 0,
        "log_interval": 50,
        "reset_interval": 200,
        "valid_interval": 3000,  # For the 100h subset, use 800
    }
    # parameters for conformer
    conformer = {
        "feature_dim": 80,
        "encoder_out_dim": 512,
        "subsampling_factor": 4,
        "attention_dim": 512,
        "nhead": 8,
        "dim_feedforward": 2048,
        "num_encoder_layers": 12,
        "vgg_frontend": False,
    }
    # parameters for Noam
    noam = {
        "warm_step": 80000,  # For the 100h subset, use 8k
    }
    # Unpacking keeps the keys in the same order as a single literal would.
    return AttributeDict(
        {**bookkeeping, **conformer, **noam, "env_info": get_env_info()}
    )
 def get_encoder_model(params: AttributeDict) -> nn.Module:
    """Create the Conformer encoder configured by ``params``.

    Args:
      params:
        Must provide feature_dim, encoder_out_dim, subsampling_factor,
        attention_dim, nhead, dim_feedforward, num_encoder_layers and
        vgg_frontend.
    Returns:
      Return the newly constructed Conformer.
    """
    # TODO: We can add an option to switch between Conformer and Transformer
    encoder_config = {
        "num_features": params.feature_dim,
        "output_dim": params.encoder_out_dim,
        "subsampling_factor": params.subsampling_factor,
        "d_model": params.attention_dim,
        "nhead": params.nhead,
        "dim_feedforward": params.dim_feedforward,
        "num_encoder_layers": params.num_encoder_layers,
        "vgg_frontend": params.vgg_frontend,
    }
    return Conformer(**encoder_config)
 def get_decoder_model(params: AttributeDict) -> nn.Module:
    """Create the Decoder configured by ``params``.

    Args:
      params:
        Must provide vocab_size, encoder_out_dim (used as the embedding
        dimension), blank_id, unk_id and context_size.
    Returns:
      Return the newly constructed Decoder.
    """
    return Decoder(
        vocab_size=params.vocab_size,
        embedding_dim=params.encoder_out_dim,
        blank_id=params.blank_id,
        unk_id=params.unk_id,
        context_size=params.context_size,
    )
 def get_joiner_model(params: AttributeDict) -> nn.Module:
    """Create the Joiner taking encoder_out_dim inputs to vocab_size outputs.

    Args:
      params:
        Must provide encoder_out_dim and vocab_size.
    Returns:
      Return the newly constructed Joiner.
    """
    return Joiner(
        input_dim=params.encoder_out_dim,
        output_dim=params.vocab_size,
    )
 def get_transducer_model(params: AttributeDict) -> nn.Module:
    """Create the Transducer from freshly built sub-modules.

    The encoder is built first, then the decoder, then the joiner.

    Args:
      params:
        Forwarded unchanged to the three sub-module builders.
    Returns:
      Return the assembled Transducer.
    """
    return Transducer(
        encoder=get_encoder_model(params),
        decoder=get_decoder_model(params),
        joiner=get_joiner_model(params),
    )
 def load_checkpoint_if_available(
    params: AttributeDict,
    model: nn.Module,
    optimizer: Optional[torch.optim.Optimizer] = None,
    scheduler: Optional[torch.optim.lr_scheduler._LRScheduler] = None,
 ) -> Optional[dict]:
    """Load checkpoint from file.

    If params.start_epoch is positive, it will load the checkpoint from
    `params.start_epoch - 1`. Otherwise, this function does nothing.

    Apart from loading state dict for `model`, `optimizer` and `scheduler`,
    it also updates `best_train_epoch`, `best_valid_epoch`,
    `batch_idx_train`, `best_train_loss` and `best_valid_loss` in `params`.

    Args:
      params:
        The return value of :func:`get_params`.
      model:
        The training model.
      optimizer:
        The optimizer that we are using.
      scheduler:
        The learning rate scheduler we are using.
    Returns:
      Return None when `params.start_epoch` is not positive. Otherwise
      return the object produced by `load_checkpoint`; it is indexed by key
      here, and the caller checks it for an "optimizer" entry.
    """
    if params.start_epoch <= 0:
        return None

    filename = params.exp_dir / f"epoch-{params.start_epoch-1}.pt"
    saved_params = load_checkpoint(
        filename,
        model=model,
        optimizer=optimizer,
        scheduler=scheduler,
    )

    # Restore the training statistics stored alongside the weights.
    restored_keys = (
        "best_train_epoch",
        "best_valid_epoch",
        "batch_idx_train",
        "best_train_loss",
        "best_valid_loss",
    )
    for k in restored_keys:
        params[k] = saved_params[k]

    return saved_params
 def save_checkpoint(
    params: AttributeDict,
    model: nn.Module,
    optimizer: Optional[torch.optim.Optimizer] = None,
    scheduler: Optional[torch.optim.lr_scheduler._LRScheduler] = None,
    rank: int = 0,
 ) -> None:
    """Write the checkpoint for the current epoch; only rank 0 does so.

    The file is `params.exp_dir / epoch-{params.cur_epoch}.pt`. When the
    current epoch is also the best training (validation) epoch, the file is
    additionally copied to `best-train-loss.pt` (`best-valid-loss.pt`) in
    the same directory.

    Args:
      params:
        It is returned by :func:`get_params`.
      model:
        The training model.
      optimizer:
        Optional optimizer, forwarded to the checkpoint writer.
      scheduler:
        Optional learning rate scheduler, forwarded to the checkpoint
        writer.
      rank:
        Rank of the calling process. Nothing happens unless it is 0.
    """
    if rank != 0:
        return

    filename = params.exp_dir / f"epoch-{params.cur_epoch}.pt"
    save_checkpoint_impl(
        filename=filename,
        model=model,
        params=params,
        optimizer=optimizer,
        scheduler=scheduler,
        rank=rank,
    )

    best_copies = (
        (params.best_train_epoch, "best-train-loss.pt"),
        (params.best_valid_epoch, "best-valid-loss.pt"),
    )
    for best_epoch, best_name in best_copies:
        if best_epoch == params.cur_epoch:
            copyfile(src=filename, dst=params.exp_dir / best_name)
 def compute_loss(
    params: AttributeDict,
    model: nn.Module,
    sp: spm.SentencePieceProcessor,
    batch: dict,
    is_training: bool,
 ) -> Tuple[Tensor, MetricsTracker]:
    """
    Compute the transducer loss for one batch.

    Args:
      params:
        Parameters for training. See :func:`get_params`.
      model:
        The model for training. It is called with the features, their
        lengths, the token ids and `modified_transducer_prob`, and it must
        expose a `device` attribute.
      sp:
        The BPE model, used to turn the transcripts into token ids.
      batch:
        A batch of data. See `lhotse.dataset.K2SpeechRecognitionDataset()`
        for the content in it.
      is_training:
        True for training. False for validation. When it is True, this
        function enables autograd during computation; when it is False, it
        disables autograd.
    Returns:
      Return a tuple of the loss tensor and a MetricsTracker holding
      "frames" (sum of the subsampled frame counts) and "loss" (the loss
      as a Python number).
    """
    device = model.device

    # at entry, feature is (N, T, C)
    feature = batch["inputs"]
    assert feature.ndim == 3
    feature = feature.to(device)

    supervisions = batch["supervisions"]
    feature_lens = supervisions["num_frames"].to(device)

    token_ids = convert_texts_into_ids(
        batch["supervisions"]["text"], params.unk_id, sp=sp
    )
    y = k2.RaggedTensor(token_ids).to(device)

    with torch.set_grad_enabled(is_training):
        loss = model(
            x=feature,
            x_lens=feature_lens,
            y=y,
            modified_transducer_prob=params.modified_transducer_prob,
        )

    assert loss.requires_grad == is_training

    info = MetricsTracker()
    subsampled_lens = feature_lens // params.subsampling_factor
    info["frames"] = subsampled_lens.sum().item()
    # Note: We use reduction=sum while computing the loss.
    info["loss"] = loss.detach().cpu().item()
    return loss, info
 def compute_validation_loss(
    params: AttributeDict,
    model: nn.Module,
    sp: spm.SentencePieceProcessor,
    valid_dl: torch.utils.data.DataLoader,
    world_size: int = 1,
 ) -> MetricsTracker:
    """Run the validation process.

    The model is switched to eval mode and is NOT switched back; the caller
    is responsible for calling `model.train()` afterwards.

    Args:
      params:
        Parameters for training. `best_valid_loss` and `best_valid_epoch`
        are updated when the per-frame validation loss improves.
      model:
        The model to evaluate. It must expose a `device` attribute.
      sp:
        The BPE model, forwarded to :func:`compute_loss`.
      valid_dl:
        Dataloader for the validation dataset.
      world_size:
        Number of processes in DDP training. If it is larger than 1, the
        statistics are reduced across processes.
    Returns:
      Return the accumulated validation statistics.
    """
    model.eval()

    tot_loss = MetricsTracker()
    for batch in valid_dl:
        loss, loss_info = compute_loss(
            params=params,
            model=model,
            sp=sp,
            batch=batch,
            is_training=False,
        )
        assert loss.requires_grad is False
        tot_loss = tot_loss + loss_info

    if world_size > 1:
        # Use model.device instead of loss.device: `loss` is unbound when
        # this rank's dataloader yields no batch. compute_loss moves its
        # inputs to model.device, so presumably the two are the same.
        tot_loss.reduce(model.device)

    loss_value = tot_loss["loss"] / tot_loss["frames"]
    if loss_value < params.best_valid_loss:
        params.best_valid_epoch = params.cur_epoch
        params.best_valid_loss = loss_value

    return tot_loss
 def train_one_epoch(
    params: AttributeDict,
    model: nn.Module,
    optimizer: torch.optim.Optimizer,
    sp: spm.SentencePieceProcessor,
    train_dl: torch.utils.data.DataLoader,
    valid_dl: torch.utils.data.DataLoader,
    tb_writer: Optional[SummaryWriter] = None,
    world_size: int = 1,
 ) -> None:
    """Train the model for one epoch.

    The per-frame running training loss is stored in `params.train_loss`
    when the epoch ends, and `params.best_train_loss` /
    `params.best_train_epoch` are updated if it improved. Validation runs
    every `params.valid_interval` batches (but not on batch 0).

    Args:
      params:
        It is returned by :func:`get_params`.
      model:
        The model for training.
      optimizer:
        The optimizer we are using.
      sp:
        The BPE model, forwarded to :func:`compute_loss`.
      train_dl:
        Dataloader for the training dataset.
      valid_dl:
        Dataloader for the validation dataset.
      tb_writer:
        Writer to write log messages to tensorboard.
      world_size:
        Number of nodes in DDP training. If it is 1, DDP is disabled.
    """
    model.train()

    running = MetricsTracker()

    for batch_idx, batch in enumerate(train_dl):
        params.batch_idx_train += 1
        batch_size = len(batch["supervisions"]["text"])

        loss, batch_info = compute_loss(
            params=params,
            model=model,
            sp=sp,
            batch=batch,
            is_training=True,
        )
        # Decay the running statistics, then add the current batch.
        running = (running * (1 - 1 / params.reset_interval)) + batch_info

        # NOTE: We use reduction==sum and loss is computed over utterances
        # in the batch and there is no normalization to it so far.
        optimizer.zero_grad()
        loss.backward()
        clip_grad_norm_(model.parameters(), 5.0, 2.0)
        optimizer.step()

        if batch_idx % params.log_interval == 0:
            logging.info(
                f"Epoch {params.cur_epoch}, "
                f"batch {batch_idx}, loss[{batch_info}], "
                f"tot_loss[{running}], batch size: {batch_size}"
            )
            if tb_writer is not None:
                batch_info.write_summary(
                    tb_writer, "train/current_", params.batch_idx_train
                )
                running.write_summary(
                    tb_writer, "train/tot_", params.batch_idx_train
                )

        due_for_validation = (
            batch_idx > 0 and batch_idx % params.valid_interval == 0
        )
        if not due_for_validation:
            continue

        logging.info("Computing validation loss")
        valid_info = compute_validation_loss(
            params=params,
            model=model,
            sp=sp,
            valid_dl=valid_dl,
            world_size=world_size,
        )
        # Validation leaves the model in eval mode; switch back.
        model.train()
        logging.info(f"Epoch {params.cur_epoch}, validation: {valid_info}")
        if tb_writer is not None:
            valid_info.write_summary(
                tb_writer, "train/valid_", params.batch_idx_train
            )

    params.train_loss = running["loss"] / running["frames"]
    if params.train_loss < params.best_train_loss:
        params.best_train_epoch = params.cur_epoch
        params.best_train_loss = params.train_loss
 def run(rank, world_size, args):
    """
    Train the transducer in one process: set up logging and (optionally)
    DDP, build the model and optimizer, optionally resume from a
    checkpoint, build the dataloaders, run an OOM sanity check, and then
    train epoch by epoch, saving a checkpoint after each one.

    Args:
      rank:
        It is a value between 0 and `world_size-1`, which is
        passed automatically by `mp.spawn()` in :func:`main`.
        The node with rank 0 is responsible for saving checkpoint.
      world_size:
        Number of GPUs for DDP training.
      args:
        The return value of get_parser().parse_args()
    """
    # Command-line options override / extend the defaults.
    params = get_params()
    params.update(vars(args))
    fix_random_seed(params.seed)
    if world_size > 1:
        setup_dist(rank, world_size, params.master_port)
    setup_logger(f"{params.exp_dir}/log/log-train")
    logging.info("Training started")
    # Only rank 0 writes tensorboard summaries.
    if args.tensorboard and rank == 0:
        tb_writer = SummaryWriter(log_dir=f"{params.exp_dir}/tensorboard")
    else:
        tb_writer = None
    # Each process uses the CUDA device whose index equals its rank.
    device = torch.device("cpu")
    if torch.cuda.is_available():
        device = torch.device("cuda", rank)
    logging.info(f"Device: {device}")
    sp = spm.SentencePieceProcessor()
    sp.load(params.bpe_model)
    # <blk> and <unk> are defined in local/train_bpe_model.py
    params.blank_id = sp.piece_to_id("<blk>")
    params.unk_id = sp.piece_to_id("<unk>")
    params.vocab_size = sp.get_piece_size()
    logging.info(params)
    logging.info("About to create model")
    model = get_transducer_model(params)
    num_param = sum([p.numel() for p in model.parameters()])
    logging.info(f"Number of model parameters: {num_param}")
    # Only the model weights and the training statistics are restored here;
    # the optimizer state is loaded further below, once the optimizer exists.
    checkpoints = load_checkpoint_if_available(params=params, model=model)
    model.to(device)
    if world_size > 1:
        logging.info("Using DDP")
        model = DDP(model, device_ids=[rank])
    # compute_loss() reads model.device, so attach it to whatever object
    # (plain model or DDP wrapper) is used from here on.
    model.device = device
    optimizer = Noam(
        model.parameters(),
        model_size=params.attention_dim,
        factor=params.lr_factor,
        warm_step=params.warm_step,
    )
    if checkpoints and "optimizer" in checkpoints:
        logging.info("Loading optimizer state dict")
        optimizer.load_state_dict(checkpoints["optimizer"])
    tedlium = TedLiumAsrDataModule(args)
    train_cuts = tedlium.train_cuts()
    def remove_short_and_long_utt(c: Cut):
        # Keep only utterances with duration between 1 second and 17 seconds
        return 1.0 <= c.duration <= 17.0
    # Count the cuts before and after filtering to report how many were dropped.
    num_in_total = len(train_cuts)
    train_cuts = train_cuts.filter(remove_short_and_long_utt)
    num_left = len(train_cuts)
    num_removed = num_in_total - num_left
    removed_percent = num_removed / num_in_total * 100
    logging.info(f"Before removing short and long utterances: {num_in_total}")
    logging.info(f"After removing short and long utterances: {num_left}")
    logging.info(f"Removed {num_removed} utterances ({removed_percent:.5f}%)")
    train_dl = tedlium.train_dataloaders(train_cuts)
    valid_cuts = tedlium.dev_cuts()
    valid_dl = tedlium.valid_dataloaders(valid_cuts)
    # Fail early if the batches picked by find_pessimistic_batches do not
    # fit in memory.
    scan_pessimistic_batches_for_oom(
        model=model,
        train_dl=train_dl,
        optimizer=optimizer,
        sp=sp,
        params=params,
    )
    for epoch in range(params.start_epoch, params.num_epochs):
        # Re-seed per epoch with a seed derived from the epoch number.
        fix_random_seed(params.seed + epoch)
        train_dl.sampler.set_epoch(epoch)
        cur_lr = optimizer._rate
        if tb_writer is not None:
            tb_writer.add_scalar(
                "train/learning_rate", cur_lr, params.batch_idx_train
            )
            tb_writer.add_scalar("train/epoch", epoch, params.batch_idx_train)
        if rank == 0:
            logging.info("epoch {}, learning rate {}".format(epoch, cur_lr))
        params.cur_epoch = epoch
        train_one_epoch(
            params=params,
            model=model,
            optimizer=optimizer,
            sp=sp,
            train_dl=train_dl,
            valid_dl=valid_dl,
            tb_writer=tb_writer,
            world_size=world_size,
        )
        # save_checkpoint() returns immediately on ranks other than 0.
        save_checkpoint(
            params=params,
            model=model,
            optimizer=optimizer,
            rank=rank,
        )
    logging.info("Done!")
    # Wait for every process before tearing down the process group.
    if world_size > 1:
        torch.distributed.barrier()
        cleanup_dist()
 def scan_pessimistic_batches_for_oom(
    model: nn.Module,
    train_dl: torch.utils.data.DataLoader,
    optimizer: torch.optim.Optimizer,
    sp: spm.SentencePieceProcessor,
    params: AttributeDict,
 ):
    """Run one training step on each batch from `find_pessimistic_batches`.

    A full step (forward, backward, gradient clipping, optimizer update) is
    performed for every such batch. Any RuntimeError is re-raised; when its
    message contains "CUDA out of memory", a hint about lowering
    max_duration is logged first.

    Args:
      model:
        The model for training.
      train_dl:
        Training dataloader; its sampler and dataset are used directly.
      optimizer:
        The optimizer we are using.
      sp:
        The BPE model, forwarded to :func:`compute_loss`.
      params:
        Parameters for training. See :func:`get_params`.
    """
    from lhotse.dataset import find_pessimistic_batches

    logging.info(
        "Sanity check -- see if any of the batches in epoch 0 would cause OOM."
    )
    worst_batches, crit_values = find_pessimistic_batches(train_dl.sampler)

    for criterion, cuts in worst_batches.items():
        batch = train_dl.dataset[cuts]
        try:
            optimizer.zero_grad()
            loss, _ = compute_loss(
                params=params,
                model=model,
                sp=sp,
                batch=batch,
                is_training=True,
            )
            loss.backward()
            clip_grad_norm_(model.parameters(), 5.0, 2.0)
            optimizer.step()
        except RuntimeError as err:
            if "CUDA out of memory" in str(err):
                oom_hint = (
                    "Your GPU ran out of memory with the current "
                    "max_duration setting. We recommend decreasing "
                    "max_duration and trying again.\n"
                    f"Failing criterion: {criterion} "
                    f"(={crit_values[criterion]}) ..."
                )
                logging.error(oom_hint)
            # Always propagate, whether or not it was an OOM.
            raise
 def main():
    """Parse the command line and start training in one or more processes.

    With --world-size 1 training runs in the current process; otherwise one
    process per rank is spawned via `mp.spawn`.
    """
    parser = get_parser()
    TedLiumAsrDataModule.add_arguments(parser)
    args = parser.parse_args()
    args.exp_dir = Path(args.exp_dir)

    world_size = args.world_size
    assert world_size >= 1

    if world_size == 1:
        run(rank=0, world_size=1, args=args)
        return
    mp.spawn(run, args=(world_size, args), nprocs=world_size, join=True)
 # Restrict PyTorch to a single intra-op thread and a single inter-op
 # thread. These calls sit at module level, so they run whenever this file
 # is imported or executed, not only under the __main__ guard below.
 torch.set_num_threads(1)
 torch.set_num_interop_threads(1)
 # Script entry point.
 if __name__ == "__main__":
    main()
--- a/egs/tedlium3/ASR/transducer_stateless/transformer.py
+++ b/egs/tedlium3/ASR/transducer_stateless/transformer.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/transducer_stateless/transformer.py
--- a/egs/timit/ASR/tdnn_ligru_ctc/asr_datamodule.py
+++ b/egs/timit/ASR/tdnn_ligru_ctc/asr_datamodule.py
@ -1,330 +0,0 @@
 # Copyright      2021     Piotr Żelasko
 #                2021     Xiaomi Corp.    (authors: Mingshuang Luo)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import argparse
 import logging
 from functools import lru_cache
 from pathlib import Path
 from typing import List, Union
 from lhotse import CutSet, Fbank, FbankConfig, load_manifest
 from lhotse.dataset import (
    BucketingSampler,
    CutConcatenate,
    CutMix,
    K2SpeechRecognitionDataset,
    PrecomputedFeatures,
    SingleCutSampler,
    SpecAugment,
 )
 from lhotse.dataset.input_strategies import OnTheFlyFeatures
 from torch.utils.data import DataLoader
 from icefall.dataset.datamodule import DataModule
 from icefall.utils import str2bool
 class TimitAsrDataModule(DataModule):
    """
    DataModule for k2 ASR experiments on TIMIT.
    It provides one train dataloader, one valid dataloader and one or more
    test dataloaders, all built from Lhotse cut manifests found in
    ``--feature-dir`` (``cuts_TRAIN.json.gz``, ``cuts_DEV.json.gz``,
    ``cuts_TEST.json.gz`` and, for training, ``cuts_musan.json.gz``).
    It contains all the common data pipeline modules used in ASR
    experiments, e.g.:
    - dynamic batch size,
    - bucketing samplers,
    - cut concatenation,
    - augmentation,
    - on-the-fly feature extraction
    The options that control these modules are registered by
    :meth:`add_arguments` and read from ``self.args``.
    """
    @classmethod
    def add_arguments(cls, parser: argparse.ArgumentParser):
        """Register the data related command line options.
        Args:
          parser:
            The parser to extend. The options of the parent class are
            added first, then a group named "ASR data related options".
        """
        super().add_arguments(parser)
        group = parser.add_argument_group(
            title="ASR data related options",
            description="These options are used for the preparation of "
            "PyTorch DataLoaders from Lhotse CutSet's -- they control the "
            "effective batch sizes, sampling strategies, applied data "
            "augmentations, etc.",
        )
        group.add_argument(
            "--feature-dir",
            type=Path,
            default=Path("data/fbank"),
            help="Path to directory with train/valid/test cuts.",
        )
        # The default is a float and durations are measured in seconds, so
        # parse user supplied values as float as well. argparse does not
        # apply `type` to non-string defaults, hence `type=int` used to
        # yield a float by default but an int (and a rejection of values
        # such as 150.5) when the option was given explicitly.
        group.add_argument(
            "--max-duration",
            type=float,
            default=200.0,
            help="Maximum pooled recordings duration (seconds) in a "
            "single batch. You can reduce it if it causes CUDA OOM.",
        )
        group.add_argument(
            "--bucketing-sampler",
            type=str2bool,
            default=True,
            help="When enabled, the batches will come from buckets of "
            "similar duration (saves padding frames).",
        )
        group.add_argument(
            "--num-buckets",
            type=int,
            default=30,
            help="The number of buckets for the BucketingSampler "
            "(you might want to increase it for larger datasets).",
        )
        group.add_argument(
            "--concatenate-cuts",
            type=str2bool,
            default=False,
            help="When enabled, utterances (cuts) will be concatenated "
            "to minimize the amount of padding.",
        )
        group.add_argument(
            "--duration-factor",
            type=float,
            default=1.0,
            help="Determines the maximum duration of a concatenated cut "
            "relative to the duration of the longest cut in a batch.",
        )
        group.add_argument(
            "--gap",
            type=float,
            default=1.0,
            help="The amount of padding (in seconds) inserted between "
            "concatenated cuts. This padding is filled with noise when "
            "noise augmentation is used.",
        )
        group.add_argument(
            "--on-the-fly-feats",
            type=str2bool,
            default=False,
            help="When enabled, use on-the-fly cut mixing and feature "
            "extraction. Will drop existing precomputed feature manifests "
            "if available.",
        )
        group.add_argument(
            "--shuffle",
            type=str2bool,
            default=True,
            help="When enabled (=default), the examples will be "
            "shuffled for each epoch.",
        )
        group.add_argument(
            "--return-cuts",
            type=str2bool,
            default=True,
            help="When enabled, each batch will have the "
            "field: batch['supervisions']['cut'] with the cuts that "
            "were used to construct it.",
        )
        group.add_argument(
            "--num-workers",
            type=int,
            default=2,
            help="The number of training dataloader workers that "
            "collect the batches.",
        )
    def train_dataloaders(self) -> DataLoader:
        """Create the dataloader for the training cuts.
        Musan cuts are mixed in with probability 0.5, cuts are optionally
        concatenated, and SpecAugment is applied to the features.
        Returns:
          A DataLoader whose sampler yields whole batches
          (``batch_size=None``).
        """
        logging.info("About to get train cuts")
        cuts_train = self.train_cuts()
        logging.info("About to get Musan cuts")
        cuts_musan = load_manifest(self.args.feature_dir / "cuts_musan.json.gz")
        logging.info("About to create train dataset")
        transforms = [CutMix(cuts=cuts_musan, prob=0.5, snr=(10, 20))]
        if self.args.concatenate_cuts:
            logging.info(
                "Using cut concatenation with duration factor "
                f"{self.args.duration_factor} and gap {self.args.gap}."
            )
            # Cut concatenation should be the first transform in the list,
            # so that if we e.g. mix noise in, it will fill the gaps between
            # different utterances.
            transforms = [
                CutConcatenate(
                    duration_factor=self.args.duration_factor, gap=self.args.gap
                )
            ] + transforms
        input_transforms = [
            SpecAugment(
                num_frame_masks=2,
                features_mask_size=27,
                num_feature_masks=2,
                frames_mask_size=100,
            )
        ]
        # Build the dataset exactly once; the two variants differ only in
        # the input strategy.
        if self.args.on_the_fly_feats:
            # NOTE: the PerturbSpeed transform should be added only if we
            # remove it from data prep stage.
            # Add on-the-fly speed perturbation; since originally it would
            # have increased epoch size by 3, we will apply prob 2/3 and use
            # 3x more epochs.
            # Speed perturbation probably should come first before
            # concatenation, but in principle the transforms order doesn't have
            # to be strict (e.g. could be randomized)
            # transforms = [PerturbSpeed(factors=[0.9, 1.1], p=2/3)] + transforms   # noqa
            # Drop feats to be on the safe side.
            train = K2SpeechRecognitionDataset(
                cut_transforms=transforms,
                input_strategy=OnTheFlyFeatures(
                    Fbank(FbankConfig(num_mel_bins=80))
                ),
                input_transforms=input_transforms,
                return_cuts=self.args.return_cuts,
            )
        else:
            train = K2SpeechRecognitionDataset(
                cut_transforms=transforms,
                input_transforms=input_transforms,
                return_cuts=self.args.return_cuts,
            )
        if self.args.bucketing_sampler:
            logging.info("Using BucketingSampler.")
            train_sampler = BucketingSampler(
                cuts_train,
                max_duration=self.args.max_duration,
                shuffle=self.args.shuffle,
                num_buckets=self.args.num_buckets,
                bucket_method="equal_duration",
                drop_last=True,
            )
        else:
            logging.info("Using SingleCutSampler.")
            train_sampler = SingleCutSampler(
                cuts_train,
                max_duration=self.args.max_duration,
                shuffle=self.args.shuffle,
            )
        logging.info("About to create train dataloader")
        # batch_size=None: the sampler already forms the batches.
        train_dl = DataLoader(
            train,
            sampler=train_sampler,
            batch_size=None,
            num_workers=self.args.num_workers,
            persistent_workers=False,
        )
        return train_dl
    def valid_dataloaders(self) -> DataLoader:
        """Create the dataloader for the dev cuts.
        No noise mixing or SpecAugment is applied; cuts are concatenated
        only if ``--concatenate-cuts`` is set. The data is never shuffled.
        """
        logging.info("About to get dev cuts")
        cuts_valid = self.valid_cuts()
        transforms = []
        if self.args.concatenate_cuts:
            transforms = [
                CutConcatenate(
                    duration_factor=self.args.duration_factor, gap=self.args.gap
                )
            ] + transforms
        logging.info("About to create dev dataset")
        if self.args.on_the_fly_feats:
            validate = K2SpeechRecognitionDataset(
                cut_transforms=transforms,
                input_strategy=OnTheFlyFeatures(
                    Fbank(FbankConfig(num_mel_bins=80))
                ),
                return_cuts=self.args.return_cuts,
            )
        else:
            validate = K2SpeechRecognitionDataset(
                cut_transforms=transforms,
                return_cuts=self.args.return_cuts,
            )
        valid_sampler = SingleCutSampler(
            cuts_valid,
            max_duration=self.args.max_duration,
            shuffle=False,
        )
        logging.info("About to create dev dataloader")
        valid_dl = DataLoader(
            validate,
            sampler=valid_sampler,
            batch_size=None,
            num_workers=2,
            persistent_workers=False,
        )
        return valid_dl
    def test_dataloaders(self) -> Union[DataLoader, List[DataLoader]]:
        """Create the dataloader(s) for the test cuts.
        Returns:
          A list of DataLoaders if ``test_cuts()`` returns a list,
          otherwise a single DataLoader.
        """
        cuts = self.test_cuts()
        is_list = isinstance(cuts, list)
        test_loaders = []
        if not is_list:
            # Wrap a single CutSet so both cases share the loop below.
            cuts = [cuts]
        for cuts_test in cuts:
            logging.debug("About to create test dataset")
            test = K2SpeechRecognitionDataset(
                input_strategy=OnTheFlyFeatures(
                    Fbank(FbankConfig(num_mel_bins=80))
                )
                if self.args.on_the_fly_feats
                else PrecomputedFeatures(),
                return_cuts=self.args.return_cuts,
            )
            sampler = SingleCutSampler(
                cuts_test, max_duration=self.args.max_duration
            )
            logging.debug("About to create test dataloader")
            test_dl = DataLoader(
                test, batch_size=None, sampler=sampler, num_workers=1
            )
            test_loaders.append(test_dl)
        if is_list:
            return test_loaders
        else:
            return test_loaders[0]
    # The results of the three loaders below are memoized by lru_cache
    # (keyed on `self`), so repeated calls reuse the loaded manifest.
    @lru_cache()
    def train_cuts(self) -> CutSet:
        """Load ``cuts_TRAIN.json.gz`` from ``--feature-dir``."""
        logging.info("About to get train cuts")
        cuts_train = load_manifest(self.args.feature_dir / "cuts_TRAIN.json.gz")
        return cuts_train
    @lru_cache()
    def valid_cuts(self) -> CutSet:
        """Load ``cuts_DEV.json.gz`` from ``--feature-dir``."""
        logging.info("About to get dev cuts")
        cuts_valid = load_manifest(self.args.feature_dir / "cuts_DEV.json.gz")
        return cuts_valid
    @lru_cache()
    def test_cuts(self) -> CutSet:
        """Load ``cuts_TEST.json.gz`` from ``--feature-dir``."""
        logging.debug("About to get test cuts")
        cuts_test = load_manifest(self.args.feature_dir / "cuts_TEST.json.gz")
        return cuts_test
--- a/egs/timit/ASR/tdnn_ligru_ctc/asr_datamodule.py
+++ b/egs/timit/ASR/tdnn_ligru_ctc/asr_datamodule.py
@ -0,0 +1 @@
 ../tdnn_lstm_ctc/asr_datamodule.py
--- a/egs/timit/ASR/tdnn_lstm_ctc/asr_datamodule.py
+++ b/egs/timit/ASR/tdnn_lstm_ctc/asr_datamodule.py
@ -1,5 +1,5 @@
 # Copyright      2021     Piotr Żelasko
-#                2021     Xiaomi Corp.     (authors: Mingshuang Luo)
+#                2022     Xiaomi Corporation     (Author: Mingshuang Luo)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
@ -17,6 +17,7 @@
 import argparse
 import inspect
 import logging
 from functools import lru_cache
 from pathlib import Path
@ -171,9 +172,19 @@ class TimitAsrDataModule(DataModule):
                )
            ] + transforms
        # Set the value of num_frame_masks according to Lhotse's version.
        # In different Lhotse's versions, the default of num_frame_masks is
        # different.
        num_frame_masks = 10
        num_frame_masks_parameter = inspect.signature(
            SpecAugment.__init__
        ).parameters["num_frame_masks"]
        if num_frame_masks_parameter.default == 1:
            num_frame_masks = 2
        logging.info(f"Num frame mask: {num_frame_masks}")
        input_transforms = [
            SpecAugment(
-                num_frame_masks=2,
+                num_frame_masks=num_frame_masks,
                features_mask_size=27,
                num_feature_masks=2,
                frames_mask_size=100,
--- a/icefall/checkpoint.py
+++ b/icefall/checkpoint.py
@ -15,12 +15,16 @@
 # limitations under the License.
 import glob
 import logging
 import os
 import re
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Union
 import torch
 import torch.nn as nn
 from lhotse.dataset.sampling.base import CutSampler
 from torch.cuda.amp import GradScaler
 from torch.nn.parallel import DistributedDataParallel as DDP
 from torch.optim import Optimizer
@ -34,6 +38,7 @@ def save_checkpoint(
    optimizer: Optional[Optimizer] = None,
    scheduler: Optional[_LRScheduler] = None,
    scaler: Optional[GradScaler] = None,
    sampler: Optional[CutSampler] = None,
    rank: int = 0,
 ) -> None:
    """Save training information to a file.
@ -69,6 +74,7 @@ def save_checkpoint(
        "optimizer": optimizer.state_dict() if optimizer is not None else None,
        "scheduler": scheduler.state_dict() if scheduler is not None else None,
        "grad_scaler": scaler.state_dict() if scaler is not None else None,
        "sampler": sampler.state_dict() if sampler is not None else None,
    }
    if params:
@ -85,6 +91,7 @@ def load_checkpoint(
    optimizer: Optional[Optimizer] = None,
    scheduler: Optional[_LRScheduler] = None,
    scaler: Optional[GradScaler] = None,
    sampler: Optional[CutSampler] = None,
    strict: bool = False,
 ) -> Dict[str, Any]:
    """
@ -117,6 +124,7 @@ def load_checkpoint(
    load("optimizer", optimizer)
    load("scheduler", scheduler)
    load("grad_scaler", scaler)
    load("sampler", sampler)
    return checkpoint
@ -151,3 +159,120 @@ def average_checkpoints(
            avg[k] //= n
    return avg
 def save_checkpoint_with_global_batch_idx(
    out_dir: Path,
    global_batch_idx: int,
    model: Union[nn.Module, DDP],
    params: Optional[Dict[str, Any]] = None,
    optimizer: Optional[Optimizer] = None,
    scheduler: Optional[_LRScheduler] = None,
    scaler: Optional[GradScaler] = None,
    sampler: Optional[CutSampler] = None,
    rank: int = 0,
 ):
    """Write a checkpoint named after the number of batches processed.
    The checkpoint is written to `out_dir / checkpoint-{global_batch_idx}.pt`;
    `out_dir` is created (including parents) if it does not exist yet.
    Everything except `out_dir` and `global_batch_idx` is forwarded
    unchanged to `save_checkpoint`.
    Args:
      out_dir:
        Directory that receives the checkpoint file.
      global_batch_idx:
        Number of batches processed since the very start of training;
        it becomes part of the filename.
      model:
        The model to pass on to `save_checkpoint`.
      params:
        Optional dict of training configurations.
      optimizer:
        Optional optimizer used in the training.
      scheduler:
        Optional learning rate scheduler used in the training.
      scaler:
        Optional GradScaler used for mixed precision training.
      sampler:
        Optional sampler of the training dataset.
      rank:
        Rank of the current node in DDP training; use 0 without DDP.
    """
    ckpt_dir = Path(out_dir)
    ckpt_dir.mkdir(parents=True, exist_ok=True)
    save_checkpoint(
        filename=ckpt_dir / f"checkpoint-{global_batch_idx}.pt",
        model=model,
        params=params,
        optimizer=optimizer,
        scheduler=scheduler,
        scaler=scaler,
        sampler=sampler,
        rank=rank,
    )
 def find_checkpoints(out_dir: Path) -> List[str]:
    """Find all available checkpoints in a directory.
    The checkpoint filenames have the form: `checkpoint-xxx.pt`
    where xxx is a numerical value. Files that merely start with
    `checkpoint-<digit>` but do not have exactly that form (for example
    `checkpoint-12-backup.pt`) are ignored.
    Args:
      out_dir:
        The directory where to search for checkpoints.
    Returns:
      Return a list of checkpoint filenames, sorted in descending
      order by the numerical value in the filename. The list is empty
      if nothing matches.
    """
    # The dot is escaped so that it only matches a literal ".".
    pattern = re.compile(r"checkpoint-([0-9]+)\.pt")
    idx_checkpoints = []
    # The glob is only a coarse pre-filter: `[0-9]*` means "one digit
    # followed by anything", so every candidate is validated below.
    for c in glob.glob(f"{out_dir}/checkpoint-[0-9]*.pt"):
        # Match the basename as a whole so that neither directory names
        # nor extra characters in the filename can influence the result.
        m = pattern.fullmatch(os.path.basename(c))
        if m is None:
            continue
        idx_checkpoints.append((int(m.group(1)), c))
    idx_checkpoints.sort(reverse=True, key=lambda x: x[0])
    return [c for _, c in idx_checkpoints]
 def remove_checkpoints(
    out_dir: Path,
    topk: int,
    rank: int = 0,
 ):
    """Remove checkpoints from the given directory.
    We assume that checkpoint filename has the form `checkpoint-xxx.pt`
    where xxx is a number, representing the number of processed batches
    when saving that checkpoint. We sort checkpoints by filename and keep
    only the `topk` checkpoints with the highest `xxx`.
    Args:
      out_dir:
        The directory containing checkpoints to be removed.
      topk:
        Number of checkpoints to keep. Must be at least 1.
      rank:
        If using DDP for training, it is the rank of the current node.
        Use 0 if no DDP is used for training. Nothing is removed when
        rank is not 0.
    """
    assert topk >= 1, topk
    if rank != 0:
        return
    # Sorted in descending order of the batch index, newest first.
    checkpoints = find_checkpoints(out_dir)
    if len(checkpoints) == 0:
        # logging.warn is a deprecated alias; use logging.warning.
        logging.warning(f"No checkpoints found in {out_dir}")
        return
    # Slicing past the end yields an empty list, so nothing is removed
    # when there are at most `topk` checkpoints.
    for c in checkpoints[topk:]:
        os.remove(c)
--- a/icefall/diagnostics.py
+++ b/icefall/diagnostics.py
@ -1,5 +1,6 @@
 # Copyright      2022  Xiaomi Corp.        (authors: Daniel Povey
-#                                                    Zengwei Yao)
+#                                                    Zengwei Yao
 #                                                    Mingshuang Luo)
 #
 # See ../LICENSE for clarification regarding multiple authors
 #
@ -17,7 +18,7 @@
 import random
-from typing import List, Tuple
+from typing import List, Optional, Tuple
 import torch
 from torch import Tensor, nn
@ -28,22 +29,29 @@ class TensorDiagnosticOptions(object):
    Args:
      memory_limit:
-        The maximum number of bytes per tensor (limits how many copies
+        The maximum number of bytes per tensor
-        of the tensor we cache).
+        (limits how many copies of the tensor we cache).
      max_eig_dim:
        The maximum dimension for which we print out eigenvalues
        (limited for speed reasons).
    """
-    def __init__(self, memory_limit: int):
+    def __init__(self, memory_limit: int = (2 ** 20), max_eig_dim: int = 512):
        self.memory_limit = memory_limit
        self.max_eig_dim = max_eig_dim
    def dim_is_summarized(self, size: int):
        return size > 10 and size != 31
-def get_sum_abs_stats(
+def get_tensor_stats(
-    x: Tensor, dim: int, stats_type: str
+    x: Tensor,
    dim: int,
    stats_type: str,
 ) -> Tuple[Tensor, int]:
-    """Returns the sum-of-absolute-value of this Tensor, for each index into
+    """
-    the specified axis/dim of the tensor.
+    Returns the specified transformation of the Tensor (either x or x.abs()
    or (x > 0), summed over all but the index `dim`.
    Args:
      x:
@ -51,28 +59,38 @@ def get_sum_abs_stats(
      dim:
        Dimension with 0 <= dim < x.ndim
      stats_type:
-        Either "mean-abs" in which case the stats represent the mean absolute
+        The stats_type includes several types:
-        value, or "pos-ratio" in which case the stats represent the proportion
+        "abs" -> take abs() before summing
-        of positive values (actually: the tensor is count of positive values,
+        "positive" -> take (x > 0) before summing
-        count is the count of all values).
+        "rms" -> square before summing, we'll take sqrt later
-
+        "value -> just sum x itself
    Returns:
-      (sum_abs, count) where sum_abs is a Tensor of shape (x.shape[dim],),
+      stats: a Tensor of shape (x.shape[dim],).
-      and the count is an integer saying how many items were counted in
+      count: an integer saying how many items were counted in each element
-      each element of sum_abs.
+      of stats.
    """
-    if stats_type == "mean-abs":
+
    count = x.numel() // x.shape[dim]
    if stats_type == "eigs":
        x = x.transpose(dim, -1)
        x = x.reshape(-1, x.shape[-1])
        # shape of returned tensor: (s, s),
        # where s is size of dimension `dim` of original x.
        return torch.matmul(x.transpose(0, 1), x), count
    elif stats_type == "abs":
        x = x.abs()
-    else:
+    elif stats_type == "rms":
-        assert stats_type == "pos-ratio"
+        x = x ** 2
    elif stats_type == "positive":
        x = (x > 0).to(dtype=torch.float)
    else:
        assert stats_type == "value"
    orig_numel = x.numel()
    sum_dims = [d for d in range(x.ndim) if d != dim]
-    x = torch.sum(x, dim=sum_dims)
+    if len(sum_dims) > 0:
-    count = orig_numel // x.numel()
+        x = torch.sum(x, dim=sum_dims)
    x = x.flatten()
    return x, count
@ -83,43 +101,58 @@ def get_diagnostics_for_dim(
    sizes_same: bool,
    stats_type: str,
 ) -> str:
-    """This function gets diagnostics for a dimension of a module.
+    """
    This function gets diagnostics for a dimension of a module.
    Args:
      dim:
-        The dimension to analyze, with 0 <= dim < tensors[0].ndim
+        the dimension to analyze, with 0 <= dim < tensors[0].ndim
      tensors:
        List of cached tensors to get the stats
      options:
-        Options object
+        options object
      sizes_same:
        True if all the tensor sizes are the same on this dimension
-        stats_type: either "mean-abs" or "pos-ratio", dictates the type of
+        stats_type: either "abs" or "positive" or "eigs" or "value",
-        stats we accumulate, mean-abs is mean absolute value, "pos-ratio" is
+        dictates the type of stats we accumulate, abs is mean absolute
-        proportion of positive to nonnegative values.
+        value, "positive" is proportion of positive to nonnegative values,
-
+        "eigs" is eigenvalues after doing outer product on this dim, sum
        over all other dims.
    Returns:
      Diagnostic as a string, either percentiles or the actual values,
-      see the code.
+      see the code.  Will return the empty string if the diagnostics did
      not make sense to print out for this dimension, e.g. dimension
      mismatch and stats_type == "eigs".
    """
    # stats_and_counts is a list of pair (Tensor, int)
-    stats_and_counts = [get_sum_abs_stats(x, dim, stats_type) for x in tensors]
+    stats_and_counts = [get_tensor_stats(x, dim, stats_type) for x in tensors]
    stats = [x[0] for x in stats_and_counts]
    counts = [x[1] for x in stats_and_counts]
-    if sizes_same:
+
    if stats_type == "eigs":
        try:
            stats = torch.stack(stats).sum(dim=0)
        except:  # noqa
            return ""
        count = sum(counts)
        stats = stats / count
        stats, _ = torch.symeig(stats)
        stats = stats.abs().sqrt()
        # sqrt so it reflects data magnitude, like stddev- not variance
    elif sizes_same:
        stats = torch.stack(stats).sum(dim=0)
        count = sum(counts)
        stats = stats / count
    else:
        stats = [x[0] / x[1] for x in stats_and_counts]
        stats = torch.cat(stats, dim=0)
    if stats_type == "rms":
        stats = stats.sqrt()
-    # If `summarize` we print percentiles of the stats;
+    # if `summarize` we print percentiles of the stats; else,
-    # else, we print out individual elements.
+    # we print out individual elements.
    summarize = (not sizes_same) or options.dim_is_summarized(stats.numel())
    if summarize:
-        # Print out percentiles.
+        # print out percentiles.
        stats = stats.sort()[0]
        num_percentiles = 10
        size = stats.numel()
@ -129,12 +162,25 @@ def get_diagnostics_for_dim(
            percentiles.append(stats[index].item())
        percentiles = ["%.2g" % x for x in percentiles]
        percentiles = " ".join(percentiles)
-        return f"percentiles: [{percentiles}]"
+        ans = f"percentiles: [{percentiles}]"
    else:
-        stats = stats.tolist()
+        ans = stats.tolist()
-        stats = ["%.2g" % x for x in stats]
+        ans = ["%.2g" % x for x in ans]
-        stats = "[" + " ".join(stats) + "]"
+        ans = "[" + " ".join(ans) + "]"
-        return stats
+    if stats_type == "value":
        # This norm is useful because it is strictly less than the largest
        # sqrt(eigenvalue) of the variance, which we print out, and shows,
        # speaking in an approximate way, how much of that largest eigenvalue
        # can be attributed to the mean of the distribution.
        norm = (stats ** 2).sum().sqrt().item()
        mean = stats.mean().item()
        rms = (stats ** 2).mean().sqrt().item()
        ans += f", norm={norm:.2g}, mean={mean:.2g}, rms={rms:.2g}"
    else:
        mean = stats.mean().item()
        rms = (stats ** 2).mean().sqrt().item()
        ans += f", mean={mean:.2g}, rms={rms:.2g}"
    return ans
 def print_diagnostics_for_dim(
@ -153,17 +199,27 @@ def print_diagnostics_for_dim(
        Options object.
    """
-    for stats_type in ["mean-abs", "pos-ratio"]:
+    ndim = tensors[0].ndim
-        # stats_type will be "mean-abs" or "pos-ratio".
+    if ndim > 1:
        stats_types = ["abs", "positive", "value", "rms"]
        if tensors[0].shape[dim] <= options.max_eig_dim:
            stats_types.append("eigs")
    else:
        stats_types = ["value", "abs"]
    for stats_type in stats_types:
        sizes = [x.shape[dim] for x in tensors]
        sizes_same = all([x == sizes[0] for x in sizes])
        s = get_diagnostics_for_dim(
            dim, tensors, options, sizes_same, stats_type
        )
        if s == "":
            continue
        min_size = min(sizes)
        max_size = max(sizes)
        size_str = f"{min_size}" if sizes_same else f"{min_size}..{max_size}"
        # stats_type will be "abs" or "positive".
        print(f"module={name}, dim={dim}, size={size_str}, {stats_type} {s}")
@ -225,11 +281,15 @@ class TensorDiagnostic(object):
            # Ensure there is at least one dim.
            self.saved_tensors = [x.unsqueeze(0) for x in self.saved_tensors]
        try:
            device = torch.device("cuda")
        except:  # noqa
            device = torch.device("cpu")
        ndim = self.saved_tensors[0].ndim
        tensors = [x.to(device) for x in self.saved_tensors]
        for dim in range(ndim):
-            print_diagnostics_for_dim(
+            print_diagnostics_for_dim(self.name, dim, tensors, self.opts)
                self.name, dim, self.saved_tensors, self.opts
            )
 class ModelDiagnostic(object):
@ -240,11 +300,14 @@ class ModelDiagnostic(object):
        Options object.
    """
-    def __init__(self, opts: TensorDiagnosticOptions):
+    def __init__(self, opts: Optional[TensorDiagnosticOptions] = None):
        # In this dictionary, the keys are tensors names and the values
        # are corresponding TensorDiagnostic objects.
        if opts is None:
            self.opts = TensorDiagnosticOptions()
        else:
            self.opts = opts
        self.diagnostics = dict()
        self.opts = opts
    def __getitem__(self, name: str):
        if name not in self.diagnostics:
@ -321,7 +384,7 @@ def attach_diagnostics(
 def _test_tensor_diagnostic():
-    opts = TensorDiagnosticOptions(2 ** 20)
+    opts = TensorDiagnosticOptions(2 ** 20, 512)
    diagnostic = TensorDiagnostic(opts, "foo")
--- a/requirements-ci.txt
+++ b/requirements-ci.txt
@ -0,0 +1,21 @@
 # Usage: grep -v '^#' requirements-ci.txt  | xargs -n 1 -L 1 pip install
 # dependencies for GitHub actions
 #
 # See https://github.com/actions/setup-python#caching-packages-dependencies
 # numpy 1.20.x does not support python 3.6
 numpy==1.19
 pytest==7.1.0
 graphviz==0.19.1
 -f https://download.pytorch.org/whl/cpu/torch_stable.html torch==1.10.0+cpu
 -f https://download.pytorch.org/whl/cpu/torch_stable.html torchaudio==0.10.0+cpu
 -f https://k2-fsa.org/nightly/ k2==1.9.dev20211101+cpu.torch1.10.0
 git+https://github.com/lhotse-speech/lhotse
 kaldilm==1.11
 kaldialign==0.2
 sentencepiece==0.1.96
 tensorboard==2.8.0
 typeguard==2.13.3
--- a/requirements.txt
+++ b/requirements.txt
@ -3,4 +3,3 @@ kaldialign
 sentencepiece>=0.1.96
 tensorboard
 typeguard
 optimized_transducer
		`@ -0,0 +1 @@`
							`../../../librispeech/ASR/local/compile_hlg.py`
		`@ -0,0 +1 @@`
							`../../../librispeech/ASR/local/compute_fbank_musan.py`
		`@ -0,0 +1 @@`
							`../../../librispeech/ASR/local/convert_transcript_words_to_tokens.py`
		`@ -0,0 +1 @@`
							`../../../librispeech/ASR/local/generate_unique_lexicon.py`
		`@ -0,0 +1 @@`
							`../../../librispeech/ASR/local/prepare_lang.py`
		`@ -0,0 +1 @@`
							`../../../librispeech/ASR/local/test_prepare_lang.py`
		`@ -0,0 +1 @@`
							`../../../librispeech/ASR/local/train_bpe_model.py`
		`@ -0,0 +1 @@`
							`../../../librispeech/ASR/transducer_stateless/conformer.py`