Update result for full libri + GigaSpeech using transducer_stateless. (#231)

2025-12-11 06:55:27 +00:00 · 2022-03-01 17:01:46 +08:00 · 2022-03-01 17:01:46 +08:00 · 05cb297858
commit 05cb297858
parent 72f838dee1
5 changed files with 264 additions and 7 deletions
--- a/.github/workflows/run-pretrained-transducer-stateless-librispeech-multi-datasets.yml
+++ b/.github/workflows/run-pretrained-transducer-stateless-librispeech-multi-datasets.yml
@ -0,0 +1,154 @@
 # Copyright      2021  Fangjun Kuang (csukuangfj@gmail.com)
 # See ../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 name: run-pre-trained-transducer-stateless-multi-datasets-librispeech-960h
 # Triggers: every push to master, and pull requests on the `labeled` event.
 on:
  push:
    branches:
      - master
  pull_request:
    types: [labeled]
 jobs:
  run_pre_trained_transducer_stateless_multi_datasets_librispeech_960h:
    # Run for push events, or when the label attached to the event is 'ready'.
    if: github.event.label.name == 'ready' || github.event_name == 'push'
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: [ubuntu-18.04]
        # Versions are quoted so YAML keeps them as strings; an unquoted
        # 3.10 would be parsed as the float 3.1.
        python-version: ["3.7", "3.8", "3.9"]
        torch: ["1.10.0"]
        torchaudio: ["0.10.0"]
        k2-version: ["1.9.dev20211101"]
      # Keep the other matrix entries running when one of them fails.
      fail-fast: false
    steps:
      # Check out the repository; fetch-depth 0 requests the full history
      # instead of a shallow clone (per actions/checkout documentation).
      - uses: actions/checkout@v2
        with:
          fetch-depth: 0
      # Install the Python version selected by the current matrix entry.
      - name: Setup Python ${{ matrix.python-version }}
        uses: actions/setup-python@v1
        with:
          python-version: ${{ matrix.python-version }}
      # Install the CPU builds of torch, torchaudio and k2 pinned by the build
      # matrix, then lhotse (from its git repository), kaldifeat and the
      # packages listed in the repository's requirements.txt.
      - name: Install Python dependencies
        run: |
          python3 -m pip install --upgrade pip pytest
          # numpy 1.20.x does not support python 3.6
          pip install numpy==1.19
          pip install torch==${{ matrix.torch }}+cpu torchaudio==${{ matrix.torchaudio }}+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html
          pip install k2==${{ matrix.k2-version }}+cpu.torch${{ matrix.torch }} -f https://k2-fsa.org/nightly/
          python3 -m pip install git+https://github.com/lhotse-speech/lhotse
          python3 -m pip install kaldifeat
          # We are in ./icefall and there is a file: requirements.txt in it
          pip install -r requirements.txt
      # Install graphviz twice: the Python package via pip and the system
      # package via apt-get.
      - name: Install graphviz
        shell: bash
        run: |
          python3 -m pip install -qq graphviz
          sudo apt-get -qq install graphviz
      # Install git-lfs, tree and sox, clone the pre-trained model repository
      # from Hugging Face into egs/librispeech/ASR/tmp, then list the clone and
      # inspect its test wave files with soxi and ls.
      # The decoding steps below read the checkpoint, BPE model and test waves
      # from this clone.
      - name: Download pre-trained model
        shell: bash
        run: |
          sudo apt-get -qq install git-lfs tree sox
          cd egs/librispeech/ASR
          mkdir tmp
          cd tmp
          git lfs install
          git clone https://huggingface.co/csukuangfj/icefall-asr-librispeech-transducer-stateless-multi-datasets-bpe-500-2022-03-01
          cd ..
          tree tmp
          soxi tmp/icefall-asr-librispeech-transducer-stateless-multi-datasets-bpe-500-2022-03-01/test_wavs/*.wav
          ls -lh tmp/icefall-asr-librispeech-transducer-stateless-multi-datasets-bpe-500-2022-03-01/test_wavs/*.wav
      # Run pretrained.py on the three bundled test waves using
      # --method greedy_search with --max-sym-per-frame 1.
      - name: Run greedy search decoding (max-sym-per-frame 1)
        shell: bash
        run: |
          # Prepend the repository root to the existing PYTHONPATH value.
          export PYTHONPATH=$PWD:$PYTHONPATH
          cd egs/librispeech/ASR
          # Directory of the model repository cloned by the download step.
          repo=./tmp/icefall-asr-librispeech-transducer-stateless-multi-datasets-bpe-500-2022-03-01
          ./transducer_stateless_multi_datasets/pretrained.py \
            --method greedy_search \
            --max-sym-per-frame 1 \
            --checkpoint "$repo/exp/pretrained.pt" \
            --bpe-model "$repo/data/lang_bpe_500/bpe.model" \
            "$repo/test_wavs/1089-134686-0001.wav" \
            "$repo/test_wavs/1221-135766-0001.wav" \
            "$repo/test_wavs/1221-135766-0002.wav"
      # Run pretrained.py on the three bundled test waves using
      # --method greedy_search with --max-sym-per-frame 2.
      - name: Run greedy search decoding (max-sym-per-frame 2)
        shell: bash
        run: |
          # Prepend the repository root to the existing PYTHONPATH value.
          export PYTHONPATH=$PWD:$PYTHONPATH
          cd egs/librispeech/ASR
          # Directory of the model repository cloned by the download step.
          repo=./tmp/icefall-asr-librispeech-transducer-stateless-multi-datasets-bpe-500-2022-03-01
          ./transducer_stateless_multi_datasets/pretrained.py \
            --method greedy_search \
            --max-sym-per-frame 2 \
            --checkpoint "$repo/exp/pretrained.pt" \
            --bpe-model "$repo/data/lang_bpe_500/bpe.model" \
            "$repo/test_wavs/1089-134686-0001.wav" \
            "$repo/test_wavs/1221-135766-0001.wav" \
            "$repo/test_wavs/1221-135766-0002.wav"
      # Run pretrained.py on the three bundled test waves using
      # --method greedy_search with --max-sym-per-frame 3.
      - name: Run greedy search decoding (max-sym-per-frame 3)
        shell: bash
        run: |
          # Prepend the repository root to the existing PYTHONPATH value.
          export PYTHONPATH=$PWD:$PYTHONPATH
          cd egs/librispeech/ASR
          # Directory of the model repository cloned by the download step.
          repo=./tmp/icefall-asr-librispeech-transducer-stateless-multi-datasets-bpe-500-2022-03-01
          ./transducer_stateless_multi_datasets/pretrained.py \
            --method greedy_search \
            --max-sym-per-frame 3 \
            --checkpoint "$repo/exp/pretrained.pt" \
            --bpe-model "$repo/data/lang_bpe_500/bpe.model" \
            "$repo/test_wavs/1089-134686-0001.wav" \
            "$repo/test_wavs/1221-135766-0001.wav" \
            "$repo/test_wavs/1221-135766-0002.wav"
      # Run pretrained.py on the three bundled test waves using
      # --method beam_search with --beam-size 4.
      - name: Run beam search decoding
        shell: bash
        run: |
          export PYTHONPATH=$PWD:$PYTHONPATH
          cd egs/librispeech/ASR
          # Directory of the model repository cloned by the download step.
          repo=./tmp/icefall-asr-librispeech-transducer-stateless-multi-datasets-bpe-500-2022-03-01
          ./transducer_stateless_multi_datasets/pretrained.py \
            --method beam_search \
            --beam-size 4 \
            --checkpoint "$repo/exp/pretrained.pt" \
            --bpe-model "$repo/data/lang_bpe_500/bpe.model" \
            "$repo/test_wavs/1089-134686-0001.wav" \
            "$repo/test_wavs/1221-135766-0001.wav" \
            "$repo/test_wavs/1221-135766-0002.wav"
      # Run pretrained.py on the three bundled test waves using
      # --method modified_beam_search with --beam-size 4.
      - name: Run modified beam search decoding
        shell: bash
        run: |
          export PYTHONPATH=$PWD:$PYTHONPATH
          cd egs/librispeech/ASR
          # Directory of the model repository cloned by the download step.
          repo=./tmp/icefall-asr-librispeech-transducer-stateless-multi-datasets-bpe-500-2022-03-01
          ./transducer_stateless_multi_datasets/pretrained.py \
            --method modified_beam_search \
            --beam-size 4 \
            --checkpoint "$repo/exp/pretrained.pt" \
            --bpe-model "$repo/data/lang_bpe_500/bpe.model" \
            "$repo/test_wavs/1089-134686-0001.wav" \
            "$repo/test_wavs/1221-135766-0001.wav" \
            "$repo/test_wavs/1221-135766-0002.wav"
--- a/README.md
+++ b/README.md
@ -84,7 +84,7 @@ The best WER using modified beam search with beam size 4 is:
 |     | test-clean | test-other |
 |-----|------------|------------|
-| WER | 2.67       | 6.57       |
+| WER | 2.61       | 6.46       |
 Note: No auxiliary losses are used in the training and no LMs are used
 in the decoding.
--- a/egs/librispeech/ASR/RESULTS-100hours.md
+++ b/egs/librispeech/ASR/RESULTS-100hours.md
@ -7,6 +7,8 @@ train-clean-100 subset as training data.
 ### 2022-02-21
 Using commit `2332ba312d7ce72f08c7bac1e3312f7e3dd722dc`.
 |                                     | test-clean | test-other | comment                                  |
 |-------------------------------------|------------|------------|------------------------------------------|
 | greedy search (max sym per frame 1) | 6.34       | 16.7       | --epoch 57, --avg 17, --max-duration 100 |
--- a/egs/librispeech/ASR/RESULTS.md
+++ b/egs/librispeech/ASR/RESULTS.md
@ -52,11 +52,89 @@ avg=15
 #### Conformer encoder + embedding decoder
 Using commit `a8150021e01d34ecbd6198fe03a57eacf47a16f2`.
 Conformer encoder + non-recurrent decoder. The decoder
 contains only an embedding layer and a Conv1d (with kernel size 2).
 See
 - [./transducer_stateless](./transducer_stateless)
 - [./transducer_stateless_multi_datasets](./transducer_stateless_multi_datasets)
 ##### 2022-03-01
 Using commit `fill in it after merging`.
 It uses [GigaSpeech](https://github.com/SpeechColab/GigaSpeech)
 as extra training data. 20% of the time it selects a batch from the L subset of
 GigaSpeech and 80% of the time it selects a batch from LibriSpeech.
 The WERs are
 |                                     | test-clean | test-other | comment                                  |
 |-------------------------------------|------------|------------|------------------------------------------|
 | greedy search (max sym per frame 1) | 2.64       | 6.55       | --epoch 39, --avg 15, --max-duration 100 |
 | modified beam search (beam size 4)  | 2.61       | 6.46       | --epoch 39, --avg 15, --max-duration 100 |
 The training command for reproducing is given below:
 ```bash
 cd egs/librispeech/ASR/
 ./prepare.sh
 ./prepare_giga_speech.sh
 export CUDA_VISIBLE_DEVICES="0,1,2,3"
 ./transducer_stateless_multi_datasets/train.py \
  --world-size 4 \
  --num-epochs 40 \
  --start-epoch 0 \
  --exp-dir transducer_stateless_multi_datasets/exp-full-2 \
  --full-libri 1 \
  --max-duration 300 \
  --lr-factor 5 \
  --bpe-model data/lang_bpe_500/bpe.model \
  --modified-transducer-prob 0.25 \
  --giga-prob 0.2
 ```
 The tensorboard training log can be found at
 <https://tensorboard.dev/experiment/xmo5oCgrRVelH9dCeOkYBg/>
 The decoding command is:
 ```bash
 epoch=39
 avg=15
 sym=1
 # greedy search
 ./transducer_stateless_multi_datasets/decode.py \
  --epoch $epoch \
  --avg $avg \
  --exp-dir transducer_stateless_multi_datasets/exp-full-2 \
  --bpe-model ./data/lang_bpe_500/bpe.model \
  --max-duration 100 \
  --context-size 2 \
  --max-sym-per-frame $sym
 # modified beam search
 ./transducer_stateless_multi_datasets/decode.py \
  --epoch $epoch \
  --avg $avg \
  --exp-dir transducer_stateless_multi_datasets/exp-full-2 \
  --bpe-model ./data/lang_bpe_500/bpe.model \
  --max-duration 100 \
  --context-size 2 \
  --decoding-method modified_beam_search \
  --beam-size 4
 ```
 ##### 2022-02-07
 Using commit `a8150021e01d34ecbd6198fe03a57eacf47a16f2`.
 The WERs are
 |                                     | test-clean | test-other | comment                                  |
--- a/egs/librispeech/ASR/transducer_stateless_multi_datasets/train.py
+++ b/egs/librispeech/ASR/transducer_stateless_multi_datasets/train.py
@ -19,16 +19,39 @@
 """
 Usage:
 cd egs/librispeech/ASR/
 ./prepare.sh
 ./prepare_giga_speech.sh
 # 100-hours
 export CUDA_VISIBLE_DEVICES="0,1"
 ./transducer_stateless_multi_datasets/train.py \
  --world-size 2 \
  --num-epochs 60 \
  --start-epoch 0 \
  --exp-dir transducer_stateless_multi_datasets/exp-100-2 \
  --full-libri 0 \
  --max-duration 300 \
  --lr-factor 1 \
  --bpe-model data/lang_bpe_500/bpe.model \
  --modified-transducer-prob 0.25 \
  --giga-prob 0.2
 # 960-hours
 export CUDA_VISIBLE_DEVICES="0,1,2,3"
 ./transducer_stateless_multi_datasets/train.py \
  --world-size 4 \
-  --num-epochs 30 \
+  --num-epochs 40 \
  --start-epoch 0 \
-  --exp-dir transducer_stateless_multi_datasets/exp \
+  --exp-dir transducer_stateless_multi_datasets/exp-full-2 \
  --full-libri 1 \
-  --max-duration 250 \
+  --max-duration 300 \
-  --lr-factor 2.5
+  --lr-factor 5 \
  --bpe-model data/lang_bpe_500/bpe.model \
  --modified-transducer-prob 0.25 \
  --giga-prob 0.2
 """