Update RESULTS.

2025-12-11 06:55:27 +00:00 · 2022-02-07 16:31:10 +08:00 · 2022-02-07 16:31:10 +08:00 · 3b93761199
commit 3b93761199
parent e1936fa5a8
4 changed files with 109 additions and 40 deletions
--- a/.github/workflows/run-pretrained-transducer-stateless.yml
+++ b/.github/workflows/run-pretrained-transducer-stateless.yml
@ -74,24 +74,53 @@ jobs:
          mkdir tmp
          cd tmp
          git lfs install
-          git clone https://huggingface.co/csukuangfj/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-01-10
+          git clone https://huggingface.co/csukuangfj/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-02-07
          cd ..
          tree tmp
-          soxi tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-01-10/test_wavs/*.wav
+          soxi tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-02-07/test_wavs/*.wav
-          ls -lh tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-01-10/test_wavs/*.wav
+          ls -lh tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-02-07/test_wavs/*.wav
-      - name: Run greedy search decoding
+      - name: Run greedy search decoding (max-sym-per-frame 1)
        shell: bash
        run: |
          export PYTHONPATH=$PWD:PYTHONPATH
          cd egs/librispeech/ASR
          ./transducer_stateless/pretrained.py \
            --method greedy_search \
-            --checkpoint ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-01-10/exp/pretrained.pt \
+            --max-sym-per-frame 1 \
-            --bpe-model ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-01-10/data/lang_bpe_500/bpe.model \
+            --checkpoint ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-02-07/exp/pretrained.pt \
-            ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-01-10/test_wavs/1089-134686-0001.wav \
+            --bpe-model ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-02-07/data/lang_bpe_500/bpe.model \
-            ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-01-10/test_wavs/1221-135766-0001.wav \
+            ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-02-07/test_wavs/1089-134686-0001.wav \
-            ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-01-10/test_wavs/1221-135766-0002.wav
+            ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-02-07/test_wavs/1221-135766-0001.wav \
            ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-02-07/test_wavs/1221-135766-0002.wav
      - name: Run greedy search decoding (max-sym-per-frame 2)
        shell: bash
        run: |
          export PYTHONPATH=$PWD:PYTHONPATH
          cd egs/librispeech/ASR
          ./transducer_stateless/pretrained.py \
            --method greedy_search \
            --max-sym-per-frame 2 \
            --checkpoint ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-02-07/exp/pretrained.pt \
            --bpe-model ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-02-07/data/lang_bpe_500/bpe.model \
            ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-02-07/test_wavs/1089-134686-0001.wav \
            ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-02-07/test_wavs/1221-135766-0001.wav \
            ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-02-07/test_wavs/1221-135766-0002.wav
      - name: Run greedy search decoding (max-sym-per-frame 3)
        shell: bash
        run: |
          export PYTHONPATH=$PWD:PYTHONPATH
          cd egs/librispeech/ASR
          ./transducer_stateless/pretrained.py \
            --method greedy_search \
            --max-sym-per-frame 3 \
            --checkpoint ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-02-07/exp/pretrained.pt \
            --bpe-model ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-02-07/data/lang_bpe_500/bpe.model \
            ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-02-07/test_wavs/1089-134686-0001.wav \
            ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-02-07/test_wavs/1221-135766-0001.wav \
            ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-02-07/test_wavs/1221-135766-0002.wav
      - name: Run beam search decoding
        shell: bash
@ -101,8 +130,22 @@ jobs:
          ./transducer_stateless/pretrained.py \
            --method beam_search \
            --beam-size 4 \
-            --checkpoint ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-01-10/exp/pretrained.pt \
+            --checkpoint ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-02-07/exp/pretrained.pt \
-            --bpe-model ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-01-10/data/lang_bpe_500/bpe.model \
+            --bpe-model ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-02-07/data/lang_bpe_500/bpe.model \
-            ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-01-10/test_wavs/1089-134686-0001.wav \
+            ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-02-07/test_wavs/1089-134686-0001.wav \
-            ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-01-10/test_wavs/1221-135766-0001.wav \
+            ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-02-07/test_wavs/1221-135766-0001.wav \
-            ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-01-10/test_wavs/1221-135766-0002.wav
+            ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-02-07/test_wavs/1221-135766-0002.wav
      - name: Run modified beam search decoding
        shell: bash
        run: |
          export PYTHONPATH=$PWD:$PYTHONPATH
          cd egs/librispeech/ASR
          ./transducer_stateless/pretrained.py \
            --method modified_beam_search \
            --beam-size 4 \
            --checkpoint ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-02-07/exp/pretrained.pt \
            --bpe-model ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-02-07/data/lang_bpe_500/bpe.model \
            ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-02-07/test_wavs/1089-134686-0001.wav \
            ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-02-07/test_wavs/1221-135766-0001.wav \
            ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-02-07/test_wavs/1221-135766-0002.wav
--- a/README.md
+++ b/README.md
@ -80,16 +80,16 @@ We provide a Colab notebook to run a pre-trained RNN-T conformer model: [![Open
 Using Conformer as encoder. The decoder consists of 1 embedding layer
 and 1 convolutional layer.
-The best WER using beam search with beam size 4 is:
+The best WER using modified beam search with beam size 4 is:
 |     | test-clean | test-other |
 |-----|------------|------------|
-| WER | 2.68       | 6.72       |
+| WER | 2.67       | 6.64       |
 Note: No auxiliary losses are used in the training and no LMs are used
 in the decoding.
-We provide a Colab notebook to run a pre-trained transducer conformer + stateless decoder model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1Rc4Is-3Yp9LbcEz_Iy8hfyenyHsyjvqE?usp=sharing)
+We provide a Colab notebook to run a pre-trained transducer conformer + stateless decoder model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1CO1bXJ-2khDckZIW8zjOPHGSKLHpTDlp?usp=sharing)
 ### Aishell
--- a/egs/librispeech/ASR/RESULTS.md
+++ b/egs/librispeech/ASR/RESULTS.md
@ -4,7 +4,7 @@
 #### Conformer encoder + embedding decoder
-Using commit `4c1b3665ee6efb935f4dd93a80ff0e154b13efb6`.
+Using commit `TODO`.
 Conformer encoder + non-recurrent decoder. The decoder
 contains only an embedding layer and a Conv1d (with kernel size 2).
@ -12,54 +12,65 @@ contains only an embedding layer and a Conv1d (with kernel size 2).
 The WERs are
 |                                     | test-clean | test-other | comment                                  |
-|---------------------------|------------|------------|------------------------------------------|
+|-------------------------------------|------------|------------|------------------------------------------|
-| greedy search             | 2.69       | 6.81       | --epoch 71, --avg 15, --max-duration 100 |
+| greedy search (max sym per frame 1) | 2.68       | 6.71       | --epoch 61, --avg 18, --max-duration 100 |
-| beam search (beam size 4) | 2.68       | 6.72       | --epoch 71, --avg 15, --max-duration 100 |
+| greedy search (max sym per frame 2) | 2.69       | 6.71       | --epoch 61, --avg 18, --max-duration 100 |
 | greedy search (max sym per frame 3) | 2.69       | 6.71       | --epoch 61, --avg 18, --max-duration 100 |
 | modified beam search (beam size 4)  | 2.67       | 6.64       | --epoch 61, --avg 18, --max-duration 100 |
 The training command for reproducing is given below:
 ```
 cd egs/librispeech/ASR/
 ./prepare.sh
 export CUDA_VISIBLE_DEVICES="0,1,2,3"
 ./transducer_stateless/train.py \
  --world-size 4 \
  --num-epochs 76 \
  --start-epoch 0 \
  --exp-dir transducer_stateless/exp-full \
  --full-libri 1 \
-  --max-duration 250 \
+  --max-duration 300 \
-  --lr-factor 3
+  --lr-factor 5 \
  --bpe-model data/lang_bpe_500/bpe.model \
  --modified-transducer-prob 0.25
 ```
 The tensorboard training log can be found at
-<https://tensorboard.dev/experiment/qGdqzHnxS0WJ695OXfZDzA/#scalars&_smoothingWeight=0>
+<https://tensorboard.dev/experiment/qgvWkbF2R46FYA6ZMNmOjA/#scalars>
 The decoding command is:
 ```
-epoch=71
+epoch=61
-avg=15
+avg=18
 ## greedy search
-./transducer_stateless/decode.py \
+for sym in 1 2 3; do
  --epoch $epoch \
  --avg $avg \
  --exp-dir transducer_stateless/exp-full \
  --bpe-model ./data/lang_bpe_500/bpe.model \
  --max-duration 100
 ## beam search
  ./transducer_stateless/decode.py \
    --epoch $epoch \
    --avg $avg \
    --exp-dir transducer_stateless/exp-full \
    --bpe-model ./data/lang_bpe_500/bpe.model \
    --max-duration 100 \
-  --decoding-method beam_search \
+    --max-sym-per-frame $sym
 done
 ## modified beam search
 ./transducer_stateless/decode.py \
  --epoch $epoch \
  --avg $avg \
  --exp-dir transducer_stateless/exp-full \
  --bpe-model ./data/lang_bpe_500/bpe.model \
  --max-duration 100 \
  --context-size 2 \
  --decoding-method modified_beam_search \
  --beam-size 4
 ```
 You can find a pretrained model by visiting
-<https://huggingface.co/csukuangfj/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-01-10>
+<https://huggingface.co/csukuangfj/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-02-07>
 #### Conformer encoder + LSTM decoder
--- a/egs/librispeech/ASR/transducer_stateless/pretrained.py
+++ b/egs/librispeech/ASR/transducer_stateless/pretrained.py
@ -22,10 +22,11 @@ Usage:
        --checkpoint ./transducer_stateless/exp/pretrained.pt \
        --bpe-model ./data/lang_bpe_500/bpe.model \
        --method greedy_search \
        --max-sym-per-frame 1 \
        /path/to/foo.wav \
        /path/to/bar.wav \
-(1) beam search
+(2) beam search
 ./transducer_stateless/pretrained.py \
        --checkpoint ./transducer_stateless/exp/pretrained.pt \
        --bpe-model ./data/lang_bpe_500/bpe.model \
@ -34,6 +35,15 @@ Usage:
        /path/to/foo.wav \
        /path/to/bar.wav \
 (3) modified beam search
 ./transducer_stateless/pretrained.py \
        --checkpoint ./transducer_stateless/exp/pretrained.pt \
        --bpe-model ./data/lang_bpe_500/bpe.model \
        --method modified_beam_search \
        --beam-size 4 \
        /path/to/foo.wav \
        /path/to/bar.wav \
 You can also use `./transducer_stateless/exp/epoch-xx.pt`.
 Note: ./transducer_stateless/exp/pretrained.pt is generated by
@ -50,7 +60,7 @@ import kaldifeat
 import sentencepiece as spm
 import torch
 import torchaudio
-from beam_search import beam_search, greedy_search
+from beam_search import beam_search, greedy_search, modified_beam_search
 from conformer import Conformer
 from decoder import Decoder
 from joiner import Joiner
@ -90,6 +100,7 @@ def get_parser():
        help="""Possible values are:
          - greedy_search
          - beam_search
          - modified_beam_search
        """,
    )
@ -107,7 +118,7 @@ def get_parser():
        "--beam-size",
        type=int,
        default=4,
-        help="Used only when --method is beam_search",
+        help="Used only when --method is beam_search and modified_beam_search ",
    )
    parser.add_argument(
@ -300,6 +311,10 @@ def main():
            hyp = beam_search(
                model=model, encoder_out=encoder_out_i, beam=params.beam_size
            )
        elif params.decoding_method == "modified_beam_search":
            hyp = modified_beam_search(
                model=model, encoder_out=encoder_out_i, beam=params.beam_size
            )
        else:
            raise ValueError(f"Unsupported method: {params.method}")