mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-09 01:52:41 +00:00
Update result for full libri + GigaSpeech using transducer_stateless. (#231)
This commit is contained in:
parent
72f838dee1
commit
05cb297858
154
.github/workflows/run-pretrained-transducer-stateless-librispeech-multi-datasets.yml
vendored
Normal file
154
.github/workflows/run-pretrained-transducer-stateless-librispeech-multi-datasets.yml
vendored
Normal file
@ -0,0 +1,154 @@
|
|||||||
|
# Copyright 2021 Fangjun Kuang (csukuangfj@gmail.com)
|
||||||
|
|
||||||
|
# See ../../LICENSE for clarification regarding multiple authors
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
name: run-pre-trained-transducer-stateless-multi-datasets-librispeech-960h
|
||||||
|
|
||||||
|
on:
|
||||||
|
push:
|
||||||
|
branches:
|
||||||
|
- master
|
||||||
|
pull_request:
|
||||||
|
types: [labeled]
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
run_pre_trained_transducer_stateless_multi_datasets_librispeech_960h:
|
||||||
|
if: github.event.label.name == 'ready' || github.event_name == 'push'
|
||||||
|
runs-on: ${{ matrix.os }}
|
||||||
|
strategy:
|
||||||
|
matrix:
|
||||||
|
os: [ubuntu-18.04]
|
||||||
|
python-version: [3.7, 3.8, 3.9]
|
||||||
|
torch: ["1.10.0"]
|
||||||
|
torchaudio: ["0.10.0"]
|
||||||
|
k2-version: ["1.9.dev20211101"]
|
||||||
|
|
||||||
|
fail-fast: false
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v2
|
||||||
|
with:
|
||||||
|
fetch-depth: 0
|
||||||
|
|
||||||
|
- name: Setup Python ${{ matrix.python-version }}
|
||||||
|
uses: actions/setup-python@v1
|
||||||
|
with:
|
||||||
|
python-version: ${{ matrix.python-version }}
|
||||||
|
|
||||||
|
- name: Install Python dependencies
|
||||||
|
run: |
|
||||||
|
python3 -m pip install --upgrade pip pytest
|
||||||
|
# numpy 1.20.x does not support python 3.6
|
||||||
|
pip install numpy==1.19
|
||||||
|
pip install torch==${{ matrix.torch }}+cpu torchaudio==${{ matrix.torchaudio }}+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html
|
||||||
|
pip install k2==${{ matrix.k2-version }}+cpu.torch${{ matrix.torch }} -f https://k2-fsa.org/nightly/
|
||||||
|
|
||||||
|
python3 -m pip install git+https://github.com/lhotse-speech/lhotse
|
||||||
|
python3 -m pip install kaldifeat
|
||||||
|
# We are in ./icefall and there is a file: requirements.txt in it
|
||||||
|
pip install -r requirements.txt
|
||||||
|
|
||||||
|
- name: Install graphviz
|
||||||
|
shell: bash
|
||||||
|
run: |
|
||||||
|
python3 -m pip install -qq graphviz
|
||||||
|
sudo apt-get -qq install graphviz
|
||||||
|
|
||||||
|
- name: Download pre-trained model
|
||||||
|
shell: bash
|
||||||
|
run: |
|
||||||
|
sudo apt-get -qq install git-lfs tree sox
|
||||||
|
cd egs/librispeech/ASR
|
||||||
|
mkdir tmp
|
||||||
|
cd tmp
|
||||||
|
git lfs install
|
||||||
|
git clone https://huggingface.co/csukuangfj/icefall-asr-librispeech-transducer-stateless-multi-datasets-bpe-500-2022-03-01
|
||||||
|
|
||||||
|
|
||||||
|
cd ..
|
||||||
|
tree tmp
|
||||||
|
soxi tmp/icefall-asr-librispeech-transducer-stateless-multi-datasets-bpe-500-2022-03-01/test_wavs/*.wav
|
||||||
|
ls -lh tmp/icefall-asr-librispeech-transducer-stateless-multi-datasets-bpe-500-2022-03-01/test_wavs/*.wav
|
||||||
|
|
||||||
|
- name: Run greedy search decoding (max-sym-per-frame 1)
|
||||||
|
shell: bash
|
||||||
|
run: |
|
||||||
|
export PYTHONPATH=$PWD:$PYTHONPATH
|
||||||
|
cd egs/librispeech/ASR
|
||||||
|
./transducer_stateless_multi_datasets/pretrained.py \
|
||||||
|
--method greedy_search \
|
||||||
|
--max-sym-per-frame 1 \
|
||||||
|
--checkpoint ./tmp/icefall-asr-librispeech-transducer-stateless-multi-datasets-bpe-500-2022-03-01/exp/pretrained.pt \
|
||||||
|
--bpe-model ./tmp/icefall-asr-librispeech-transducer-stateless-multi-datasets-bpe-500-2022-03-01/data/lang_bpe_500/bpe.model \
|
||||||
|
./tmp/icefall-asr-librispeech-transducer-stateless-multi-datasets-bpe-500-2022-03-01/test_wavs/1089-134686-0001.wav \
|
||||||
|
./tmp/icefall-asr-librispeech-transducer-stateless-multi-datasets-bpe-500-2022-03-01/test_wavs/1221-135766-0001.wav \
|
||||||
|
./tmp/icefall-asr-librispeech-transducer-stateless-multi-datasets-bpe-500-2022-03-01/test_wavs/1221-135766-0002.wav
|
||||||
|
|
||||||
|
- name: Run greedy search decoding (max-sym-per-frame 2)
|
||||||
|
shell: bash
|
||||||
|
run: |
|
||||||
|
export PYTHONPATH=$PWD:$PYTHONPATH
|
||||||
|
cd egs/librispeech/ASR
|
||||||
|
./transducer_stateless_multi_datasets/pretrained.py \
|
||||||
|
--method greedy_search \
|
||||||
|
--max-sym-per-frame 2 \
|
||||||
|
--checkpoint ./tmp/icefall-asr-librispeech-transducer-stateless-multi-datasets-bpe-500-2022-03-01/exp/pretrained.pt \
|
||||||
|
--bpe-model ./tmp/icefall-asr-librispeech-transducer-stateless-multi-datasets-bpe-500-2022-03-01/data/lang_bpe_500/bpe.model \
|
||||||
|
./tmp/icefall-asr-librispeech-transducer-stateless-multi-datasets-bpe-500-2022-03-01/test_wavs/1089-134686-0001.wav \
|
||||||
|
./tmp/icefall-asr-librispeech-transducer-stateless-multi-datasets-bpe-500-2022-03-01/test_wavs/1221-135766-0001.wav \
|
||||||
|
./tmp/icefall-asr-librispeech-transducer-stateless-multi-datasets-bpe-500-2022-03-01/test_wavs/1221-135766-0002.wav
|
||||||
|
|
||||||
|
- name: Run greedy search decoding (max-sym-per-frame 3)
|
||||||
|
shell: bash
|
||||||
|
run: |
|
||||||
|
export PYTHONPATH=$PWD:$PYTHONPATH
|
||||||
|
cd egs/librispeech/ASR
|
||||||
|
./transducer_stateless_multi_datasets/pretrained.py \
|
||||||
|
--method greedy_search \
|
||||||
|
--max-sym-per-frame 3 \
|
||||||
|
--checkpoint ./tmp/icefall-asr-librispeech-transducer-stateless-multi-datasets-bpe-500-2022-03-01/exp/pretrained.pt \
|
||||||
|
--bpe-model ./tmp/icefall-asr-librispeech-transducer-stateless-multi-datasets-bpe-500-2022-03-01/data/lang_bpe_500/bpe.model \
|
||||||
|
./tmp/icefall-asr-librispeech-transducer-stateless-multi-datasets-bpe-500-2022-03-01/test_wavs/1089-134686-0001.wav \
|
||||||
|
./tmp/icefall-asr-librispeech-transducer-stateless-multi-datasets-bpe-500-2022-03-01/test_wavs/1221-135766-0001.wav \
|
||||||
|
./tmp/icefall-asr-librispeech-transducer-stateless-multi-datasets-bpe-500-2022-03-01/test_wavs/1221-135766-0002.wav
|
||||||
|
|
||||||
|
- name: Run beam search decoding
|
||||||
|
shell: bash
|
||||||
|
run: |
|
||||||
|
export PYTHONPATH=$PWD:$PYTHONPATH
|
||||||
|
cd egs/librispeech/ASR
|
||||||
|
./transducer_stateless_multi_datasets/pretrained.py \
|
||||||
|
--method beam_search \
|
||||||
|
--beam-size 4 \
|
||||||
|
--checkpoint ./tmp/icefall-asr-librispeech-transducer-stateless-multi-datasets-bpe-500-2022-03-01/exp/pretrained.pt \
|
||||||
|
--bpe-model ./tmp/icefall-asr-librispeech-transducer-stateless-multi-datasets-bpe-500-2022-03-01/data/lang_bpe_500/bpe.model \
|
||||||
|
./tmp/icefall-asr-librispeech-transducer-stateless-multi-datasets-bpe-500-2022-03-01/test_wavs/1089-134686-0001.wav \
|
||||||
|
./tmp/icefall-asr-librispeech-transducer-stateless-multi-datasets-bpe-500-2022-03-01/test_wavs/1221-135766-0001.wav \
|
||||||
|
./tmp/icefall-asr-librispeech-transducer-stateless-multi-datasets-bpe-500-2022-03-01/test_wavs/1221-135766-0002.wav
|
||||||
|
|
||||||
|
|
||||||
|
- name: Run modified beam search decoding
|
||||||
|
shell: bash
|
||||||
|
run: |
|
||||||
|
export PYTHONPATH=$PWD:$PYTHONPATH
|
||||||
|
cd egs/librispeech/ASR
|
||||||
|
./transducer_stateless_multi_datasets/pretrained.py \
|
||||||
|
--method modified_beam_search \
|
||||||
|
--beam-size 4 \
|
||||||
|
--checkpoint ./tmp/icefall-asr-librispeech-transducer-stateless-multi-datasets-bpe-500-2022-03-01/exp/pretrained.pt \
|
||||||
|
--bpe-model ./tmp/icefall-asr-librispeech-transducer-stateless-multi-datasets-bpe-500-2022-03-01/data/lang_bpe_500/bpe.model \
|
||||||
|
./tmp/icefall-asr-librispeech-transducer-stateless-multi-datasets-bpe-500-2022-03-01/test_wavs/1089-134686-0001.wav \
|
||||||
|
./tmp/icefall-asr-librispeech-transducer-stateless-multi-datasets-bpe-500-2022-03-01/test_wavs/1221-135766-0001.wav \
|
||||||
|
./tmp/icefall-asr-librispeech-transducer-stateless-multi-datasets-bpe-500-2022-03-01/test_wavs/1221-135766-0002.wav
|
@ -84,7 +84,7 @@ The best WER using modified beam search with beam size 4 is:
|
|||||||
|
|
||||||
| | test-clean | test-other |
|
| | test-clean | test-other |
|
||||||
|-----|------------|------------|
|
|-----|------------|------------|
|
||||||
| WER | 2.67 | 6.57 |
|
| WER | 2.61 | 6.46 |
|
||||||
|
|
||||||
Note: No auxiliary losses are used in the training and no LMs are used
|
Note: No auxiliary losses are used in the training and no LMs are used
|
||||||
in the decoding.
|
in the decoding.
|
||||||
|
@ -7,6 +7,8 @@ train-clean-100 subset as training data.
|
|||||||
|
|
||||||
### 2022-02-21
|
### 2022-02-21
|
||||||
|
|
||||||
|
Using commit `2332ba312d7ce72f08c7bac1e3312f7e3dd722dc`.
|
||||||
|
|
||||||
| | test-clean | test-other | comment |
|
| | test-clean | test-other | comment |
|
||||||
|-------------------------------------|------------|------------|------------------------------------------|
|
|-------------------------------------|------------|------------|------------------------------------------|
|
||||||
| greedy search (max sym per frame 1) | 6.34 | 16.7 | --epoch 57, --avg 17, --max-duration 100 |
|
| greedy search (max sym per frame 1) | 6.34 | 16.7 | --epoch 57, --avg 17, --max-duration 100 |
|
||||||
|
@ -52,11 +52,89 @@ avg=15
|
|||||||
|
|
||||||
#### Conformer encoder + embedding decoder
|
#### Conformer encoder + embedding decoder
|
||||||
|
|
||||||
Using commit `a8150021e01d34ecbd6198fe03a57eacf47a16f2`.
|
|
||||||
|
|
||||||
Conformer encoder + non-recurrent decoder. The decoder
|
Conformer encoder + non-recurrent decoder. The decoder
|
||||||
contains only an embedding layer and a Conv1d (with kernel size 2).
|
contains only an embedding layer and a Conv1d (with kernel size 2).
|
||||||
|
|
||||||
|
See
|
||||||
|
|
||||||
|
- [./transducer_stateless](./transducer_stateless)
|
||||||
|
- [./transducer_stateless_multi_datasets](./transducer_stateless_multi_datasets)
|
||||||
|
|
||||||
|
##### 2022-03-01
|
||||||
|
|
||||||
|
Using commit `fill in it after merging`.
|
||||||
|
|
||||||
|
It uses [GigaSpeech](https://github.com/SpeechColab/GigaSpeech)
|
||||||
|
as extra training data. 20% of the time it selects a batch from the L subset of
|
||||||
|
GigaSpeech and 80% of the time it selects a batch from LibriSpeech.
|
||||||
|
|
||||||
|
The WERs are
|
||||||
|
|
||||||
|
| | test-clean | test-other | comment |
|
||||||
|
|-------------------------------------|------------|------------|------------------------------------------|
|
||||||
|
| greedy search (max sym per frame 1) | 2.64 | 6.55 | --epoch 39, --avg 15, --max-duration 100 |
|
||||||
|
| modified beam search (beam size 4) | 2.61 | 6.46 | --epoch 39, --avg 15, --max-duration 100 |
|
||||||
|
|
||||||
|
The training command for reproducing is given below:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd egs/librispeech/ASR/
|
||||||
|
./prepare.sh
|
||||||
|
./prepare_giga_speech.sh
|
||||||
|
|
||||||
|
export CUDA_VISIBLE_DEVICES="0,1,2,3"
|
||||||
|
|
||||||
|
./transducer_stateless_multi_datasets/train.py \
|
||||||
|
--world-size 4 \
|
||||||
|
--num-epochs 40 \
|
||||||
|
--start-epoch 0 \
|
||||||
|
--exp-dir transducer_stateless_multi_datasets/exp-full-2 \
|
||||||
|
--full-libri 1 \
|
||||||
|
--max-duration 300 \
|
||||||
|
--lr-factor 5 \
|
||||||
|
--bpe-model data/lang_bpe_500/bpe.model \
|
||||||
|
--modified-transducer-prob 0.25 \
|
||||||
|
--giga-prob 0.2
|
||||||
|
```
|
||||||
|
|
||||||
|
The tensorboard training log can be found at
|
||||||
|
<https://tensorboard.dev/experiment/xmo5oCgrRVelH9dCeOkYBg/>
|
||||||
|
|
||||||
|
The decoding command is:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
epoch=39
|
||||||
|
avg=15
|
||||||
|
sym=1
|
||||||
|
|
||||||
|
# greedy search
|
||||||
|
./transducer_stateless_multi_datasets/decode.py \
|
||||||
|
--epoch $epoch \
|
||||||
|
--avg $avg \
|
||||||
|
--exp-dir transducer_stateless_multi_datasets/exp-full-2 \
|
||||||
|
--bpe-model ./data/lang_bpe_500/bpe.model \
|
||||||
|
--max-duration 100 \
|
||||||
|
--context-size 2 \
|
||||||
|
--max-sym-per-frame $sym
|
||||||
|
|
||||||
|
# modified beam search
|
||||||
|
./transducer_stateless_multi_datasets/decode.py \
|
||||||
|
--epoch $epoch \
|
||||||
|
--avg $avg \
|
||||||
|
--exp-dir transducer_stateless_multi_datasets/exp-full-2 \
|
||||||
|
--bpe-model ./data/lang_bpe_500/bpe.model \
|
||||||
|
--max-duration 100 \
|
||||||
|
--context-size 2 \
|
||||||
|
--decoding-method modified_beam_search \
|
||||||
|
--beam-size 4
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
##### 2022-02-07
|
||||||
|
|
||||||
|
Using commit `a8150021e01d34ecbd6198fe03a57eacf47a16f2`.
|
||||||
|
|
||||||
|
|
||||||
The WERs are
|
The WERs are
|
||||||
|
|
||||||
| | test-clean | test-other | comment |
|
| | test-clean | test-other | comment |
|
||||||
|
@ -19,16 +19,39 @@
|
|||||||
"""
|
"""
|
||||||
Usage:
|
Usage:
|
||||||
|
|
||||||
|
cd egs/librispeech/ASR/
|
||||||
|
./prepare.sh
|
||||||
|
./prepare_giga_speech.sh
|
||||||
|
|
||||||
|
# 100-hours
|
||||||
|
export CUDA_VISIBLE_DEVICES="0,1"
|
||||||
|
|
||||||
|
./transducer_stateless_multi_datasets/train.py \
|
||||||
|
--world-size 2 \
|
||||||
|
--num-epochs 60 \
|
||||||
|
--start-epoch 0 \
|
||||||
|
--exp-dir transducer_stateless_multi_datasets/exp-100-2 \
|
||||||
|
--full-libri 0 \
|
||||||
|
--max-duration 300 \
|
||||||
|
--lr-factor 1 \
|
||||||
|
--bpe-model data/lang_bpe_500/bpe.model \
|
||||||
|
--modified-transducer-prob 0.25 \
|
||||||
|
--giga-prob 0.2
|
||||||
|
|
||||||
|
# 960-hours
|
||||||
export CUDA_VISIBLE_DEVICES="0,1,2,3"
|
export CUDA_VISIBLE_DEVICES="0,1,2,3"
|
||||||
|
|
||||||
./transducer_stateless_multi_datasets/train.py \
|
./transducer_stateless_multi_datasets/train.py \
|
||||||
--world-size 4 \
|
--world-size 4 \
|
||||||
--num-epochs 30 \
|
--num-epochs 40 \
|
||||||
--start-epoch 0 \
|
--start-epoch 0 \
|
||||||
--exp-dir transducer_stateless_multi_datasets/exp \
|
--exp-dir transducer_stateless_multi_datasets/exp-full-2 \
|
||||||
--full-libri 1 \
|
--full-libri 1 \
|
||||||
--max-duration 250 \
|
--max-duration 300 \
|
||||||
--lr-factor 2.5
|
--lr-factor 5 \
|
||||||
|
--bpe-model data/lang_bpe_500/bpe.model \
|
||||||
|
--modified-transducer-prob 0.25 \
|
||||||
|
--giga-prob 0.2
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user