Update results.

commit 9f69dafc92 (parent 61b0019ffd)
.github/workflows/run-pretrained-transducer-stateless-librispeech-100h.yml (vendored, new file, +152)
@@ -0,0 +1,152 @@
# Copyright      2021  Fangjun Kuang (csukuangfj@gmail.com)

# See ../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name: run-pre-trained-transducer-stateless-multi-datasets-librispeech-100h

on:
  push:
    branches:
      - master
  pull_request:
    types: [labeled]

jobs:
  run_pre_trained_transducer_stateless_multi_datasets_librispeech_100h:
    if: github.event.label.name == 'ready' || github.event_name == 'push'
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: [ubuntu-18.04]
        python-version: [3.7, 3.8, 3.9]
        torch: ["1.10.0"]
        torchaudio: ["0.10.0"]
        k2-version: ["1.9.dev20211101"]

      fail-fast: false

    steps:
      - uses: actions/checkout@v2
        with:
          fetch-depth: 0

      - name: Setup Python ${{ matrix.python-version }}
        uses: actions/setup-python@v1
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install Python dependencies
        run: |
          python3 -m pip install --upgrade pip pytest
          # numpy 1.20.x does not support python 3.6
          pip install numpy==1.19
          pip install torch==${{ matrix.torch }}+cpu torchaudio==${{ matrix.torchaudio }}+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html
          pip install k2==${{ matrix.k2-version }}+cpu.torch${{ matrix.torch }} -f https://k2-fsa.org/nightly/

          python3 -m pip install git+https://github.com/lhotse-speech/lhotse
          python3 -m pip install kaldifeat
          # We are in ./icefall, which contains a requirements.txt file.
          pip install -r requirements.txt

      - name: Install graphviz
        shell: bash
        run: |
          python3 -m pip install -qq graphviz
          sudo apt-get -qq install graphviz

      - name: Download pre-trained model
        shell: bash
        run: |
          sudo apt-get -qq install git-lfs tree sox
          cd egs/librispeech/ASR
          mkdir tmp
          cd tmp
          git lfs install
          git clone https://huggingface.co/csukuangfj/icefall-asr-librispeech-100h-transducer-stateless-multi-datasets-bpe-500-2022-02-21

          cd ..
          tree tmp
          soxi tmp/icefall-asr-librispeech-100h-transducer-stateless-multi-datasets-bpe-500-2022-02-21/test_wavs/*.wav
          ls -lh tmp/icefall-asr-librispeech-100h-transducer-stateless-multi-datasets-bpe-500-2022-02-21/test_wavs/*.wav

      - name: Run greedy search decoding (max-sym-per-frame 1)
        shell: bash
        run: |
          export PYTHONPATH=$PWD:$PYTHONPATH
          cd egs/librispeech/ASR
          ./transducer_stateless_multi_datasets/pretrained.py \
            --method greedy_search \
            --max-sym-per-frame 1 \
            --checkpoint ./tmp/icefall-asr-librispeech-100h-transducer-stateless-multi-datasets-bpe-500-2022-02-21/exp/pretrained.pt \
            --bpe-model ./tmp/icefall-asr-librispeech-100h-transducer-stateless-multi-datasets-bpe-500-2022-02-21/data/lang_bpe_500/bpe.model \
            ./tmp/icefall-asr-librispeech-100h-transducer-stateless-multi-datasets-bpe-500-2022-02-21/test_wavs/1089-134686-0001.wav \
            ./tmp/icefall-asr-librispeech-100h-transducer-stateless-multi-datasets-bpe-500-2022-02-21/test_wavs/1221-135766-0001.wav \
            ./tmp/icefall-asr-librispeech-100h-transducer-stateless-multi-datasets-bpe-500-2022-02-21/test_wavs/1221-135766-0002.wav

      - name: Run greedy search decoding (max-sym-per-frame 2)
        shell: bash
        run: |
          export PYTHONPATH=$PWD:$PYTHONPATH
          cd egs/librispeech/ASR
          ./transducer_stateless_multi_datasets/pretrained.py \
            --method greedy_search \
            --max-sym-per-frame 2 \
            --checkpoint ./tmp/icefall-asr-librispeech-100h-transducer-stateless-multi-datasets-bpe-500-2022-02-21/exp/pretrained.pt \
            --bpe-model ./tmp/icefall-asr-librispeech-100h-transducer-stateless-multi-datasets-bpe-500-2022-02-21/data/lang_bpe_500/bpe.model \
            ./tmp/icefall-asr-librispeech-100h-transducer-stateless-multi-datasets-bpe-500-2022-02-21/test_wavs/1089-134686-0001.wav \
            ./tmp/icefall-asr-librispeech-100h-transducer-stateless-multi-datasets-bpe-500-2022-02-21/test_wavs/1221-135766-0001.wav \
            ./tmp/icefall-asr-librispeech-100h-transducer-stateless-multi-datasets-bpe-500-2022-02-21/test_wavs/1221-135766-0002.wav

      - name: Run greedy search decoding (max-sym-per-frame 3)
        shell: bash
        run: |
          export PYTHONPATH=$PWD:$PYTHONPATH
          cd egs/librispeech/ASR
          ./transducer_stateless_multi_datasets/pretrained.py \
            --method greedy_search \
            --max-sym-per-frame 3 \
            --checkpoint ./tmp/icefall-asr-librispeech-100h-transducer-stateless-multi-datasets-bpe-500-2022-02-21/exp/pretrained.pt \
            --bpe-model ./tmp/icefall-asr-librispeech-100h-transducer-stateless-multi-datasets-bpe-500-2022-02-21/data/lang_bpe_500/bpe.model \
            ./tmp/icefall-asr-librispeech-100h-transducer-stateless-multi-datasets-bpe-500-2022-02-21/test_wavs/1089-134686-0001.wav \
            ./tmp/icefall-asr-librispeech-100h-transducer-stateless-multi-datasets-bpe-500-2022-02-21/test_wavs/1221-135766-0001.wav \
            ./tmp/icefall-asr-librispeech-100h-transducer-stateless-multi-datasets-bpe-500-2022-02-21/test_wavs/1221-135766-0002.wav

      - name: Run beam search decoding
        shell: bash
        run: |
          export PYTHONPATH=$PWD:$PYTHONPATH
          cd egs/librispeech/ASR
          ./transducer_stateless_multi_datasets/pretrained.py \
            --method beam_search \
            --beam-size 4 \
            --checkpoint ./tmp/icefall-asr-librispeech-100h-transducer-stateless-multi-datasets-bpe-500-2022-02-21/exp/pretrained.pt \
            --bpe-model ./tmp/icefall-asr-librispeech-100h-transducer-stateless-multi-datasets-bpe-500-2022-02-21/data/lang_bpe_500/bpe.model \
            ./tmp/icefall-asr-librispeech-100h-transducer-stateless-multi-datasets-bpe-500-2022-02-21/test_wavs/1089-134686-0001.wav \
            ./tmp/icefall-asr-librispeech-100h-transducer-stateless-multi-datasets-bpe-500-2022-02-21/test_wavs/1221-135766-0001.wav \
            ./tmp/icefall-asr-librispeech-100h-transducer-stateless-multi-datasets-bpe-500-2022-02-21/test_wavs/1221-135766-0002.wav

      - name: Run modified beam search decoding
        shell: bash
        run: |
          export PYTHONPATH=$PWD:$PYTHONPATH
          cd egs/librispeech/ASR
          ./transducer_stateless_multi_datasets/pretrained.py \
            --method modified_beam_search \
            --beam-size 4 \
            --checkpoint ./tmp/icefall-asr-librispeech-100h-transducer-stateless-multi-datasets-bpe-500-2022-02-21/exp/pretrained.pt \
            --bpe-model ./tmp/icefall-asr-librispeech-100h-transducer-stateless-multi-datasets-bpe-500-2022-02-21/data/lang_bpe_500/bpe.model \
            ./tmp/icefall-asr-librispeech-100h-transducer-stateless-multi-datasets-bpe-500-2022-02-21/test_wavs/1089-134686-0001.wav \
            ./tmp/icefall-asr-librispeech-100h-transducer-stateless-multi-datasets-bpe-500-2022-02-21/test_wavs/1221-135766-0001.wav \
            ./tmp/icefall-asr-librispeech-100h-transducer-stateless-multi-datasets-bpe-500-2022-02-21/test_wavs/1221-135766-0002.wav
@@ -9,11 +9,12 @@ for how to run models in this recipe.
 There are various folders containing the name `transducer` in this folder.
 The following table lists the differences among them.

-| | Encoder | Decoder |
-|------------------------|-----------|--------------------|
-| `transducer` | Conformer | LSTM |
-| `transducer_stateless` | Conformer | Embedding + Conv1d |
-| `transducer_lstm` | LSTM | LSTM |
+| | Encoder | Decoder | Comment |
+|---------------------------------------|-----------|--------------------|---------------------------------------------------|
+| `transducer` | Conformer | LSTM | |
+| `transducer_stateless` | Conformer | Embedding + Conv1d | |
+| `transducer_lstm` | LSTM | LSTM | |
+| `transducer_stateless_multi_datasets` | Conformer | Embedding + Conv1d | Uses data from GigaSpeech as extra training data |

 The decoder in `transducer_stateless` is modified from the paper
 [Rnn-Transducer with Stateless Prediction Network](https://ieeexplore.ieee.org/document/9054419/).
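For context, the "Embedding + Conv1d" decoder named in the table is a stateless prediction network: it conditions only on a short, fixed window of previous tokens instead of carrying recurrent state. Below is a minimal sketch of the idea; it is illustrative only and not the repository's exact `Decoder` class (the dimensions, depthwise grouping, and activation here are assumptions):

```python
import torch
import torch.nn as nn
import torch.nn.functional as F


class StatelessDecoder(nn.Module):
    """Prediction network that sees only the last `context_size` tokens."""

    def __init__(self, vocab_size: int, embedding_dim: int, context_size: int = 2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.context_size = context_size
        # Depthwise 1-D convolution over the token axis; the kernel size
        # equals the context size, so each output frame depends on a fixed,
        # short history rather than on unbounded recurrent state.
        self.conv = nn.Conv1d(
            embedding_dim,
            embedding_dim,
            kernel_size=context_size,
            groups=embedding_dim,
            bias=False,
        )

    def forward(self, y: torch.Tensor) -> torch.Tensor:
        # y: (N, U) token IDs -> (N, U, embedding_dim)
        emb = self.embedding(y).permute(0, 2, 1)          # (N, C, U)
        emb = F.pad(emb, pad=(self.context_size - 1, 0))  # causal left-padding
        out = self.conv(emb).permute(0, 2, 1)             # (N, U, C)
        return F.relu(out)


# Example: a batch of 3 partial hypotheses, 5 tokens each.
decoder = StatelessDecoder(vocab_size=500, embedding_dim=256)
out = decoder(torch.randint(0, 500, (3, 5)))
print(out.shape)  # torch.Size([3, 5, 256])
```

The design trade-off: a bounded context makes the decoder cheap and easy to batch during beam search, at a small cost in language-modeling power compared with an LSTM prediction network.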
egs/librispeech/ASR/RESULTS-100hours.md (new file, +75)
@@ -0,0 +1,75 @@
# Results for train-clean-100

This page shows the WERs on test-clean/test-other when using only the
train-clean-100 subset of LibriSpeech as training data.

## Conformer encoder + embedding decoder

### 2022-02-21

|                                     | test-clean | test-other | comment                                   |
|-------------------------------------|------------|------------|-------------------------------------------|
| greedy search (max sym per frame 1) | 6.34       | 16.7       | --epoch 57, --avg 17, --max-duration 100  |
| greedy search (max sym per frame 2) | 6.34       | 16.7       | --epoch 57, --avg 17, --max-duration 100  |
| greedy search (max sym per frame 3) | 6.34       | 16.7       | --epoch 57, --avg 17, --max-duration 100  |
| modified beam search (beam size 4)  | 6.31       | 16.3       | --epoch 57, --avg 17, --max-duration 100  |

The training command for reproducing these results is given below:

```bash
cd egs/librispeech/ASR/
./prepare.sh
./prepare_giga_speech.sh

export CUDA_VISIBLE_DEVICES="0,1"

./transducer_stateless_multi_datasets/train.py \
  --world-size 2 \
  --num-epochs 60 \
  --start-epoch 0 \
  --exp-dir transducer_stateless_multi_datasets/exp-100-2 \
  --full-libri 0 \
  --max-duration 300 \
  --lr-factor 1 \
  --bpe-model data/lang_bpe_500/bpe.model \
  --modified-transducer-prob 0.25 \
  --giga-prob 0.2
```
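With `--full-libri 0`, only train-clean-100 is used on the LibriSpeech side, and the S subset of GigaSpeech (250 hours) is mixed in as extra training data according to `--giga-prob 0.2`; see the change to `run()` in train.py further down, where the GigaSpeech subset is selected based on `full_libri`.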
The decoding command is given below:

```bash
for epoch in 57; do
  for avg in 17; do
    for sym in 1 2 3; do
      ./transducer_stateless_multi_datasets/decode.py \
        --epoch $epoch \
        --avg $avg \
        --exp-dir transducer_stateless_multi_datasets/exp-100-2 \
        --bpe-model ./data/lang_bpe_500/bpe.model \
        --max-duration 100 \
        --context-size 2 \
        --max-sym-per-frame $sym
    done
  done
done

epoch=57
avg=17
./transducer_stateless_multi_datasets/decode.py \
  --epoch $epoch \
  --avg $avg \
  --exp-dir transducer_stateless_multi_datasets/exp-100-2 \
  --bpe-model ./data/lang_bpe_500/bpe.model \
  --max-duration 100 \
  --context-size 2 \
  --decoding-method modified_beam_search \
  --beam-size 4
```

The tensorboard log is available at
<https://tensorboard.dev/experiment/qUEKzMnrTZmOz1EXPda9RA/>

A pre-trained model and decoding logs can be found at
<https://huggingface.co/csukuangfj/icefall-asr-librispeech-100h-transducer-stateless-multi-datasets-bpe-500-2022-02-21>
@@ -191,15 +191,10 @@ def get_transducer_model(params: AttributeDict):
     decoder = get_decoder_model(params)
     joiner = get_joiner_model(params)

-    decoder_giga = get_decoder_model(params)
-    joiner_giga = get_joiner_model(params)
-
     model = Transducer(
         encoder=encoder,
         decoder=decoder,
         joiner=joiner,
-        decoder_giga=decoder_giga,
-        joiner_giga=joiner_giga,
     )

     return model
@@ -20,22 +20,23 @@
 # to a single one using model averaging.
 """
 Usage:
-./transducer_stateless/export.py \
-  --exp-dir ./transducer_stateless/exp \
+./transducer_stateless_multi_datasets/export.py \
+  --exp-dir ./transducer_stateless_multi_datasets/exp \
   --bpe-model data/lang_bpe_500/bpe.model \
   --epoch 20 \
   --avg 10

 It will generate a file exp_dir/pretrained.pt

-To use the generated file with `transducer_stateless/decode.py`, you can do:
+To use the generated file with `transducer_stateless_multi_datasets/decode.py`,
+you can do:

     cd /path/to/exp_dir
     ln -s pretrained.pt epoch-9999.pt

     cd /path/to/egs/librispeech/ASR
-    ./transducer_stateless/decode.py \
-        --exp-dir ./transducer_stateless/exp \
+    ./transducer_stateless_multi_datasets/decode.py \
+        --exp-dir ./transducer_stateless_multi_datasets/exp \
         --epoch 9999 \
         --avg 1 \
         --max-duration 1 \
@@ -84,7 +85,7 @@ def get_parser():
     parser.add_argument(
         "--exp-dir",
         type=str,
-        default="transducer_stateless/exp",
+        default="transducer_stateless_multi_datasets/exp",
         help="""It specifies the directory where all training related
         files, e.g., checkpoints, logs, etc., are saved
         """,
@@ -218,7 +219,9 @@ def main():
                 filenames.append(f"{params.exp_dir}/epoch-{i}.pt")
         logging.info(f"averaging {filenames}")
         model.to(device)
-        model.load_state_dict(average_checkpoints(filenames, device=device))
+        model.load_state_dict(
+            average_checkpoints(filenames, device=device), strict=False
+        )

     model.eval()
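A likely reason for `strict=False` here: with the change to `get_transducer_model()` above, the export/decode-time model no longer instantiates `decoder_giga`/`joiner_giga`, while the averaged training checkpoints still contain weights for those branches; loading non-strictly lets the extra keys be ignored.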
@@ -49,7 +49,7 @@ class LibriSpeech:
         return load_manifest(f)

     def train_other_500_cuts(self) -> CutSet:
-        f = self.args.manifest_dir / "cuts_train-other-500.json.gz"
+        f = self.manifest_dir / "cuts_train-other-500.json.gz"
         logging.info(f"About to get train-other-500 cuts from {f}")
         return load_manifest(f)
@@ -15,6 +15,7 @@
 # limitations under the License.

 import random
+from typing import Optional

 import k2
 import torch
@@ -34,8 +35,8 @@ class Transducer(nn.Module):
         encoder: EncoderInterface,
         decoder: nn.Module,
         joiner: nn.Module,
-        decoder_giga: nn.Module,
-        joiner_giga: nn.Module,
+        decoder_giga: Optional[nn.Module] = None,
+        joiner_giga: Optional[nn.Module] = None,
     ):
         """
         Args:
@@ -60,7 +61,9 @@ class Transducer(nn.Module):
         super().__init__()
         assert isinstance(encoder, EncoderInterface), type(encoder)
         assert hasattr(decoder, "blank_id")
-        assert hasattr(decoder_giga, "blank_id")
+
+        if decoder_giga is not None:
+            assert hasattr(decoder_giga, "blank_id")

         self.encoder = encoder
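The auxiliary `decoder_giga`/`joiner_giga` pair is now optional. As an illustration of the pattern this enables (not the repository's actual `forward()` logic), a forward pass can route a batch to the appropriate decoder/joiner pair and fall back to the main branch when the auxiliary one is absent:

```python
import torch.nn as nn


def pick_branches(model: nn.Module, libri: bool):
    # Illustrative only: use the main decoder/joiner for LibriSpeech batches
    # and the auxiliary pair for GigaSpeech batches. With the change above,
    # the auxiliary pair may be None, in which case everything falls back to
    # the main branch. Assumes `model` has the attributes shown in the diff.
    if libri or model.decoder_giga is None:
        return model.decoder, model.joiner
    return model.decoder_giga, model.joiner_giga
```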
@@ -738,8 +738,13 @@ def run(rank, world_size, args):
     # XS 10 hours
     # DEV 12 hours
     # Test 40 hours
     # train_giga_cuts = gigaspeech.train_M_cuts()
-    train_giga_cuts = gigaspeech.train_S_cuts()
+    if params.full_libri:
+        logging.info("Using the L subset of GigaSpeech (2.5k hours)")
+        train_giga_cuts = gigaspeech.train_L_cuts()
+    else:
+        logging.info("Using the S subset of GigaSpeech (250 hours)")
+        train_giga_cuts = gigaspeech.train_S_cuts()

     train_giga_cuts = filter_short_and_long_utterances(train_giga_cuts)

     if args.enable_musan:
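`filter_short_and_long_utterances()` is applied to the GigaSpeech cuts before training. A hedged sketch of what such a duration-based filter typically looks like with lhotse; the 1-20 second bounds here are an assumption, not values taken from the repository:

```python
from lhotse import CutSet


def filter_short_and_long_utterances(cuts: CutSet) -> CutSet:
    # Drop utterances that are too short to carry useful supervision or so
    # long that they would dominate a max-duration-limited batch.
    return cuts.filter(lambda c: 1.0 <= c.duration <= 20.0)
```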
@@ -868,7 +873,7 @@ def main():
     args = parser.parse_args()
     args.exp_dir = Path(args.exp_dir)

-    assert 0 < args.giga_prob < 1, args.giga_prob
+    assert 0 <= args.giga_prob < 1, args.giga_prob

     world_size = args.world_size
     assert world_size >= 1
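Relaxing the lower bound from `0 <` to `0 <=` lets `--giga-prob 0` disable the GigaSpeech stream entirely. Illustratively, a sampler honoring `giga_prob` might look like the following; the names and structure are assumptions, not the recipe's actual sampler:

```python
import random
from typing import Iterator, Tuple


def next_batch(
    libri_iter: Iterator, giga_iter: Iterator, giga_prob: float
) -> Tuple[object, str]:
    # Draw the next batch from GigaSpeech with probability giga_prob,
    # otherwise from LibriSpeech. giga_prob == 0 never touches giga_iter,
    # which is why the relaxed assertion is safe.
    if giga_prob > 0 and random.random() < giga_prob:
        return next(giga_iter), "giga"
    return next(libri_iter), "libri"
```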