mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-16 04:32:19 +00:00
Add basic descriptions and results
This commit is contained in:
parent
4426715bc8
commit
aad2b7940d
4
egs/fisher_swbd/ASR/README.md
Normal file
4
egs/fisher_swbd/ASR/README.md
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
|
||||||
|
# Introduction
|
||||||
|
|
||||||
|
This is an ASR recipe for the Switchboard and Switchboard+Fisher corpora.
|
49
egs/fisher_swbd/ASR/RESULTS.md
Normal file
49
egs/fisher_swbd/ASR/RESULTS.md
Normal file
@ -0,0 +1,49 @@
|
|||||||
|
## Results
|
||||||
|
|
||||||
|
### SWBD BPE training results (Conformer-CTC)
|
||||||
|
|
||||||
|
#### 01-17-2022
|
||||||
|
|
||||||
|
This recipe is based on LibriSpeech.
|
||||||
|
Data preparation/normalization is a simplified version of the one found in Kaldi.
|
||||||
|
The data is resampled to 16kHz on-the-fly -- it's not needed, but makes it easier to combine with other corpora,
|
||||||
|
and likely doesn't affect the results too much.
|
||||||
|
The training set was only Switchboard, minus 20 held-out conversations (dev data, ~1h of speech).
|
||||||
|
This was tested only on the dev data.
|
||||||
|
We didn't tune the model, hparams, or language model in any special way vs. the LibriSpeech recipe.
|
||||||
|
No rescoring was used (decoding method: "1best").
|
||||||
|
The model was trained on a single A100 GPU (24GB RAM) for 2 days.
|
||||||
|
|
||||||
|
WER (it includes `[LAUGHTER]`, `[NOISE]`, `[VOCALIZED-NOISE]` so the "real" WER is likely lower):
|
||||||
|
|
||||||
|
10 epochs (avg 5) : 19.58%
|
||||||
|
20 epochs (avg 10): 12.61%
|
||||||
|
30 epochs (avg 20): 11.24%
|
||||||
|
35 epochs (avg 20): 10.96%
|
||||||
|
40 epochs (avg 20): 10.94%
|
||||||
|
|
||||||
|
To reproduce the above result, use the following commands for training:
|
||||||
|
|
||||||
|
```
|
||||||
|
cd egs/fisher_swbd/ASR
|
||||||
|
./prepare.sh --swbd-only true
|
||||||
|
export CUDA_VISIBLE_DEVICES="0"
|
||||||
|
./conformer_ctc/train.py \
|
||||||
|
--lr-factor 1.25 \
|
||||||
|
--max-duration 200 \
|
||||||
|
--num-workers 14 \
|
||||||
|
--lang-dir data/lang_bpe_500 \
|
||||||
|
--num-epochs 40
|
||||||
|
```
|
||||||
|
|
||||||
|
and the following command for decoding:
|
||||||
|
|
||||||
|
```
|
||||||
|
python conformer_ctc/decode.py \
|
||||||
|
--epoch 40 \
|
||||||
|
--avg 20 \
|
||||||
|
--method 1best
|
||||||
|
```
|
||||||
|
|
||||||
|
The tensorboard log for training is available at
|
||||||
|
<https://tensorboard.dev/experiment/0mvXl9BYRJ62J1fVnILm0w/>
|
@ -5,25 +5,18 @@ set -eou pipefail
|
|||||||
nj=15
|
nj=15
|
||||||
stage=-1
|
stage=-1
|
||||||
stop_stage=100
|
stop_stage=100
|
||||||
|
swbd_only=false
|
||||||
|
|
||||||
# We assume dl_dir (download dir) contains the following
|
# We assume dl_dir (download dir) contains the following
|
||||||
# directories and files. If not, they will be downloaded
|
# directories and files. Most of them can't be downloaded automatically
|
||||||
# by this script automatically.
|
# as they are not publicly available and require a license purchased
|
||||||
|
# from the LDC.
|
||||||
#
|
#
|
||||||
# - $dl_dir/LibriSpeech
|
# - $dl_dir/{LDC2004S13,LDC2004T19,LDC2005S13,LDC2005T19}
|
||||||
# You can find BOOKS.TXT, test-clean, train-clean-360, etc, inside it.
|
# Fisher LDC packages.
|
||||||
# You can download them from https://www.openslr.org/12
|
|
||||||
#
|
#
|
||||||
# - $dl_dir/lm
|
# - $dl_dir/LDC97S62
|
||||||
# This directory contains the following files downloaded from
|
# Switchboard LDC audio package (transcripts are auto-downloaded)
|
||||||
# http://www.openslr.org/resources/11
|
|
||||||
#
|
|
||||||
# - 3-gram.pruned.1e-7.arpa.gz
|
|
||||||
# - 3-gram.pruned.1e-7.arpa
|
|
||||||
# - 4-gram.arpa.gz
|
|
||||||
# - 4-gram.arpa
|
|
||||||
# - librispeech-vocab.txt
|
|
||||||
# - librispeech-lexicon.txt
|
|
||||||
#
|
#
|
||||||
# - $dl_dir/musan
|
# - $dl_dir/musan
|
||||||
# This directory contains the following directories downloaded from
|
# This directory contains the following directories downloaded from
|
||||||
@ -81,18 +74,14 @@ if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
|
|||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
|
if [ $stage -le 1 ] && [ $stop_stage -ge 1 ] && ! $swbd_only; then
|
||||||
log "Stage 1: Prepare Fisher manifests"
|
log "Stage 1: Prepare Fisher manifests"
|
||||||
# We assume that you have downloaded the LibriSpeech corpus
|
|
||||||
# to $dl_dir/LibriSpeech
|
|
||||||
mkdir -p data/manifests/fisher
|
mkdir -p data/manifests/fisher
|
||||||
lhotse prepare fisher-english --absolute-paths 1 $dl_dir data/manifests/fisher
|
lhotse prepare fisher-english --absolute-paths 1 $dl_dir data/manifests/fisher
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
|
if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
|
||||||
log "Stage 2: Prepare SWBD manifests"
|
log "Stage 2: Prepare SWBD manifests"
|
||||||
# We assume that you have downloaded the LibriSpeech corpus
|
|
||||||
# to $dl_dir/LibriSpeech
|
|
||||||
mkdir -p data/manifests/swbd
|
mkdir -p data/manifests/swbd
|
||||||
lhotse prepare switchboard --absolute-paths 1 --omit-silence $dl_dir/LDC97S62 data/manifests/swbd
|
lhotse prepare switchboard --absolute-paths 1 --omit-silence $dl_dir/LDC97S62 data/manifests/swbd
|
||||||
fi
|
fi
|
||||||
@ -113,14 +102,21 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
|
|||||||
set -x
|
set -x
|
||||||
|
|
||||||
# Combine Fisher and SWBD recordings and supervisions
|
# Combine Fisher and SWBD recordings and supervisions
|
||||||
lhotse combine \
|
if $swbd_only; then
|
||||||
data/manifests/fisher/recordings.jsonl.gz \
|
cp data/manifests/swbd/swbd_recordings.jsonl \
|
||||||
data/manifests/swbd/swbd_recordings.jsonl \
|
data/manifests/fisher-swbd_recordings.jsonl.gz
|
||||||
data/manifests/fisher-swbd_recordings.jsonl.gz
|
cp data/manifests/swbd/swbd_supervisions.jsonl \
|
||||||
lhotse combine \
|
data/manifests/fisher-swbd_supervisions.jsonl.gz
|
||||||
data/manifests/fisher/supervisions.jsonl.gz \
|
else
|
||||||
data/manifests/swbd/swbd_supervisions.jsonl \
|
lhotse combine \
|
||||||
data/manifests/fisher-swbd_supervisions.jsonl.gz
|
data/manifests/fisher/recordings.jsonl.gz \
|
||||||
|
data/manifests/swbd/swbd_recordings.jsonl \
|
||||||
|
data/manifests/fisher-swbd_recordings.jsonl.gz
|
||||||
|
lhotse combine \
|
||||||
|
data/manifests/fisher/supervisions.jsonl.gz \
|
||||||
|
data/manifests/swbd/swbd_supervisions.jsonl \
|
||||||
|
data/manifests/fisher-swbd_supervisions.jsonl.gz
|
||||||
|
fi
|
||||||
|
|
||||||
# Normalize text and remove supervisions that are not useful / hard to handle.
|
# Normalize text and remove supervisions that are not useful / hard to handle.
|
||||||
python local/normalize_and_filter_supervisions.py \
|
python local/normalize_and_filter_supervisions.py \
|
||||||
|
Loading…
x
Reference in New Issue
Block a user