From aad2b7940d16885ce707e97f839e1942f9524482 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20=C5=BBelasko?= Date: Mon, 17 Jan 2022 23:02:52 +0000 Subject: [PATCH] Add basic descriptions and results --- egs/fisher_swbd/ASR/README.md | 4 +++ egs/fisher_swbd/ASR/RESULTS.md | 49 ++++++++++++++++++++++++++++++++ egs/fisher_swbd/ASR/prepare.sh | 52 ++++++++++++++++------------------ 3 files changed, 77 insertions(+), 28 deletions(-) create mode 100644 egs/fisher_swbd/ASR/README.md create mode 100644 egs/fisher_swbd/ASR/RESULTS.md diff --git a/egs/fisher_swbd/ASR/README.md b/egs/fisher_swbd/ASR/README.md new file mode 100644 index 000000000..3b7231a9e --- /dev/null +++ b/egs/fisher_swbd/ASR/README.md @@ -0,0 +1,4 @@ + +# Introduction + +This is an ASR recipe for Switchboard and Switchboard+Fisher corpora. \ No newline at end of file diff --git a/egs/fisher_swbd/ASR/RESULTS.md b/egs/fisher_swbd/ASR/RESULTS.md new file mode 100644 index 000000000..b4f1d0db2 --- /dev/null +++ b/egs/fisher_swbd/ASR/RESULTS.md @@ -0,0 +1,49 @@ +## Results + +### SWBD BPE training results (Conformer-CTC) + +#### 01-17-2022 + +This recipe is based on LibriSpeech. +Data preparation/normalization is a simplified version of the one found in Kaldi. +The data is resampled to 16kHz on-the-fly -- it's not needed, but makes it easier to combine with other corpora, +and likely doesn't affect the results too much. +The training set was only Switchboard, minus 20 held-out conversations (dev data, ~1h of speech). +This was tested only on the dev data. +We didn't tune the model, hparams, or language model in any special way vs. LibriSpeech recipe. +No rescoring was used (decoding method: "1best"). +The model was trained on a single A100 GPU (24GB RAM) for 2 days. 
+ +WER (it includes `[LAUGHTER]`, `[NOISE]`, `[VOCALIZED-NOISE]` so the "real" WER is likely lower): + +10 epochs (avg 5) : 19.58% +20 epochs (avg 10): 12.61% +30 epochs (avg 20): 11.24% +35 epochs (avg 20): 10.96% +40 epochs (avg 20): 10.94% + +To reproduce the above result, use the following commands for training: + +``` +cd egs/fisher_swbd/ASR +./prepare.sh --swbd-only true +export CUDA_VISIBLE_DEVICES="0" +./conformer_ctc/train.py \ + --lr-factor 1.25 \ + --max-duration 200 \ + --num-workers 14 \ + --lang-dir data/lang_bpe_500 \ + --num-epochs 40 +``` + +and the following command for decoding: + +``` +python conformer_ctc/decode.py \ + --epoch 40 \ + --avg 20 \ + --method 1best +``` + +The tensorboard log for training is available at + diff --git a/egs/fisher_swbd/ASR/prepare.sh b/egs/fisher_swbd/ASR/prepare.sh index 9ef2d7363..4a23bff5c 100755 --- a/egs/fisher_swbd/ASR/prepare.sh +++ b/egs/fisher_swbd/ASR/prepare.sh @@ -5,25 +5,18 @@ set -eou pipefail nj=15 stage=-1 stop_stage=100 +swbd_only=false # We assume dl_dir (download dir) contains the following -# directories and files. If not, they will be downloaded -# by this script automatically. +# directories and files. Most of them can't be downloaded automatically +# as they are not publicly available and require a license purchased +# from the LDC. # -# - $dl_dir/LibriSpeech -# You can find BOOKS.TXT, test-clean, train-clean-360, etc, inside it. -# You can download them from https://www.openslr.org/12 +# - $dl_dir/{LDC2004S13,LDC2004T19,LDC2005S13,LDC2005T19} +# Fisher LDC packages. 
# -# - $dl_dir/lm -# This directory contains the following files downloaded from -# http://www.openslr.org/resources/11 -# -# - 3-gram.pruned.1e-7.arpa.gz -# - 3-gram.pruned.1e-7.arpa -# - 4-gram.arpa.gz -# - 4-gram.arpa -# - librispeech-vocab.txt -# - librispeech-lexicon.txt +# - $dl_dir/LDC97S62 +# Switchboard LDC audio package (transcripts are auto-downloaded) # # - $dl_dir/musan # This directory contains the following directories downloaded from @@ -81,18 +74,14 @@ if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then fi fi -if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then +if [ $stage -le 1 ] && [ $stop_stage -ge 1 ] && ! $swbd_only; then log "Stage 1: Prepare Fisher manifests" - # We assume that you have downloaded the LibriSpeech corpus - # to $dl_dir/LibriSpeech mkdir -p data/manifests/fisher lhotse prepare fisher-english --absolute-paths 1 $dl_dir data/manifests/fisher fi if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then log "Stage 2: Prepare SWBD manifests" - # We assume that you have downloaded the LibriSpeech corpus - # to $dl_dir/LibriSpeech mkdir -p data/manifests/swbd lhotse prepare switchboard --absolute-paths 1 --omit-silence $dl_dir/LDC97S62 data/manifests/swbd fi @@ -113,14 +102,21 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then set -x # Combine Fisher and SWBD recordings and supervisions - lhotse combine \ - data/manifests/fisher/recordings.jsonl.gz \ - data/manifests/swbd/swbd_recordings.jsonl \ - data/manifests/fisher-swbd_recordings.jsonl.gz - lhotse combine \ - data/manifests/fisher/supervisions.jsonl.gz \ - data/manifests/swbd/swbd_supervisions.jsonl \ - data/manifests/fisher-swbd_supervisions.jsonl.gz + if $swbd_only; then # targets are named *.jsonl.gz, so gzip the plain SWBD manifests rather than copying them + gzip -c data/manifests/swbd/swbd_recordings.jsonl \ + > data/manifests/fisher-swbd_recordings.jsonl.gz + gzip -c data/manifests/swbd/swbd_supervisions.jsonl \ + > data/manifests/fisher-swbd_supervisions.jsonl.gz + else + lhotse combine \ + data/manifests/fisher/recordings.jsonl.gz \ + data/manifests/swbd/swbd_recordings.jsonl 
\ + data/manifests/fisher-swbd_recordings.jsonl.gz + lhotse combine \ + data/manifests/fisher/supervisions.jsonl.gz \ + data/manifests/swbd/swbd_supervisions.jsonl \ + data/manifests/fisher-swbd_supervisions.jsonl.gz + fi # Normalize text and remove supervisions that are not useful / hard to handle. python local/normalize_and_filter_supervisions.py \