From aad2b7940d16885ce707e97f839e1942f9524482 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Piotr=20=C5=BBelasko?= <petezor@gmail.com>
Date: Mon, 17 Jan 2022 23:02:52 +0000
Subject: [PATCH] Add basic descriptions and results

---
 egs/fisher_swbd/ASR/README.md  |  4 +++
 egs/fisher_swbd/ASR/RESULTS.md | 49 ++++++++++++++++++++++++++++++++
 egs/fisher_swbd/ASR/prepare.sh | 52 ++++++++++++++++------------------
 3 files changed, 77 insertions(+), 28 deletions(-)
 create mode 100644 egs/fisher_swbd/ASR/README.md
 create mode 100644 egs/fisher_swbd/ASR/RESULTS.md

diff --git a/egs/fisher_swbd/ASR/README.md b/egs/fisher_swbd/ASR/README.md
new file mode 100644
index 000000000..3b7231a9e
--- /dev/null
+++ b/egs/fisher_swbd/ASR/README.md
@@ -0,0 +1,4 @@
+
+# Introduction
+
+This is an ASR recipe for Switchboard and Switchboard+Fisher corpora.
\ No newline at end of file
diff --git a/egs/fisher_swbd/ASR/RESULTS.md b/egs/fisher_swbd/ASR/RESULTS.md
new file mode 100644
index 000000000..b4f1d0db2
--- /dev/null
+++ b/egs/fisher_swbd/ASR/RESULTS.md
@@ -0,0 +1,49 @@
+## Results
+
+### SWBD BPE training results (Conformer-CTC)
+
+#### 01-17-2022
+
+This recipe is based on the LibriSpeech recipe.
+Data preparation/normalization is a simplified version of the one found in Kaldi.
+The data is resampled to 16kHz on-the-fly -- it's not needed, but makes it easier to combine with other corpora,
+and likely doesn't affect the results too much.
+The training set was only Switchboard, minus 20 held-out conversations (dev data, ~1h of speech).
+This was tested only on the dev data.
+We didn't tune the model, hparams, or language model in any special way vs. the LibriSpeech recipe.
+No rescoring was used (decoding method: "1best").
+The model was trained on a single A100 GPU (24GB RAM) for 2 days.
+
+WER (it includes `[LAUGHTER]`, `[NOISE]`, `[VOCALIZED-NOISE]` so the "real" WER is likely lower):
+
+- 10 epochs (avg 5) : 19.58%
+- 20 epochs (avg 10): 12.61%
+- 30 epochs (avg 20): 11.24%
+- 35 epochs (avg 20): 10.96%
+- 40 epochs (avg 20): 10.94%
+
+To reproduce the above result, use the following commands for training:
+
+```
+cd egs/fisher_swbd/ASR
+./prepare.sh --swbd-only true
+export CUDA_VISIBLE_DEVICES="0"
+./conformer_ctc/train.py \
+  --lr-factor 1.25 \
+  --max-duration 200 \
+  --num-workers 14 \
+  --lang-dir data/lang_bpe_500 \
+  --num-epochs 40
+```
+
+and the following command for decoding:
+
+```
+python conformer_ctc/decode.py \
+  --epoch 40 \
+  --avg 20 \
+  --method 1best
+```
+
+The tensorboard log for training is available at
+<https://tensorboard.dev/experiment/0mvXl9BYRJ62J1fVnILm0w/>
diff --git a/egs/fisher_swbd/ASR/prepare.sh b/egs/fisher_swbd/ASR/prepare.sh
index 9ef2d7363..4a23bff5c 100755
--- a/egs/fisher_swbd/ASR/prepare.sh
+++ b/egs/fisher_swbd/ASR/prepare.sh
@@ -5,25 +5,18 @@ set -eou pipefail
 nj=15
 stage=-1
 stop_stage=100
+swbd_only=false
 
 # We assume dl_dir (download dir) contains the following
-# directories and files. If not, they will be downloaded
-# by this script automatically.
+# directories and files. Most of them can't be downloaded automatically
+# as they are not publicly available and require a license purchased
+# from the LDC.
 #
-#  - $dl_dir/LibriSpeech
-#      You can find BOOKS.TXT, test-clean, train-clean-360, etc, inside it.
-#      You can download them from https://www.openslr.org/12
+#  - $dl_dir/{LDC2004S13,LDC2004T19,LDC2005S13,LDC2005T19}
+#      Fisher LDC packages.
 #
-#  - $dl_dir/lm
-#      This directory contains the following files downloaded from
-#       http://www.openslr.org/resources/11
-#
-#        - 3-gram.pruned.1e-7.arpa.gz
-#        - 3-gram.pruned.1e-7.arpa
-#        - 4-gram.arpa.gz
-#        - 4-gram.arpa
-#        - librispeech-vocab.txt
-#        - librispeech-lexicon.txt
+#  - $dl_dir/LDC97S62
+#      Switchboard LDC audio package (transcripts are auto-downloaded)
 #
 #  - $dl_dir/musan
 #      This directory contains the following directories downloaded from
@@ -81,18 +74,14 @@ if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
   fi
 fi
 
-if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
+if [ $stage -le 1 ] && [ $stop_stage -ge 1 ] && ! $swbd_only; then
   log "Stage 1: Prepare Fisher manifests"
-  # We assume that you have downloaded the LibriSpeech corpus
-  # to $dl_dir/LibriSpeech
   mkdir -p data/manifests/fisher
   lhotse prepare fisher-english --absolute-paths 1 $dl_dir data/manifests/fisher
 fi
 
 if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
   log "Stage 2: Prepare SWBD manifests"
-  # We assume that you have downloaded the LibriSpeech corpus
-  # to $dl_dir/LibriSpeech
   mkdir -p data/manifests/swbd
   lhotse prepare switchboard --absolute-paths 1 --omit-silence $dl_dir/LDC97S62 data/manifests/swbd
 fi
@@ -113,14 +102,21 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
   set -x
 
   # Combine Fisher and SWBD recordings and supervisions
-  lhotse combine \
-   data/manifests/fisher/recordings.jsonl.gz \
-   data/manifests/swbd/swbd_recordings.jsonl \
-   data/manifests/fisher-swbd_recordings.jsonl.gz
-  lhotse combine \
-   data/manifests/fisher/supervisions.jsonl.gz \
-   data/manifests/swbd/swbd_supervisions.jsonl \
-   data/manifests/fisher-swbd_supervisions.jsonl.gz
+  if $swbd_only; then  # SWBD-only: nothing to combine; gzip so content matches the .jsonl.gz name
+    gzip -c data/manifests/swbd/swbd_recordings.jsonl \
+      > data/manifests/fisher-swbd_recordings.jsonl.gz
+    gzip -c data/manifests/swbd/swbd_supervisions.jsonl \
+      > data/manifests/fisher-swbd_supervisions.jsonl.gz
+  else
+    lhotse combine \
+      data/manifests/fisher/recordings.jsonl.gz \
+      data/manifests/swbd/swbd_recordings.jsonl \
+      data/manifests/fisher-swbd_recordings.jsonl.gz
+    lhotse combine \
+      data/manifests/fisher/supervisions.jsonl.gz \
+      data/manifests/swbd/swbd_supervisions.jsonl \
+      data/manifests/fisher-swbd_supervisions.jsonl.gz
+  fi
 
   # Normalize text and remove supervisions that are not useful / hard to handle.
   python local/normalize_and_filter_supervisions.py \