From a7666d864c96b9c3bdf1a980535232b8f2db088c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Piotr=20=C5=BBelasko?=
Date: Sat, 15 Jan 2022 01:27:31 +0000
Subject: [PATCH] Missing steps in prepare.sh

---
 .../normalize_and_filter_supervisions.py | 51 +++++++++++------
 egs/fisher_swbd/ASR/prepare.sh           | 55 ++++++++++++-------
 2 files changed, 68 insertions(+), 38 deletions(-)

diff --git a/egs/fisher_swbd/ASR/local/normalize_and_filter_supervisions.py b/egs/fisher_swbd/ASR/local/normalize_and_filter_supervisions.py
index be9715578..9933d3a4f 100644
--- a/egs/fisher_swbd/ASR/local/normalize_and_filter_supervisions.py
+++ b/egs/fisher_swbd/ASR/local/normalize_and_filter_supervisions.py
@@ -17,15 +17,34 @@ def get_args():
     return parser.parse_args()
 
 
-# Note: the functions "normalize" and "keep" implement the logic similar to
-# Kaldi's data prep scripts for Fisher:
-# https://github.com/kaldi-asr/kaldi/blob/master/egs/fisher_swbd/s5/local/fisher_data_prep.sh
-# and for SWBD:
-# https://github.com/kaldi-asr/kaldi/blob/master/egs/fisher_swbd/s5/local/swbd1_data_prep.sh
+class FisherSwbdNormalizer:
+    """
+    Note: the functions "normalize" and "keep" implement logic similar to
+    Kaldi's data prep scripts for Fisher:
+    https://github.com/kaldi-asr/kaldi/blob/master/egs/fisher_swbd/s5/local/fisher_data_prep.sh
+    and for SWBD:
+    https://github.com/kaldi-asr/kaldi/blob/master/egs/fisher_swbd/s5/local/swbd1_data_prep.sh
+
+    One notable difference is that we don't change [cough], [lipsmack], etc. to [noise].
+    We also don't implement all of Kaldi's normalization edge cases
+    (hopefully that won't make too much of a difference).
+    """
 
-class Normalizer:
     def __init__(self) -> None:
+
+        self.remove_regexp_before = re.compile(
+            r"|".join([
+                # special symbols
+                r"\[\[SKIP.*\]\]",
+                r"\[SKIP.*\]",
+                r"\[PAUSE.*\]",
+                r"\[SILENCE\]",
+                r"<B_ASIDE>",
+                r"<E_ASIDE>",
+            ])
+        )
+
         # tuples of (pattern, replacement)
         # note: Kaldi replaces sighs, coughs, etc with [noise].
         # We don't do that here.
@@ -63,19 +82,12 @@ class Normalizer:
             (re.compile(r"\s-\s"), r" "),
             (re.compile(r"\s-\s"), r" "),
             # special symbol with trailing dash
-            (re.compile(r"(\[\w+\])-"), r"\1"),
+            (re.compile(r"(\[.*?\])-"), r"\1"),
         ]
 
         # unwanted symbols in the transcripts
-        self.remove_regexp = re.compile(
+        self.remove_regexp_after = re.compile(
             r"|".join([
-                # special symbols
-                r"\[\[SKIP.*\]\]",
-                r"\[SKIP.*\]",
-                r"\[PAUSE.*\]",
-                r"\[SILENCE\]",
-                r"<B_ASIDE>",
-                r"<E_ASIDE>",
                 # remaining punctuation
                 r"\.",
                 r",",
@@ -92,12 +104,15 @@ class Normalizer:
     def normalize(self, text: str) -> str:
         text = text.upper()
 
-        # first replace
+        # first remove
+        text = self.remove_regexp_before.sub("", text)
+
+        # then replace
         for pattern, sub in self.replace_regexps:
             text = pattern.sub(sub, text)
 
         # then remove
-        text = self.remove_regexp.sub("", text)
+        text = self.remove_regexp_after.sub("", text)
 
         # then clean up whitespace
         text = self.whitespace_regexp.sub(" ", text).strip()
@@ -159,6 +174,8 @@ def test():
         "-[ADV]AN[TAGE]",
         "-[ADV]AN[TAGE]-",
         "[WEA[SONABLE]-/REASONABLE]",
+        "[VOCALIZED-NOISE]-",
+        "~BULL",
     ]:
         print(text)
         print(normalizer.normalize(text))
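For orientation, the normalizer above is applied inside the script roughly as sketched below. This is a sketch, not the script's exact main loop: it assumes the class is importable from local/normalize_and_filter_supervisions.py, uses lhotse's SupervisionSet API, and stands in a minimal "keep" predicate for the real filtering helper mentioned in the docstring.

    from lhotse import SupervisionSet

    from normalize_and_filter_supervisions import FisherSwbdNormalizer

    def keep(sup) -> bool:
        # Illustrative stand-in for the script's "keep": drop segments
        # whose text comes out empty after normalization.
        return len(sup.text.strip()) > 0

    normalizer = FisherSwbdNormalizer()
    sups = SupervisionSet.from_file(
        "data/manifests/fisher-swbd_supervisions.jsonl.gz"
    )
    kept = []
    for sup in sups:
        # Normalize the transcript in place, then filter.
        sup.text = normalizer.normalize(sup.text)
        if keep(sup):
            kept.append(sup)
    SupervisionSet.from_segments(kept).to_file(
        "data/manifests/fisher-swbd_supervisions_norm.jsonl.gz"
    )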
diff --git a/egs/fisher_swbd/ASR/prepare.sh b/egs/fisher_swbd/ASR/prepare.sh
index dc89dfdac..859e0d34e 100755
--- a/egs/fisher_swbd/ASR/prepare.sh
+++ b/egs/fisher_swbd/ASR/prepare.sh
@@ -56,12 +56,6 @@ log() {
 
 log "dl_dir: $dl_dir"
 
-if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
-  log "Stage -1: Download LM"
-  #[ ! -e $dl_dir/lm ] && mkdir -p $dl_dir/lm
-  #./local/download_lm.py --out-dir=$dl_dir/lm
-fi
-
 if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
   log "Stage 0: Download data"
 
@@ -116,35 +110,60 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
 
   set -x
 
+  # Combine Fisher and SWBD recordings and supervisions.
   lhotse combine \
     data/manifests/fisher/recordings.jsonl.gz \
     data/manifests/swbd/swbd_recordings.jsonl \
     data/manifests/fisher-swbd_recordings.jsonl.gz
-
   lhotse combine \
     data/manifests/fisher/supervisions.jsonl.gz \
     data/manifests/swbd/swbd_supervisions.jsonl \
     data/manifests/fisher-swbd_supervisions.jsonl.gz
 
+  # Normalize the text and remove supervisions that are not useful or hard to handle.
   python local/normalize_and_filter_supervisions.py \
     data/manifests/fisher-swbd_supervisions.jsonl.gz \
     data/manifests/fisher-swbd_supervisions_norm.jsonl.gz
 
+  # Create cuts that span whole recording sessions.
   lhotse cut simple \
     -r data/manifests/fisher-swbd_recordings.jsonl.gz \
     -s data/manifests/fisher-swbd_supervisions_norm.jsonl.gz \
     data/manifests/fisher-swbd_cuts_unshuf.jsonl.gz
 
+  # Shuffle the cuts (pure bash pipes are fast).
+  # We could technically skip this step, but it helps ensure
+  # that SWBD is not seen only towards the end of training.
   gunzip -c data/manifests/fisher-swbd_cuts_unshuf.jsonl.gz \
     | shuf \
     | gzip -c \
     > data/manifests/fisher-swbd_cuts.jsonl.gz
 
+  # Create the train/dev split -- 20 sessions for dev is roughly 2h, which should be enough.
+  num_cuts="$(gunzip -c data/manifests/fisher-swbd_cuts.jsonl.gz | wc -l)"
+  num_dev_sessions=20
+  lhotse subset --first $num_dev_sessions \
+    data/manifests/fisher-swbd_cuts.jsonl.gz \
+    data/manifests/dev_fisher-swbd_cuts.jsonl.gz
+  lhotse subset --last $((num_cuts-num_dev_sessions)) \
+    data/manifests/fisher-swbd_cuts.jsonl.gz \
+    data/manifests/train_fisher-swbd_cuts.jsonl.gz
+
+  # Finally, split the full-session cuts into one cut per supervision segment.
+  # If any segments overlap, we discard the information about the overlaps
+  # (overlaps are unlikely for this dataset, since each cut sees only one channel).
+  lhotse cut trim-to-supervisions \
+    --discard-overlapping \
+    data/manifests/train_fisher-swbd_cuts.jsonl.gz \
+    data/manifests/train_utterances_fisher-swbd_cuts.jsonl.gz
+  lhotse cut trim-to-supervisions \
+    --discard-overlapping \
+    data/manifests/dev_fisher-swbd_cuts.jsonl.gz \
+    data/manifests/dev_utterances_fisher-swbd_cuts.jsonl.gz
+
   set +x
 fi
 
-# TODO: optional stage 5, compute features
-
 if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
   log "Stage 6: Dump transcripts for LM training"
   mkdir -p data/lm
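The stage-4 pipeline above can also be expressed with lhotse's Python API, which is handy for inspecting the manifests interactively. A sketch, assuming the CutSet methods (from_manifests, shuffle, subset, trim_to_supervisions) mirror the CLI commands and flags used in the script; paths are the ones from prepare.sh.

    from lhotse import CutSet, RecordingSet, SupervisionSet

    recs = RecordingSet.from_file("data/manifests/fisher-swbd_recordings.jsonl.gz")
    sups = SupervisionSet.from_file("data/manifests/fisher-swbd_supervisions_norm.jsonl.gz")

    # One cut per recording session, then shuffle so SWBD is mixed into Fisher.
    cuts = CutSet.from_manifests(recordings=recs, supervisions=sups).shuffle()

    # 20 whole sessions for dev, the rest for training.
    dev = cuts.subset(first=20)
    train = cuts.subset(last=len(cuts) - 20)

    # One cut per supervision segment; dropping overlap info corresponds
    # to the --discard-overlapping flag of the CLI.
    dev.trim_to_supervisions(keep_overlapping=False).to_file(
        "data/manifests/dev_utterances_fisher-swbd_cuts.jsonl.gz"
    )
    train.trim_to_supervisions(keep_overlapping=False).to_file(
        "data/manifests/train_utterances_fisher-swbd_cuts.jsonl.gz"
    )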
@@ -154,18 +173,6 @@ if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
     > data/lm/transcript_words.txt
 fi
 
-#if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
-#  log "Stage 3: Compute fbank for librispeech"
-#  mkdir -p data/fbank
-#  ./local/compute_fbank_librispeech.py
-#fi
-#
-#if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
-#  log "Stage 4: Compute fbank for musan"
-#  mkdir -p data/fbank
-#  ./local/compute_fbank_musan.py
-#fi
-
 if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
   log "Stage 7: Prepare lexicon using g2p_en"
   lang_dir=data/lang_phone
@@ -186,6 +193,12 @@ if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
     | awk '{print $0,NR+2}' \
     >> $lang_dir/words.txt
 
+  # Add remaining special word symbols expected by LM scripts.
+  num_words=$(wc -l < $lang_dir/words.txt)
+  echo "<s> $((num_words))" >> $lang_dir/words.txt
+  echo "</s> $((num_words+1))" >> $lang_dir/words.txt
+  echo "#0 $((num_words+2))" >> $lang_dir/words.txt
+
   if [ ! -f $lang_dir/L_disambig.pt ]; then
     pip install g2p_en
     ./local/prepare_lang_g2pen.py --lang-dir $lang_dir
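As a quick sanity check for stage 7, g2p_en can be exercised directly before running prepare_lang_g2pen.py. A minimal sketch (the word list is illustrative; g2p_en emits ARPAbet phones, with stress digits on vowels):

    from g2p_en import G2p

    g2p = G2p()
    for word in ["BULL", "FISHER", "SWITCHBOARD"]:
        # G2p returns a token list in which blanks mark word boundaries,
        # so keep only the non-blank phone tokens.
        phones = [p for p in g2p(word) if p.strip()]
        print(word, " ".join(phones))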