Missing steps in prepare.sh

Piotr Żelasko 2022-01-15 01:27:31 +00:00
parent ccab93e8e2
commit a7666d864c
2 changed files with 68 additions and 38 deletions

File: local/normalize_and_filter_supervisions.py

@@ -17,15 +17,34 @@ def get_args():
return parser.parse_args()
# Note: the functions "normalize" and "keep" implement the logic similar to
# Kaldi's data prep scripts for Fisher:
# https://github.com/kaldi-asr/kaldi/blob/master/egs/fisher_swbd/s5/local/fisher_data_prep.sh
# and for SWBD:
# https://github.com/kaldi-asr/kaldi/blob/master/egs/fisher_swbd/s5/local/swbd1_data_prep.sh
class FisherSwbdNormalizer:
"""
Note: the functions "normalize" and "keep" implement logic similar to
Kaldi's data prep scripts for Fisher:
https://github.com/kaldi-asr/kaldi/blob/master/egs/fisher_swbd/s5/local/fisher_data_prep.sh
and for SWBD:
https://github.com/kaldi-asr/kaldi/blob/master/egs/fisher_swbd/s5/local/swbd1_data_prep.sh
One notable difference is that we don't change [cough], [lipsmack], etc. to [noise].
We also don't implement all of Kaldi's normalization edge cases
(hopefully this won't make too much difference).
"""
class Normalizer:
def __init__(self) -> None:
self.remove_regexp_before = re.compile(
r"|".join([
# special symbols
r"\[\[SKIP.*\]\]",
r"\[SKIP.*\]",
r"\[PAUSE.*\]",
r"\[SILENCE\]",
r"<B_ASIDE>",
r"<E_ASIDE>",
])
)
# tuples of (pattern, replacement)
# note: Kaldi replaces sighs, coughs, etc. with [noise].
# We don't do that here.
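For illustration, a minimal sketch of how one of these (pattern, replacement) tuples behaves when applied with re.sub; the sample string is made up:

import re

# strip a dash trailing a bracketed special symbol: "[NOISE]-" -> "[NOISE]"
pattern, sub = re.compile(r"(\[.*?\])-"), r"\1"
print(pattern.sub(sub, "[NOISE]- OKAY"))  # prints: [NOISE] OKAY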
@@ -63,19 +82,12 @@ class Normalizer:
(re.compile(r"\s-\s"), r" "),
(re.compile(r"\s-\s"), r" "),
# special symbol with trailing dash
(re.compile(r"(\[\w+\])-"), r"\1"),
(re.compile(r"(\[.*?\])-"), r"\1"),
]
# unwanted symbols in the transcripts
self.remove_regexp = re.compile(
self.remove_regexp_after = re.compile(
r"|".join([
# special symbols
r"\[\[SKIP.*\]\]",
r"\[SKIP.*\]",
r"\[PAUSE.*\]",
r"\[SILENCE\]",
r"<B_ASIDE>",
r"<E_ASIDE>",
# remaining punctuation
r"\.",
r",",
@@ -92,12 +104,15 @@ class Normalizer:
def normalize(self, text: str) -> str:
text = text.upper()
# first replace
# first remove
text = self.remove_regexp_before.sub("", text)
# then replace
for pattern, sub in self.replace_regexps:
text = pattern.sub(sub, text)
# then remove
text = self.remove_regexp.sub("", text)
text = self.remove_regexp_after.sub("", text)
# then clean up whitespace
text = self.whitespace_regexp.sub(" ", text).strip()
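A quick usage sketch (the input string is made up, and we assume none of the elided rules touch the words HELLO and WORLD):

normalizer = FisherSwbdNormalizer()
print(normalizer.normalize("hello [SILENCE] world"))
# prints: HELLO WORLD -- upper-cased, [SILENCE] removed, whitespace collapsed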
@@ -159,6 +174,8 @@ def test():
"-[ADV]AN[TAGE]",
"-[ADV]AN[TAGE]-",
"[WEA[SONABLE]-/REASONABLE]",
"[VOCALIZED-NOISE]-",
"~BULL",
]:
print(text)
print(normalizer.normalize(text))

File: prepare.sh

@@ -56,12 +56,6 @@ log() {
log "dl_dir: $dl_dir"
if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
log "Stage -1: Download LM"
#[ ! -e $dl_dir/lm ] && mkdir -p $dl_dir/lm
#./local/download_lm.py --out-dir=$dl_dir/lm
fi
if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
log "Stage 0: Download data"
@@ -116,35 +110,60 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
set -x
# Combine Fisher and SWBD recordings and supervisions
lhotse combine \
data/manifests/fisher/recordings.jsonl.gz \
data/manifests/swbd/swbd_recordings.jsonl \
data/manifests/fisher-swbd_recordings.jsonl.gz
lhotse combine \
data/manifests/fisher/supervisions.jsonl.gz \
data/manifests/swbd/swbd_supervisions.jsonl \
data/manifests/fisher-swbd_supervisions.jsonl.gz
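# (note: lhotse combine essentially concatenates the input manifests;
# it reads gzipped and plain jsonl alike, dispatching on the file extension.)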
# Normalize text and remove supervisions that are not useful or are hard to handle.
python local/normalize_and_filter_supervisions.py \
data/manifests/fisher-swbd_supervisions.jsonl.gz \
data/manifests/fisher-swbd_supervisions_norm.jsonl.gz
# Create cuts that span whole recording sessions.
lhotse cut simple \
-r data/manifests/fisher-swbd_recordings.jsonl.gz \
-s data/manifests/fisher-swbd_supervisions_norm.jsonl.gz \
data/manifests/fisher-swbd_cuts_unshuf.jsonl.gz
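# (this yields one cut per recording channel, spanning its full duration,
# with that channel's supervisions attached.)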
# Shuffle the cuts (pure bash pipes are fast).
# We could technically skip this step, but it helps ensure
# that SWBD is not seen only towards the end of training.
gunzip -c data/manifests/fisher-swbd_cuts_unshuf.jsonl.gz \
| shuf \
| gzip -c \
> data/manifests/fisher-swbd_cuts.jsonl.gz
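# (shuf is unseeded, so this shuffle -- and the dev/train split below --
# is not reproducible across runs; GNU shuf accepts --random-source
# if determinism is ever needed.)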
# Create train/dev split -- 20 sessions for dev is roughly 2h, which should be enough.
num_cuts="$(gunzip -c data/manifests/fisher-swbd_cuts.jsonl.gz | wc -l)"
num_dev_sessions=20
lhotse subset --first $num_dev_sessions \
data/manifests/fisher-swbd_cuts.jsonl.gz \
data/manifests/dev_fisher-swbd_cuts.jsonl.gz
lhotse subset --last $((num_cuts-num_dev_sessions)) \
data/manifests/fisher-swbd_cuts.jsonl.gz \
data/manifests/train_fisher-swbd_cuts.jsonl.gz
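# (illustrative arithmetic with made-up numbers: if there are 10020
# full-session cuts, dev takes the first 20 and train the remaining 10000.)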
# Finally, split the full-session cuts into one cut per supervision segment.
# If any segments overlap, we discard the information about the overlaps
# (overlaps are unlikely for this dataset because each cut sees only one channel).
lhotse cut trim-to-supervisions \
--discard-overlapping \
data/manifests/train_fisher-swbd_cuts.jsonl.gz \
data/manifests/train_utterances_fisher-swbd_cuts.jsonl.gz
lhotse cut trim-to-supervisions \
--discard-overlapping \
data/manifests/dev_fisher-swbd_cuts.jsonl.gz \
data/manifests/dev_utterances_fisher-swbd_cuts.jsonl.gz
set +x
fi
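For reference, a rough Python-API equivalent of the stage above (an untested sketch: it skips the text normalization step, and lhotse API details may differ between versions):

from lhotse import CutSet, RecordingSet, SupervisionSet

recs = RecordingSet.from_file(
    "data/manifests/fisher/recordings.jsonl.gz"
) + RecordingSet.from_file("data/manifests/swbd/swbd_recordings.jsonl")
sups = SupervisionSet.from_file(
    "data/manifests/fisher/supervisions.jsonl.gz"
) + SupervisionSet.from_file("data/manifests/swbd/swbd_supervisions.jsonl")
# One full-session cut per recording channel, shuffled, then split 20/rest.
cuts = CutSet.from_manifests(recordings=recs, supervisions=sups).to_eager().shuffle()
dev = cuts.subset(first=20)
train = cuts.subset(last=len(cuts) - 20)
# One cut per supervision segment, dropping overlap info.
dev.trim_to_supervisions(keep_overlapping=False).to_file(
    "data/manifests/dev_utterances_fisher-swbd_cuts.jsonl.gz"
)
train.trim_to_supervisions(keep_overlapping=False).to_file(
    "data/manifests/train_utterances_fisher-swbd_cuts.jsonl.gz"
)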
# TODO: optional stage 5, compute features
if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
log "Stage 6: Dump transcripts for LM training"
mkdir -p data/lm
@@ -154,18 +173,6 @@ if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
> data/lm/transcript_words.txt
fi
#if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
# log "Stage 3: Compute fbank for librispeech"
# mkdir -p data/fbank
# ./local/compute_fbank_librispeech.py
#fi
#
#if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
# log "Stage 4: Compute fbank for musan"
# mkdir -p data/fbank
# ./local/compute_fbank_musan.py
#fi
if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
log "Stage 7: Prepare lexicon using g2p_en"
lang_dir=data/lang_phone
@@ -186,6 +193,12 @@ if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
| awk '{print $0,NR+2}' \
>> $lang_dir/words.txt
# Add remaining special word symbols expected by LM scripts.
num_words=$(wc -l < $lang_dir/words.txt)
echo "<s> $((num_words))" >> $lang_dir/words.txt
echo "</s> $((num_words+1))" >> $lang_dir/words.txt
echo "#0 $((num_words+2))" >> $lang_dir/words.txt
if [ ! -f $lang_dir/L_disambig.pt ]; then
pip install g2p_en
./local/prepare_lang_g2pen.py --lang-dir $lang_dir