Missing steps in prepare.sh

2025-12-11 06:55:27 +00:00 · 2022-01-15 01:27:31 +00:00 · 2022-01-15 01:27:31 +00:00 · a7666d864c
commit a7666d864c
parent ccab93e8e2
2 changed files with 68 additions and 38 deletions
--- a/egs/fisher_swbd/ASR/local/normalize_and_filter_supervisions.py
+++ b/egs/fisher_swbd/ASR/local/normalize_and_filter_supervisions.py
@ -17,15 +17,34 @@ def get_args():
    return parser.parse_args()
-# Note: the functions "normalize" and "keep" implement the logic similar to
+class FisherSwbdNormalizer:
-# Kaldi's data prep scripts for Fisher:
+    """
-#   https://github.com/kaldi-asr/kaldi/blob/master/egs/fisher_swbd/s5/local/fisher_data_prep.sh
+    Note: the functions "normalize" and "keep" implement the logic similar to
-# and for SWBD:
+    Kaldi's data prep scripts for Fisher:
-#   https://github.com/kaldi-asr/kaldi/blob/master/egs/fisher_swbd/s5/local/swbd1_data_prep.sh
+      https://github.com/kaldi-asr/kaldi/blob/master/egs/fisher_swbd/s5/local/fisher_data_prep.sh
    and for SWBD:
      https://github.com/kaldi-asr/kaldi/blob/master/egs/fisher_swbd/s5/local/swbd1_data_prep.sh
    One notable difference is that we don't change [cough], [lipsmack], etc. to [noise].
    We also don't implement all the edge cases of Kaldi's normalization
    (hopefully this won't make much of a difference).
    """
 class Normalizer:
    def __init__(self) -> None:
        self.remove_regexp_before = re.compile(
            r"|".join([
                # special symbols
                r"\[\[SKIP.*\]\]",
                r"\[SKIP.*\]",
                r"\[PAUSE.*\]",
                r"\[SILENCE\]",
                r"<B_ASIDE>",
                r"<E_ASIDE>",
            ])
        )
        # tuples of (pattern, replacement)
        # note: Kaldi replaces sighs, coughs, etc with [noise].
        #       We don't do that here.
@ -63,19 +82,12 @@ class Normalizer:
            (re.compile(r"\s-\s"), r" "),
            (re.compile(r"\s-\s"), r" "),
            # special symbol with trailing dash
-            (re.compile(r"(\[\w+\])-"), r"\1"),
+            (re.compile(r"(\[.*?\])-"), r"\1"),
        ]
        # unwanted symbols in the transcripts
-        self.remove_regexp = re.compile(
+        self.remove_regexp_after = re.compile(
            r"|".join([
                # special symbols
                r"\[\[SKIP.*\]\]",
                r"\[SKIP.*\]",
                r"\[PAUSE.*\]",
                r"\[SILENCE\]",
                r"<B_ASIDE>",
                r"<E_ASIDE>",
                # remaining punctuation
                r"\.",
                r",",
@ -92,12 +104,15 @@ class Normalizer:
    def normalize(self, text: str) -> str:
        text = text.upper()
-        # first replace
+        # first remove
        text = self.remove_regexp_before.sub("", text)
        # then replace
        for pattern, sub in self.replace_regexps:
            text = pattern.sub(sub, text)
        # then remove
-        text = self.remove_regexp.sub("", text)
+        text = self.remove_regexp_after.sub("", text)
        # then clean up whitespace
        text = self.whitespace_regexp.sub(" ", text).strip()
@ -159,6 +174,8 @@ def test():
        "-[ADV]AN[TAGE]",
        "-[ADV]AN[TAGE]-",
        "[WEA[SONABLE]-/REASONABLE]",
        "[VOCALIZED-NOISE]-",
        "~BULL",
    ]:
        print(text)
        print(normalizer.normalize(text))
--- a/egs/fisher_swbd/ASR/prepare.sh
+++ b/egs/fisher_swbd/ASR/prepare.sh
@ -56,12 +56,6 @@ log() {
 log "dl_dir: $dl_dir"
 if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
  log "Stage -1: Download LM"
  #[ ! -e $dl_dir/lm ] && mkdir -p $dl_dir/lm
  #./local/download_lm.py --out-dir=$dl_dir/lm
 fi
 if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
  log "Stage 0: Download data"
@ -116,35 +110,60 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
  set -x
  # Combine Fisher and SWBD recordings and supervisions
  lhotse combine \
   data/manifests/fisher/recordings.jsonl.gz \
   data/manifests/swbd/swbd_recordings.jsonl \
   data/manifests/fisher-swbd_recordings.jsonl.gz
  lhotse combine \
   data/manifests/fisher/supervisions.jsonl.gz \
   data/manifests/swbd/swbd_supervisions.jsonl \
   data/manifests/fisher-swbd_supervisions.jsonl.gz
  # Normalize text and remove supervisions that are not useful / hard to handle.
  # The "_norm" manifest written here is the one consumed by the
  # `lhotse cut simple` step that follows.
  python local/normalize_and_filter_supervisions.py \
    data/manifests/fisher-swbd_supervisions.jsonl.gz \
    data/manifests/fisher-swbd_supervisions_norm.jsonl.gz
  # Create cuts that span whole recording sessions.
  lhotse cut simple \
    -r data/manifests/fisher-swbd_recordings.jsonl.gz \
    -s data/manifests/fisher-swbd_supervisions_norm.jsonl.gz \
    data/manifests/fisher-swbd_cuts_unshuf.jsonl.gz
  # Shuffle the cuts (pure bash pipes are fast).
  # We could technically skip this step but this helps ensure
  # SWBD is not only seen towards the end of training.
  gunzip -c data/manifests/fisher-swbd_cuts_unshuf.jsonl.gz \
    | shuf \
    | gzip -c \
    > data/manifests/fisher-swbd_cuts.jsonl.gz
  # Create train/dev split -- 20 sessions for dev is about ~2h, should be good.
  num_cuts="$(gunzip -c data/manifests/fisher-swbd_cuts.jsonl.gz | wc -l)"
  num_dev_sessions=20
  lhotse subset --first $num_dev_sessions \
    data/manifests/fisher-swbd_cuts.jsonl.gz \
    data/manifests/dev_fisher-swbd_cuts.jsonl.gz
  lhotse subset --last $((num_cuts-num_dev_sessions)) \
    data/manifests/fisher-swbd_cuts.jsonl.gz \
    data/manifests/train_fisher-swbd_cuts.jsonl.gz
  # Finally, split the full-session cuts into one cut per supervision segment.
  # In case any segments are overlapping we would discard the info about overlaps.
  # (overlaps are unlikely for this dataset because each cut sees only one channel).
  lhotse cut trim-to-supervisions \
    --discard-overlapping \
    data/manifests/train_fisher-swbd_cuts.jsonl.gz \
    data/manifests/train_utterances_fisher-swbd_cuts.jsonl.gz
  lhotse cut trim-to-supervisions \
    --discard-overlapping \
    data/manifests/dev_fisher-swbd_cuts.jsonl.gz \
    data/manifests/dev_utterances_fisher-swbd_cuts.jsonl.gz
  set +x
 fi
 # TODO: optional stage 5, compute features
 if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
  log "Stage 6: Dump transcripts for LM training"
  mkdir -p data/lm
@ -154,18 +173,6 @@ if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
    > data/lm/transcript_words.txt
 fi
 #if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
 #  log "Stage 3: Compute fbank for librispeech"
 #  mkdir -p data/fbank
 #  ./local/compute_fbank_librispeech.py
 #fi
 #
 #if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
 #  log "Stage 4: Compute fbank for musan"
 #  mkdir -p data/fbank
 #  ./local/compute_fbank_musan.py
 #fi
 if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
  log "Stage 7: Prepare lexicon using g2p_en"
  lang_dir=data/lang_phone
@ -186,6 +193,12 @@ if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
    | awk '{print $0,NR+2}' \
    >> $lang_dir/words.txt
  # Add remaining special word symbols expected by LM scripts.
  # Feed the file through stdin so `wc -l` prints only the line count; with a
  # file argument it prints "<count> <path>", which is not a valid operand for
  # the arithmetic expansions below.
  # NOTE(review): assumes the ids already in words.txt are 0-based and
  # contiguous, so the line count is the next free id -- confirm.
  num_words=$(wc -l < "$lang_dir/words.txt")
  # Append (not just print) the symbols so they actually end up in words.txt.
  {
    echo "<s> $((num_words))"
    echo "</s> $((num_words + 1))"
    echo "#0 $((num_words + 2))"
  } >> "$lang_dir/words.txt"
  if [ ! -f $lang_dir/L_disambig.pt ]; then
    pip install g2p_en
    ./local/prepare_lang_g2pen.py --lang-dir $lang_dir