minor updates

This commit is contained in:
JinZr 2023-08-18 11:19:08 +08:00
parent 58d9088010
commit ab07e58613
5 changed files with 131 additions and 75 deletions

View File

@@ -53,7 +53,6 @@ def insert_groups_and_capitalize_1s(match):
     return f"{match.group(1)}".upper() + "'s"
-# fmt: off
 class FisherSwbdNormalizer:
     """Note: the functions "normalize" and "keep" implement the logic
     similar to Kaldi's data prep scripts for Fisher and SWBD: One
@@ -62,18 +61,21 @@ class FisherSwbdNormalizer:
     normalization from Kaldi (hopefully won't make too much
     difference).
     """
-    def __init__(self) -> None:
+    def __init__(self) -> None:
         self.remove_regexp_before = re.compile(
-            r"|".join([
-                # special symbols
-                r"\[\[skip.*\]\]",
-                r"\[skip.*\]",
-                r"\[pause.*\]",
-                r"\[silence\]",
-                r"<b_aside>",
-                r"<e_aside>",
-            ])
+            r"|".join(
+                [
+                    # special symbols
+                    r"\[\[skip.*\]\]",
+                    r"\[skip.*\]",
+                    r"\[pause.*\]",
+                    r"\[silence\]",
+                    r"<b_aside>",
+                    r"<e_aside>",
+                    r"_1",
+                ]
+            )
         )
         # tuples of (pattern, replacement)
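The widened pre-filter now also strips the literal "_1" suffix that appears on some SWBD tokens (e.g. "okay_1"). A quick standalone sanity check of the alternation above (the sample string is an assumption; leftover double spaces are collapsed later by whitespace_regexp):

    import re

    remove_before = re.compile(
        r"|".join(
            [
                r"\[\[skip.*\]\]",
                r"\[skip.*\]",
                r"\[pause.*\]",
                r"\[silence\]",
                r"<b_aside>",
                r"<e_aside>",
                r"_1",
            ]
        )
    )

    # "_1" and "[silence]" are deleted; bracketed noise tags like "[noise]" survive.
    print(remove_before.sub("", "[noise] okay_1 [silence] um"))  # -> "[noise] okay  um"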
@@ -102,14 +104,54 @@ class FisherSwbdNormalizer:
             (re.compile(r"\[lipsmack.*?\]"), r"[lipsmack]"),
             (re.compile(r"\[sneeze.*?\]"), r"[sneeze]"),
             # abbreviations
-            (re.compile(r"(\w)\.(\w)\.(\w)",), insert_groups_and_capitalize_3),
-            (re.compile(r"(\w)\.(\w)",), insert_groups_and_capitalize_2),
-            (re.compile(r"([a-h,j-z])\.",), insert_groups_and_capitalize_1),
-            (re.compile(r"\._",), r" "),
-            (re.compile(r"_(\w)",), insert_groups_and_capitalize_1),
-            (re.compile(r"(\w)\.s",), insert_groups_and_capitalize_1s),
-            (re.compile(r"([A-Z])\'s",), insert_groups_and_capitalize_1s),
-            (re.compile(r"(\s\w\b|^\w\b)",), insert_groups_and_capitalize_1),
+            (
+                re.compile(
+                    r"(\w)\.(\w)\.(\w)",
+                ),
+                insert_groups_and_capitalize_3,
+            ),
+            (
+                re.compile(
+                    r"(\w)\.(\w)",
+                ),
+                insert_groups_and_capitalize_2,
+            ),
+            (
+                re.compile(
+                    r"([a-h,j-z])\.",
+                ),
+                insert_groups_and_capitalize_1,
+            ),
+            (
+                re.compile(
+                    r"\._",
+                ),
+                r" ",
+            ),
+            (
+                re.compile(
+                    r"_(\w)",
+                ),
+                insert_groups_and_capitalize_1,
+            ),
+            (
+                re.compile(
+                    r"(\w)\.s",
+                ),
+                insert_groups_and_capitalize_1s,
+            ),
+            (
+                re.compile(
+                    r"([A-Z])\'s",
+                ),
+                insert_groups_and_capitalize_1s,
+            ),
+            (
+                re.compile(
+                    r"(\s\w\b|^\w\b)",
+                ),
+                insert_groups_and_capitalize_1,
+            ),
             # words between apostrophes
             (re.compile(r"'(\S*?)'"), r"\1"),
             # dangling dashes (2 passes)
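The reflowed abbreviation rules all dispatch to small callbacks. Only insert_groups_and_capitalize_1s is visible at the top of this diff; by analogy with it, the other helpers presumably join the captured groups with spaces and uppercase them (the bodies below are assumptions, not code from this commit):

    import re

    def insert_groups_and_capitalize_3(match):
        return f"{match.group(1)} {match.group(2)} {match.group(3)}".upper()

    def insert_groups_and_capitalize_2(match):
        return f"{match.group(1)} {match.group(2)}".upper()

    def insert_groups_and_capitalize_1(match):
        return f"{match.group(1)}".upper()

    # Three-letter abbreviations become spaced capitals:
    print(re.compile(r"(\w)\.(\w)\.(\w)").sub(insert_groups_and_capitalize_3, "u.s.a"))  # "U S A"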
@@ -119,25 +161,29 @@ class FisherSwbdNormalizer:
             (re.compile(r"(\[.*?\])-"), r"\1"),
             # Just remove all dashes
             (re.compile(r"-"), r" "),
-            # Fix an issue related to [vocalized-noise]
-            (re.compile(r"\[vocalized noise\]"), r"\[vocalized-noise\]"),
         ]
         # unwanted symbols in the transcripts
         self.remove_regexp_after = re.compile(
-            r"|".join([
-                # remaining punctuation
-                r"\.",
-                r",",
-                r"\?",
-                r"{",
-                r"}",
-                r"~",
-                r"_\d",
-            ])
+            r"|".join(
+                [
+                    # remaining punctuation
+                    r"\.",
+                    r",",
+                    r"\?",
+                    r"{",
+                    r"}",
+                    r"~",
+                    r"_\d",
+                ]
+            )
         )
+        self.post_fixes = [
+            # Fix an issue related to [VOCALIZED NOISE] after dash removal
+            (re.compile(r"\[vocalized noise\]"), "[vocalized-noise]"),
+        ]
         self.whitespace_regexp = re.compile(r"\s+")

     def normalize(self, text: str) -> str:
@@ -153,11 +199,14 @@ class FisherSwbdNormalizer:
         # then remove
         text = self.remove_regexp_after.sub("", text)
+        # post fixes
+        for pattern, sub in self.post_fixes:
+            text = pattern.sub(sub, text)
         # then clean up whitespace
         text = self.whitespace_regexp.sub(" ", text).strip()
         return text.upper()
-# fmt: on
 def keep(sup: SupervisionSegment) -> bool:
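Moving the [vocalized noise] repair into self.post_fixes matters because the blanket dash removal lives in the same replacement list that used to contain it: once r"-" -> r" " has run, the tag reads "[vocalized noise]", and it can only be restored afterwards. The new replacement string also drops the escaped backslashes of the old one, which re.sub would have emitted literally. A minimal sketch of the ordering:

    import re

    dash_removal = (re.compile(r"-"), r" ")
    post_fix = (re.compile(r"\[vocalized noise\]"), "[vocalized-noise]")

    text = "[vocalized-noise]- yeah"
    text = dash_removal[0].sub(dash_removal[1], text)  # "[vocalized noise]  yeah"
    text = post_fix[0].sub(post_fix[1], text)          # "[vocalized-noise]  yeah"
    print(text.upper())                                # "[VOCALIZED-NOISE]  YEAH"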
@@ -186,7 +235,7 @@ def main():
             skip += 1
             continue
-        sup.text = normalizer.normalize(sup.text)
+        sup.text = normalizer.normalize(sup.text).upper()
         if not sup.text:
             skip += 1
             continue
@@ -219,8 +268,9 @@ def test():
         "[VOCALIZED-NOISE]-",
         "~BULL",
         "Frank E Peretti P E R E T T I",
-        "yeah yeah like Double O Seven hes supposed to do it",
+        "yeah yeah like Double O Seven he's supposed to do it",
         "P A P E R paper",
+        "[noise] okay_1 um let me see [laughter] i've been sitting here awhile",
     ]:
         print(text)
         print(normalizer.normalize(text))
@@ -228,5 +278,6 @@ def test():
 if __name__ == "__main__":
     # test(); exit()
     test()
+    # exit()
     main()
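Given the rules above, the newly added test sentence exercises both the "_1" pre-filter and the tag handling; a rough expectation (assumed, not output captured from this commit):

    normalizer = FisherSwbdNormalizer()
    print(normalizer.normalize(
        "[noise] okay_1 um let me see [laughter] i've been sitting here awhile"
    ))
    # expected: [NOISE] OKAY UM LET ME SEE [LAUGHTER] I'VE BEEN SITTING HERE AWHILE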

View File

@@ -83,9 +83,7 @@ def replace_silphone(text: str) -> str:
     text = text.replace("[[HIGH PITCHED SQUEAKY VOICE]]", " ")
     text = text.replace("[[IN THE LAUGH]]", "[LAUGHTER]")
     text = text.replace("[[LAST WORD SPOKEN WITH A LAUGH]]", "[LAUGHTER]")
-    text = text.replace(
-        "[[PART OF FIRST SYLLABLE OF PREVIOUS WORD CUT OFF]]", " "
-    )
+    text = text.replace("[[PART OF FIRST SYLLABLE OF PREVIOUS WORD CUT OFF]]", " ")
     text = text.replace("[[PREVIOUS WORD SPOKEN WITH A LAUGH]]", " ")
     text = text.replace("[[PREVIOUS TWO WORDS SPOKEN WHILE LAUGHING]]", " ")
     text = text.replace("[[PROLONGED]]", " ")
@@ -181,6 +179,7 @@ def replace_silphone(text: str) -> str:
     text = text.replace("[LAUGHTER]", " ")
     text = text.replace("[NOISE]", " ")
     text = text.replace("[VOCALIZED-NOISE]", " ")
+    text = text.replace("-", " ")
     return text
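The new blanket dash removal is appended after all bracketed-tag replacements, so compound tags such as [VOCALIZED-NOISE] are blanked whole before their inner dash could be orphaned, while hyphenated words split into separate tokens. A condensed sketch of that tail of the function (the name replace_silphone_tail is hypothetical):

    def replace_silphone_tail(text: str) -> str:
        # tags first, then the blanket dash removal added in this commit
        text = text.replace("[VOCALIZED-NOISE]", " ")
        text = text.replace("-", " ")
        return text

    print(replace_silphone_tail("UH [VOCALIZED-NOISE] FORTY-FIVE"))  # "UH   FORTY FIVE"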
@@ -231,4 +230,4 @@ def main():
 if __name__ == "__main__":
-    main()
+    main()

View File

@@ -210,15 +210,15 @@ def main():
     excluded = [
         "<eps>",
-        "!sil",
-        "<spoken_noise>",
+        "!SIL",
+        "<SPOKEN_NOISE>",
         args.oov,
         "#0",
         "<s>",
         "</s>",
-        "[vocalized-noise]",
-        "[noise]",
-        "[laughter]",
+        "[VOCALIZED-NOISE]",
+        "[NOISE]",
+        "[LAUGHTER]",
     ]
     for w in excluded:
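The excluded entries are uppercased here to stay in step with the now-uppercase lexicon and transcripts; the membership test that consumes this list (outside the hunk) is case-sensitive, so mismatched case would silently re-admit the special symbols. A toy illustration (args.oov is assumed to be <UNK>):

    excluded = ["<eps>", "!SIL", "<SPOKEN_NOISE>", "<UNK>", "#0", "<s>", "</s>",
                "[VOCALIZED-NOISE]", "[NOISE]", "[LAUGHTER]"]

    words = ["[NOISE]", "A", "AARDVARK", "<s>"]
    print([w for w in words if w not in excluded])  # ['A', 'AARDVARK']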

View File

@@ -46,11 +46,11 @@ cp local/MSU_single_letter.txt $dir/
 # The original swbd lexicon does not have a precise single-letter lexicon
 # e.g. it does not have an entry for W
 (
-  echo '!sil sil'
-  echo '[vocalized-noise] spn'
-  echo '[noise] nsn'
-  echo '[laughter] lau'
-  echo '<unk> spn'
+  echo '!SIL SIL'
+  echo '[VOCALIZED-NOISE] spn'
+  echo '[NOISE] nsn'
+  echo '[LAUGHTER] lau'
+  echo '<UNK> spn'
 ) |
   cat - $dir/lexicon1.txt $dir/MSU_single_letter.txt >$dir/lexicon2.txt || exit 1
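The pipeline prepends the special entries to lexicon1.txt and the MSU single-letter list to form lexicon2.txt. A Python equivalent for readers less at home with the echo/cat idiom (file names are the script's $dir-relative names and otherwise assumptions):

    special = [
        ("!SIL", "SIL"),
        ("[VOCALIZED-NOISE]", "spn"),
        ("[NOISE]", "nsn"),
        ("[LAUGHTER]", "lau"),
        ("<UNK>", "spn"),
    ]
    with open("lexicon2.txt", "w") as out:
        for word, phones in special:
            print(word, phones, file=out)
        for path in ("lexicon1.txt", "MSU_single_letter.txt"):
            with open(path) as f:
                out.write(f.read())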

View File

@@ -43,9 +43,9 @@ fisher_dir="/export/corpora3/LDC/LDC2004T19"
 # It will generate data/lang_bpe_xxx,
 # data/lang_bpe_yyy if the array contains xxx, yyy
 vocab_sizes=(
-  5000
-  2000
-  1000
+  # 5000
+  # 2000
+  # 1000
   500
 )
@@ -73,6 +73,7 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
   ./local/normalize_and_filter_supervisions.py \
     data/manifests/swbd/swbd_supervisions_all.jsonl.gz \
     data/manifests/swbd/swbd_supervisions_all_norm.jsonl.gz
+  mv data/manifests/swbd/swbd_supervisions_all.jsonl.gz data/manifests/swbd/swbd_supervisions_orig.jsonl.gz
   mv data/manifests/swbd/swbd_supervisions_all_norm.jsonl.gz data/manifests/swbd/swbd_supervisions_all.jsonl.gz

   lhotse prepare eval2000 --absolute-paths 1 $eval2000_dir data/manifests/eval2000
@@ -149,8 +150,8 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
       | jq '.text' | sed 's/"//g' > $lang_dir/text
   fi
-  log "prepare dict"
-  ./local/swbd1_prepare_dict.sh $swbd1_dir
+  log "Prepare dict"
+  ./local/swbd1_prepare_dict.sh
   cut -f 2- -d" " $lang_dir/text >${lang_dir}/input.txt
   # [noise] nsn
   # !sil sil
@@ -336,6 +337,10 @@ if [ $stage -le 11 ] && [ $stop_stage -ge 11 ]; then
     out_dir=data/lm_training_bpe_${vocab_size}
     mkdir -p $out_dir
+    if [ ! -f $out_dir/train.txt ]; then
+      tail -n 250000 data/lang_phone/input.txt > $out_dir/train.txt
+    fi
     ./local/prepare_lm_training_data.py \
       --bpe-model $lang_dir/bpe.model \
       --lm-data data/lang_phone/input.txt \
@@ -343,29 +348,29 @@ if [ $stage -le 11 ] && [ $stop_stage -ge 11 ]; then
   done
 fi
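Stage 11 now materializes a train.txt holding the last 250000 lines of the normalized text. In Python terms (a sketch under the assumption that input.txt holds one utterance per line):

    with open("data/lang_phone/input.txt") as f:
        lines = f.readlines()

    with open("train.txt", "w") as f:
        f.writelines(lines[-250000:])  # tail -n 250000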
-# if [ $stage -le 12 ] && [ $stop_stage -ge 12 ]; then
-#   log "Stage 12: Generate LM validation data"
+if [ $stage -le 12 ] && [ $stop_stage -ge 12 ]; then
+  log "Stage 12: Generate LM validation data"
-#   for vocab_size in ${vocab_sizes[@]}; do
-#     log "Processing vocab_size == ${vocab_size}"
-#     out_dir=data/lm_training_bpe_${vocab_size}
-#     mkdir -p $out_dir
+  for vocab_size in ${vocab_sizes[@]}; do
+    log "Processing vocab_size == ${vocab_size}"
+    out_dir=data/lm_training_bpe_${vocab_size}
+    mkdir -p $out_dir
-#     if [ ! -f $out_dir/valid.txt ]; then
-#       TODO: generate valid.txt
-#     fi
+    if [ ! -f $out_dir/valid.txt ]; then
+      head -n 14332 data/lang_phone/input.txt > $out_dir/valid.txt
+    fi
-#     lang_dir=data/lang_bpe_${vocab_size}
-#     ./local/prepare_lm_training_data.py \
-#       --bpe-model $lang_dir/bpe.model \
-#       --lm-data $out_dir/valid.txt \
-#       --lm-archive $out_dir/lm_data-valid.pt
-#   done
-# fi
+    lang_dir=data/lang_bpe_${vocab_size}
+    ./local/prepare_lm_training_data.py \
+      --bpe-model $lang_dir/bpe.model \
+      --lm-data $out_dir/valid.txt \
+      --lm-archive $out_dir/lm_data-valid.pt
+  done
+fi
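The previously stubbed Stage 12 is enabled: the first 14332 lines of the same input.txt become valid.txt, complementing the tail-based train split above. Provided input.txt has more than 14332 + 250000 lines, the two slices are disjoint. A sketch:

    from itertools import islice

    with open("data/lang_phone/input.txt") as f:
        valid = list(islice(f, 14332))  # head -n 14332

    with open("valid.txt", "w") as f:
        f.writelines(valid)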
 if [ $stage -le 13 ] && [ $stop_stage -ge 13 ]; then
   log "Stage 13: Generate LM test data"
-  testsets=(eval2000 rt03)
+  testsets=(eval2000)

   for testset in ${testsets[@]}; do
     for vocab_size in ${vocab_sizes[@]}; do
@@ -373,8 +378,9 @@ if [ $stage -le 13 ] && [ $stop_stage -ge 13 ]; then
       out_dir=data/lm_training_bpe_${vocab_size}
       mkdir -p $out_dir
-      if [ ! -f $out_dir/test.txt ]; then
-        cat data/local/${testset}/text | cut -d " " -f 2- >$out_dir/${testset}.txt
+      if [ ! -f $out_dir/${testset}.txt ]; then
+        gunzip -c data/manifests/${testset}/eval2000_supervisions_all.jsonl.gz \
+          | jq '.text' | sed 's/"//g' > $out_dir/${testset}.txt
       fi

       lang_dir=data/lang_bpe_${vocab_size}
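Stage 13 now pulls test text straight from the Lhotse supervision manifest instead of the Kaldi-style data/local/${testset}/text. The gunzip | jq '.text' | sed pipeline is the shell spelling of "read JSONL, take the text field, strip quotes"; in Python, json.loads does the unquoting in one step (paths mirror the script and are otherwise assumptions):

    import gzip
    import json

    path = "data/manifests/eval2000/eval2000_supervisions_all.jsonl.gz"
    with gzip.open(path, "rt") as f:
        texts = [json.loads(line)["text"] for line in f]

    with open("eval2000.txt", "w") as f:
        f.write("\n".join(texts) + "\n")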
@@ -388,7 +394,7 @@ fi
 if [ $stage -le 14 ] && [ $stop_stage -ge 14 ]; then
   log "Stage 14: Sort LM training data"
-  testsets=(eval2000 rt03)
+  testsets=(eval2000)

   # Sort LM training data by sentence length in descending order
   # for ease of training.
   #