minor updates

2025-12-11 06:55:27 +00:00 · 2023-08-01 17:24:50 +08:00 · 2023-08-01 17:24:50 +08:00 · 099e789ba0
commit 099e789ba0
parent 11fe0004f4
3 changed files with 236 additions and 3 deletions
--- a/egs/swbd/ASR/local/normalize_and_filter_supervisions.py
+++ b/egs/swbd/ASR/local/normalize_and_filter_supervisions.py
@ -0,0 +1,228 @@
+#!/usr/bin/env python3
+# Copyright    2023      (authors: Nagendra Goel https://github.com/ngoel17)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import re
+from typing import Callable, List, Tuple, Union
+
+from lhotse import SupervisionSegment, SupervisionSet
+from lhotse.serialization import load_manifest_lazy_or_eager
+from tqdm import tqdm
+
+
+def get_args():
+    """Parse the command line.
+
+    Two positional arguments are accepted, in this order:
+    ``input_sups`` and ``output_sups``.
+
+    Returns:
+        The ``argparse.Namespace`` holding both values.
+    """
+    arg_parser = argparse.ArgumentParser()
+    for positional in ("input_sups", "output_sups"):
+        arg_parser.add_argument(positional)
+    return arg_parser.parse_args()
+
+
+# replacement callback for re.sub: upper-cases the whole matched text
+def to_upper(match_obj):
+    """Return the full match of ``match_obj`` in upper case.
+
+    Returns ``None`` if the match object reports no matched text.
+    """
+    matched_text = match_obj.group()
+    if matched_text is None:
+        return None
+    return matched_text.upper()
+
+
+def insert_groups_and_capitalize_3(match):
+    """Join capture groups 1-3 with single spaces and upper-case the result."""
+    pieces = (str(group) for group in match.group(1, 2, 3))
+    return " ".join(pieces).upper()
+
+
+def insert_groups_and_capitalize_2(match):
+    """Join capture groups 1 and 2 with a single space and upper-case the result."""
+    pieces = (str(group) for group in match.group(1, 2))
+    return " ".join(pieces).upper()
+
+
+def insert_groups_and_capitalize_1(match):
+    """Return capture group 1 in upper case (the rest of the match is dropped)."""
+    captured = str(match.group(1))
+    return captured.upper()
+
+
+def insert_groups_and_capitalize_1s(match):
+    """Return capture group 1 in upper case followed by a literal ``'s``."""
+    captured = str(match.group(1)).upper()
+    return "{}'s".format(captured)
+
+
+# fmt: off
+class FisherSwbdNormalizer:
+    """Regex-based transcript normalizer for Fisher / Switchboard text.
+
+    Note: the functions "normalize" and "keep" implement logic similar
+    to Kaldi's data prep scripts for Fisher and SWBD. One notable
+    difference is that we don't change [cough], [lipsmack], etc. to
+    [noise].  We also don't implement all the edge cases of
+    normalization from Kaldi (hopefully won't make too much
+    difference).
+
+    ``normalize`` applies, in order: lowercasing, removal of the
+    ``remove_regexp_before`` symbols, every ``replace_regexps``
+    substitution, removal of the ``remove_regexp_after`` symbols, and
+    whitespace collapsing.
+    """
+    def __init__(self) -> None:
+        """Compile all patterns once so ``normalize`` can reuse them."""
+        # Markers deleted outright before any other rewriting.  The
+        # wildcards are non-greedy so that a marker cannot swallow the
+        # words (and other bracketed tokens) that follow it in the
+        # same utterance.
+        self.remove_regexp_before = re.compile(
+            r"|".join([
+                # special symbols
+                r"\[\[skip.*?\]\]",
+                r"\[skip.*?\]",
+                r"\[pause.*?\]",
+                r"\[silence\]",
+                r"<b_aside>",
+                r"<e_aside>",
+            ])
+        )
+
+        # (pattern, replacement) pairs, applied in this exact order; the
+        # replacement is either a template string or a callable taking
+        # the match object.
+        # note: Kaldi replaces sighs, coughs, etc with [noise].
+        #       We don't do that here.
+        #       The text is lowercased before any of these run, so the
+        #       patterns are written in lowercase (the upper-case
+        #       examples below show the raw transcript form).
+        self.replace_regexps: List[
+            Tuple[re.Pattern, Union[str, Callable[[re.Match], str]]]
+        ] = [
+            # SWBD:
+            # [LAUGHTER-STORY] -> STORY
+            (re.compile(r"\[laughter-(.*?)\]"), r"\1"),
+            # [WEA[SONABLE]-/REASONABLE]
+            (re.compile(r"\[\S+/(\S+)\]"), r"\1"),
+            # -[ADV]AN[TAGE]- -> AN
+            (re.compile(r"-?\[.*?\](\w+)\[.*?\]-?"), r"\1-"),
+            # ABSOLUTE[LY]- -> ABSOLUTE-
+            (re.compile(r"(\w+)\[.*?\]-?"), r"\1-"),
+            # [AN]Y- -> Y-
+            # -[AN]Y- -> Y-
+            (re.compile(r"-?\[.*?\](\w+)-?"), r"\1-"),
+            # special tokens
+            (re.compile(r"\[laugh.*?\]"), r"[laughter]"),
+            (re.compile(r"\[sigh.*?\]"), r"[sigh]"),
+            (re.compile(r"\[cough.*?\]"), r"[cough]"),
+            (re.compile(r"\[mn.*?\]"), r"[vocalized-noise]"),
+            (re.compile(r"\[breath.*?\]"), r"[breath]"),
+            (re.compile(r"\[lipsmack.*?\]"), r"[lipsmack]"),
+            (re.compile(r"\[sneeze.*?\]"), r"[sneeze]"),
+            # abbreviations
+            (re.compile(r"(\w)\.(\w)\.(\w)"), insert_groups_and_capitalize_3),
+            (re.compile(r"(\w)\.(\w)"), insert_groups_and_capitalize_2),
+            # any single letter except "i" followed by a dot; inside a
+            # character class a comma is a literal, so none is used here
+            (re.compile(r"([a-hj-z])\."), insert_groups_and_capitalize_1),
+            (re.compile(r"\._"), r" "),
+            (re.compile(r"_(\w)"), insert_groups_and_capitalize_1),
+            (re.compile(r"(\w)\.s"), insert_groups_and_capitalize_1s),
+            (re.compile(r"([A-Z])\'s"), insert_groups_and_capitalize_1s),
+            # stand-alone single-character words are upper-cased
+            (re.compile(r"(\s\w\b|^\w\b)"), insert_groups_and_capitalize_1),
+            # words between apostrophes
+            (re.compile(r"'(\S*?)'"), r"\1"),
+            # dangling dashes (2 passes)
+            (re.compile(r"\s-\s"), r" "),
+            (re.compile(r"\s-\s"), r" "),
+            # special symbol with trailing dash
+            (re.compile(r"(\[.*?\])-"), r"\1"),
+            # Just remove all dashes
+            (re.compile(r"-"), r" "),
+        ]
+
+        # unwanted symbols in the transcripts
+        self.remove_regexp_after = re.compile(
+            r"|".join([
+                # remaining punctuation
+                r"\.",
+                r",",
+                r"\?",
+                r"{",
+                r"}",
+                r"~",
+                r"_\d",
+            ])
+        )
+
+        self.whitespace_regexp = re.compile(r"\s+")
+
+    def normalize(self, text: str) -> str:
+        """Return the normalized form of ``text``.
+
+        Args:
+            text: a raw transcript string.
+
+        Returns:
+            The normalized transcript with whitespace runs collapsed to
+            single spaces and no leading/trailing whitespace.  The
+            result may be an empty string.
+        """
+        text = text.lower()
+
+        # first remove
+        text = self.remove_regexp_before.sub("", text)
+
+        # then replace
+        for pattern, sub in self.replace_regexps:
+            text = pattern.sub(sub, text)
+
+        # then remove
+        text = self.remove_regexp_after.sub("", text)
+
+        # then clean up whitespace
+        text = self.whitespace_regexp.sub(" ", text).strip()
+
+        return text
+# fmt: on
+
+
+def keep(sup: SupervisionSegment) -> bool:
+    """Decide whether a supervision should be kept.
+
+    Returns:
+        ``False`` when ``sup.text`` contains ``((`` or ``<german``,
+        ``True`` otherwise.
+    """
+    rejected_markers = ("((", "<german")
+    return not any(marker in sup.text for marker in rejected_markers)
+
+
+def main():
+    """Normalize and filter the supervision manifest given on the command line.
+
+    Loads the manifest at ``input_sups``, skips supervisions rejected by
+    ``keep`` or whose text is empty after normalization, writes the
+    remaining ones (with normalized text) to ``output_sups`` and prints
+    how many supervisions were skipped.
+
+    Raises:
+        TypeError: if the loaded manifest is not a ``SupervisionSet``.
+    """
+    args = get_args()
+    sups = load_manifest_lazy_or_eager(args.input_sups)
+    # Explicit check instead of ``assert`` so that it is not stripped
+    # when Python runs with -O.
+    if not isinstance(sups, SupervisionSet):
+        raise TypeError(
+            f"Expected a SupervisionSet in {args.input_sups}, "
+            f"got {type(sups).__name__}"
+        )
+
+    normalizer = FisherSwbdNormalizer()
+
+    tot, skip = 0, 0
+    with SupervisionSet.open_writer(args.output_sups) as writer:
+        for sup in tqdm(sups, desc="Normalizing supervisions"):
+            tot += 1
+
+            if not keep(sup):
+                skip += 1
+                continue
+
+            sup.text = normalizer.normalize(sup.text)
+            if not sup.text:
+                # Nothing left to train on after normalization.
+                skip += 1
+                continue
+
+            writer.write(sup)
+
+    # Report how much data the filtering removed.
+    print(f"Skipped {skip} of {tot} supervisions.")
+
+
+def test():
+    """Print each sample transcript, its normalized form and a blank line."""
+    samples = (
+        "[laughterr] [SILENCE]",
+        "[laugh] oh this is great [silence] <B_ASIDE> yes",
+        "[laugh] oh this is [laught] this is great [silence] <B_ASIDE> yes",
+        "i don't kn- - know A.B.C's",
+        "so x. corp is good?",
+        "'absolutely yes",
+        "absolutely' yes",
+        "'absolutely' yes",
+        "'absolutely' yes 'aight",
+        "ABSOLUTE[LY]",
+        "ABSOLUTE[LY]-",
+        "[AN]Y",
+        "[AN]Y-",
+        "[ADV]AN[TAGE]",
+        "[ADV]AN[TAGE]-",
+        "-[ADV]AN[TAGE]",
+        "-[ADV]AN[TAGE]-",
+        "[WEA[SONABLE]-/REASONABLE]",
+        "[VOCALIZED-NOISE]-",
+        "~BULL",
+        "Frank E Peretti P E R E T T I",
+        "yeah yeah like Double O Seven he’s supposed to do it",
+        "P A P E R paper",
+    )
+    swbd_normalizer = FisherSwbdNormalizer()
+    for raw_text in samples:
+        print(raw_text)
+        print(swbd_normalizer.normalize(raw_text))
+        print()
+
+
+if __name__ == "__main__":
+    # Run the real pipeline: prepare.sh invokes this script with the
+    # input and output manifest paths and expects the output manifest
+    # to be written.  test() only prints sample normalizations and is
+    # meant to be called by hand.
+    main()
--- a/egs/swbd/ASR/local/normalize_eval2000.py
+++ b/egs/swbd/ASR/local/normalize_eval2000.py
@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-# Copyright    2023         (authors: Nagendra Goel https://github.com/ngoel17)
+# Copyright    2023      (authors: Nagendra Goel https://github.com/ngoel17)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
--- a/egs/swbd/ASR/prepare.sh
+++ b/egs/swbd/ASR/prepare.sh
@ -68,7 +68,12 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
    # to respective dirs
    mkdir -p data/manifests
    if [ ! -e data/manifests/.swbd.done ]; then
-        lhotse prepare switchboard --absolute-paths True $swbd1_dir data/manifests_train
+        lhotse prepare switchboard --absolute-paths 1 --omit-silence $swbd1_dir data/manifests/swbd
+        ./local/normalize_and_filter_supervisions.py \
+            data/manifests/swbd/swbd_supervisions.jsonl \
+            data/manifests/swbd/swbd_supervisions_norm.jsonl
+        cp data/manifests/swbd/swbd_recordings.jsonl data/manifests/recordings_swbd.jsonl
+        
        ./local/swbd1_prepare_dict.sh
        ./local/swbd1_data_prep.sh $swbd1_dir
        lhotse kaldi import data/local/train 8000 data/manifests_train
@ -78,7 +83,7 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
        lhotse prepare $eval2000_dir data/manifests_eval2000
        ./local/normalize_eval2000.py \
            data/manifests_eval2000/eval2000_supervisions_unnorm.jsonl.gz \
-            data/manifests_eval2000/eval2000_supervisions.jsonl.gz
+            data/manifests_eval2000/eval2000_supervisions_norm.jsonl.gz

        ./local/rt03_data_prep.sh $rt03_dir