mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-09-09 17:14:20 +00:00
minor updates
This commit is contained in:
parent
11fe0004f4
commit
099e789ba0
228
egs/swbd/ASR/local/normalize_and_filter_supervisions.py
Normal file
228
egs/swbd/ASR/local/normalize_and_filter_supervisions.py
Normal file
@ -0,0 +1,228 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# Copyright 2023 (authors: Nagendra Goel https://github.com/ngoel17)
|
||||||
|
#
|
||||||
|
# See ../../../../LICENSE for clarification regarding multiple authors
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import re
|
||||||
|
from typing import Callable, List, Tuple, Union
|
||||||
|
|
||||||
|
from lhotse import SupervisionSegment, SupervisionSet
|
||||||
|
from lhotse.serialization import load_manifest_lazy_or_eager
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
|
||||||
|
def get_args(argv=None):
    """Parse command-line arguments.

    Args:
      argv: Optional list of argument strings. Defaults to ``None``, in which
        case argparse falls back to ``sys.argv[1:]`` — so existing callers
        are unaffected; passing a list makes the parser testable.

    Returns:
      argparse.Namespace with ``input_sups`` and ``output_sups`` attributes.
    """
    parser = argparse.ArgumentParser(
        description="Normalize and filter Fisher/SWBD supervision manifests."
    )
    parser.add_argument(
        "input_sups", help="Path to the input supervision manifest."
    )
    parser.add_argument(
        "output_sups", help="Path where the normalized manifest is written."
    )
    return parser.parse_args(argv)
|
||||||
|
|
||||||
|
|
||||||
|
# replacement function to convert lowercase letter to uppercase
|
||||||
|
def to_upper(match_obj):
    """Regex substitution callback: return the whole matched text uppercased."""
    matched = match_obj.group()
    if matched is None:
        # Defensive: group 0 of a real match is never None, but mirror the
        # original implicit-None return just in case.
        return matched
    return matched.upper()
|
||||||
|
|
||||||
|
|
||||||
|
def insert_groups_and_capitalize_3(match):
    """Join capture groups 1-3 with single spaces and uppercase the result."""
    parts = (match.group(1), match.group(2), match.group(3))
    return " ".join(parts).upper()
|
||||||
|
|
||||||
|
|
||||||
|
def insert_groups_and_capitalize_2(match):
    """Join capture groups 1-2 with a single space and uppercase the result."""
    return " ".join((match.group(1), match.group(2))).upper()
|
||||||
|
|
||||||
|
|
||||||
|
def insert_groups_and_capitalize_1(match):
    """Uppercase capture group 1 (str() mirrors the original f-string coercion)."""
    return str(match.group(1)).upper()
|
||||||
|
|
||||||
|
|
||||||
|
def insert_groups_and_capitalize_1s(match):
    """Uppercase capture group 1 and append a possessive "'s"."""
    return str(match.group(1)).upper() + "'s"
|
||||||
|
|
||||||
|
|
||||||
|
# fmt: off
class FisherSwbdNormalizer:
    """Text normalizer for Fisher and Switchboard (SWBD) transcripts.

    Note: the functions "normalize" and "keep" implement the logic
    similar to Kaldi's data prep scripts for Fisher and SWBD: One
    notable difference is that we don't change [cough], [lipsmack],
    etc. to [noise]. We also don't implement all the edge cases of
    normalization from Kaldi (hopefully won't make too much
    difference).
    """
    def __init__(self) -> None:

        # Patterns deleted outright before any replacement runs.
        self.remove_regexp_before = re.compile(
            r"|".join([
                # special symbols
                r"\[\[skip.*\]\]",
                r"\[skip.*\]",
                r"\[pause.*\]",
                r"\[silence\]",
                r"<b_aside>",
                r"<e_aside>",
            ])
        )

        # tuples of (pattern, replacement)
        # note: Kaldi replaces sighs, coughs, etc with [noise].
        # We don't do that here.
        # We also lowercase the text as the first operation.
        # The replacement may be a backreference string or a callable, so the
        # annotation is Union[str, Callable] (the previous Tuple[re.Pattern, str]
        # annotation was wrong: this is a list, and many entries are functions).
        self.replace_regexps: List[Tuple[re.Pattern, Union[str, Callable]]] = [
            # SWBD:
            # [LAUGHTER-STORY] -> STORY
            (re.compile(r"\[laughter-(.*?)\]"), r"\1"),
            # [WEA[SONABLE]-/REASONABLE]
            (re.compile(r"\[\S+/(\S+)\]"), r"\1"),
            # -[ADV]AN[TAGE]- -> AN
            (re.compile(r"-?\[.*?\](\w+)\[.*?\]-?"), r"\1-"),
            # ABSOLUTE[LY]- -> ABSOLUTE-
            (re.compile(r"(\w+)\[.*?\]-?"), r"\1-"),
            # [AN]Y- -> Y-
            # -[AN]Y- -> Y-
            (re.compile(r"-?\[.*?\](\w+)-?"), r"\1-"),
            # special tokens
            (re.compile(r"\[laugh.*?\]"), r"[laughter]"),
            (re.compile(r"\[sigh.*?\]"), r"[sigh]"),
            (re.compile(r"\[cough.*?\]"), r"[cough]"),
            (re.compile(r"\[mn.*?\]"), r"[vocalized-noise]"),
            (re.compile(r"\[breath.*?\]"), r"[breath]"),
            (re.compile(r"\[lipsmack.*?\]"), r"[lipsmack]"),
            (re.compile(r"\[sneeze.*?\]"), r"[sneeze]"),
            # abbreviations
            (re.compile(r"(\w)\.(\w)\.(\w)"), insert_groups_and_capitalize_3),
            (re.compile(r"(\w)\.(\w)"), insert_groups_and_capitalize_2),
            # NOTE(review): "[a-h,j-z]" also matches a literal comma — it looks
            # like the intent was a-h plus j-z (skipping "i"); confirm before
            # changing, so the pattern is kept as-is.
            (re.compile(r"([a-h,j-z])\."), insert_groups_and_capitalize_1),
            (re.compile(r"\._"), r" "),
            (re.compile(r"_(\w)"), insert_groups_and_capitalize_1),
            (re.compile(r"(\w)\.s"), insert_groups_and_capitalize_1s),
            # NOTE(review): text is lowercased before these run, so the [A-Z]
            # pattern can never match; presumably dead — confirm and remove.
            (re.compile(r"([A-Z])\'s"), insert_groups_and_capitalize_1s),
            (re.compile(r"(\s\w\b|^\w\b)"), insert_groups_and_capitalize_1),
            # words between apostrophes
            (re.compile(r"'(\S*?)'"), r"\1"),
            # dangling dashes (2 passes)
            (re.compile(r"\s-\s"), r" "),
            (re.compile(r"\s-\s"), r" "),
            # special symbol with trailing dash
            (re.compile(r"(\[.*?\])-"), r"\1"),
            # Just remove all dashes
            (re.compile(r"-"), r" "),
        ]

        # unwanted symbols in the transcripts
        self.remove_regexp_after = re.compile(
            r"|".join([
                # remaining punctuation
                r"\.",
                r",",
                r"\?",
                r"{",
                r"}",
                r"~",
                r"_\d",
            ])
        )

        # Collapses any whitespace run to a single space.
        self.whitespace_regexp = re.compile(r"\s+")

    def normalize(self, text: str) -> str:
        """Return the normalized form of one transcript string.

        Pipeline: lowercase -> delete pre-patterns -> apply ordered
        replacements -> delete residual punctuation -> squeeze whitespace.
        The replacement order matters (e.g. three-letter abbreviations must
        be handled before two-letter ones).
        """
        text = text.lower()

        # first remove
        text = self.remove_regexp_before.sub("", text)

        # then replace
        for pattern, sub in self.replace_regexps:
            text = pattern.sub(sub, text)

        # then remove
        text = self.remove_regexp_after.sub("", text)

        # then clean up whitespace
        text = self.whitespace_regexp.sub(" ", text).strip()

        return text
# fmt: on
|
||||||
|
|
||||||
|
|
||||||
|
def keep(sup: SupervisionSegment) -> bool:
    """Decide whether a supervision segment is retained.

    Drops segments whose raw text contains "((" (unintelligible-speech
    marker) or a "<german" language tag; keeps everything else.
    """
    text = sup.text
    return "((" not in text and "<german" not in text
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """CLI entry point: normalize supervisions and write the kept ones out."""
    args = get_args()
    sups = load_manifest_lazy_or_eager(args.input_sups)
    assert isinstance(sups, SupervisionSet)

    normalizer = FisherSwbdNormalizer()

    # Counters are tracked but not reported here (kept for parity/debugging).
    tot, skip = 0, 0
    with SupervisionSet.open_writer(args.output_sups) as writer:
        for sup in tqdm(sups, desc="Normalizing supervisions"):
            tot += 1

            # Drop segments flagged as unintelligible or foreign-language.
            if not keep(sup):
                skip += 1
                continue

            sup.text = normalizer.normalize(sup.text)
            # Normalization can strip the text entirely; skip empty segments.
            if not sup.text:
                skip += 1
                continue

            writer.write(sup)
|
||||||
|
|
||||||
|
|
||||||
|
def test():
    """Print before/after pairs for tricky transcripts (manual smoke check)."""
    normalizer = FisherSwbdNormalizer()
    samples = [
        "[laughterr] [SILENCE]",
        "[laugh] oh this is great [silence] <B_ASIDE> yes",
        "[laugh] oh this is [laught] this is great [silence] <B_ASIDE> yes",
        "i don't kn- - know A.B.C's",
        "so x. corp is good?",
        "'absolutely yes",
        "absolutely' yes",
        "'absolutely' yes",
        "'absolutely' yes 'aight",
        "ABSOLUTE[LY]",
        "ABSOLUTE[LY]-",
        "[AN]Y",
        "[AN]Y-",
        "[ADV]AN[TAGE]",
        "[ADV]AN[TAGE]-",
        "-[ADV]AN[TAGE]",
        "-[ADV]AN[TAGE]-",
        "[WEA[SONABLE]-/REASONABLE]",
        "[VOCALIZED-NOISE]-",
        "~BULL",
        "Frank E Peretti P E R E T T I",
        "yeah yeah like Double O Seven he’s supposed to do it",
        "P A P E R paper",
    ]
    for sample in samples:
        print(sample)
        print(normalizer.normalize(sample))
        print()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Run the real CLI entry point: the prepare script invokes this file with
    # input/output manifest paths, which test() would silently ignore.
    # (Call test() manually for an eyeball check of the normalizer rules.)
    main()
|
@ -1,5 +1,5 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
# Copyright 2023 (authors: Nagendra Goel https://github.com/ngoel17)
|
# Copyright 2023 (authors: Nagendra Goel https://github.com/ngoel17)
|
||||||
#
|
#
|
||||||
# See ../../../../LICENSE for clarification regarding multiple authors
|
# See ../../../../LICENSE for clarification regarding multiple authors
|
||||||
#
|
#
|
||||||
|
@ -68,7 +68,12 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
|
|||||||
# to respective dirs
|
# to respective dirs
|
||||||
mkdir -p data/manifests
|
mkdir -p data/manifests
|
||||||
if [ ! -e data/manifests/.swbd.done ]; then
|
if [ ! -e data/manifests/.swbd.done ]; then
|
||||||
lhotse prepare switchboard --absolute-paths True $swbd1_dir data/manifests_train
|
lhotse prepare switchboard --absolute-paths 1 --omit-silence $swbd1_dir data/manifests/swbd
|
||||||
|
./local/normalize_and_filter_supervisions.py \
|
||||||
|
data/manifests/swbd/swbd_supervisions.jsonl \
|
||||||
|
data/manifests/swbd/swbd_supervisions_norm.jsonl
|
||||||
|
cp data/manifests/swbd/swbd_recordings.jsonl data/manifests/recordings_swbd.jsonl
|
||||||
|
|
||||||
./local/swbd1_prepare_dict.sh
|
./local/swbd1_prepare_dict.sh
|
||||||
./local/swbd1_data_prep.sh $swbd1_dir
|
./local/swbd1_data_prep.sh $swbd1_dir
|
||||||
lhotse kaldi import data/local/train 8000 data/manifests_train
|
lhotse kaldi import data/local/train 8000 data/manifests_train
|
||||||
@ -78,7 +83,7 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
|
|||||||
lhotse prepare $eval2000_dir data/manifests_eval2000
|
lhotse prepare $eval2000_dir data/manifests_eval2000
|
||||||
./local/normalize_eval2000.py \
|
./local/normalize_eval2000.py \
|
||||||
data/manifests_eval2000/eval2000_supervisions_unnorm.jsonl.gz \
|
data/manifests_eval2000/eval2000_supervisions_unnorm.jsonl.gz \
|
||||||
data/manifests_eval2000/eval2000_supervisions.jsonl.gz
|
data/manifests_eval2000/eval2000_supervisions_norm.jsonl.gz
|
||||||
|
|
||||||
./local/rt03_data_prep.sh $rt03_dir
|
./local/rt03_data_prep.sh $rt03_dir
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user