mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-09-09 17:14:20 +00:00
minor updates
This commit is contained in:
parent
11fe0004f4
commit
099e789ba0
228
egs/swbd/ASR/local/normalize_and_filter_supervisions.py
Normal file
228
egs/swbd/ASR/local/normalize_and_filter_supervisions.py
Normal file
@ -0,0 +1,228 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# Copyright 2023 (authors: Nagendra Goel https://github.com/ngoel17)
|
||||||
|
#
|
||||||
|
# See ../../../../LICENSE for clarification regarding multiple authors
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import re
|
||||||
|
from typing import Callable, List, Tuple, Union
|
||||||
|
|
||||||
|
from lhotse import SupervisionSegment, SupervisionSet
|
||||||
|
from lhotse.serialization import load_manifest_lazy_or_eager
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
|
||||||
|
def get_args(argv=None):
    """Parse command-line arguments.

    Args:
      argv: Optional list of argument strings. Defaults to ``None``, in which
        case argparse falls back to ``sys.argv[1:]`` — so existing callers
        are unaffected; passing a list makes the parser testable.

    Returns:
      argparse.Namespace with ``input_sups`` and ``output_sups`` attributes.
    """
    parser = argparse.ArgumentParser(
        description="Normalize and filter Fisher/SWBD supervision manifests."
    )
    parser.add_argument(
        "input_sups", help="Path to the input supervision manifest."
    )
    parser.add_argument(
        "output_sups", help="Path where the normalized manifest is written."
    )
    return parser.parse_args(argv)
|
||||||
|
|
||||||
|
|
||||||
|
# replacement function to convert lowercase letter to uppercase
|
||||||
|
def to_upper(match_obj):
    """Regex substitution callback: return the whole matched text uppercased."""
    matched = match_obj.group()
    if matched is None:
        # Defensive: group 0 of a real match is never None, but mirror the
        # original implicit-None return just in case.
        return matched
    return matched.upper()
|
||||||
|
|
||||||
|
|
||||||
|
def insert_groups_and_capitalize_3(match):
    """Join capture groups 1-3 with single spaces and uppercase the result."""
    parts = (match.group(1), match.group(2), match.group(3))
    return " ".join(parts).upper()
|
||||||
|
|
||||||
|
|
||||||
|
def insert_groups_and_capitalize_2(match):
    """Join capture groups 1-2 with a single space and uppercase the result."""
    return " ".join((match.group(1), match.group(2))).upper()
|
||||||
|
|
||||||
|
|
||||||
|
def insert_groups_and_capitalize_1(match):
    """Uppercase capture group 1 (str() mirrors the original f-string coercion)."""
    return str(match.group(1)).upper()
|
||||||
|
|
||||||
|
|
||||||
|
def insert_groups_and_capitalize_1s(match):
    """Uppercase capture group 1 and append a possessive "'s"."""
    return str(match.group(1)).upper() + "'s"
|
||||||
|
|
||||||
|
|
||||||
|
# fmt: off
class FisherSwbdNormalizer:
    """Text normalizer for Fisher and Switchboard (SWBD) transcripts.

    Note: the functions "normalize" and "keep" implement the logic
    similar to Kaldi's data prep scripts for Fisher and SWBD: One
    notable difference is that we don't change [cough], [lipsmack],
    etc. to [noise]. We also don't implement all the edge cases of
    normalization from Kaldi (hopefully won't make too much
    difference).
    """
    def __init__(self) -> None:

        # Patterns deleted outright before any replacement runs.
        self.remove_regexp_before = re.compile(
            r"|".join([
                # special symbols
                r"\[\[skip.*\]\]",
                r"\[skip.*\]",
                r"\[pause.*\]",
                r"\[silence\]",
                r"<b_aside>",
                r"<e_aside>",
            ])
        )

        # tuples of (pattern, replacement)
        # note: Kaldi replaces sighs, coughs, etc with [noise].
        # We don't do that here.
        # We also lowercase the text as the first operation.
        # The replacement may be a backreference string or a callable, so the
        # annotation is Union[str, Callable] (the previous Tuple[re.Pattern, str]
        # annotation was wrong: this is a list, and many entries are functions).
        self.replace_regexps: List[Tuple[re.Pattern, Union[str, Callable]]] = [
            # SWBD:
            # [LAUGHTER-STORY] -> STORY
            (re.compile(r"\[laughter-(.*?)\]"), r"\1"),
            # [WEA[SONABLE]-/REASONABLE]
            (re.compile(r"\[\S+/(\S+)\]"), r"\1"),
            # -[ADV]AN[TAGE]- -> AN
            (re.compile(r"-?\[.*?\](\w+)\[.*?\]-?"), r"\1-"),
            # ABSOLUTE[LY]- -> ABSOLUTE-
            (re.compile(r"(\w+)\[.*?\]-?"), r"\1-"),
            # [AN]Y- -> Y-
            # -[AN]Y- -> Y-
            (re.compile(r"-?\[.*?\](\w+)-?"), r"\1-"),
            # special tokens
            (re.compile(r"\[laugh.*?\]"), r"[laughter]"),
            (re.compile(r"\[sigh.*?\]"), r"[sigh]"),
            (re.compile(r"\[cough.*?\]"), r"[cough]"),
            (re.compile(r"\[mn.*?\]"), r"[vocalized-noise]"),
            (re.compile(r"\[breath.*?\]"), r"[breath]"),
            (re.compile(r"\[lipsmack.*?\]"), r"[lipsmack]"),
            (re.compile(r"\[sneeze.*?\]"), r"[sneeze]"),
            # abbreviations
            (re.compile(r"(\w)\.(\w)\.(\w)"), insert_groups_and_capitalize_3),
            (re.compile(r"(\w)\.(\w)"), insert_groups_and_capitalize_2),
            # NOTE(review): "[a-h,j-z]" also matches a literal comma — it looks
            # like the intent was a-h plus j-z (skipping "i"); confirm before
            # changing, so the pattern is kept as-is.
            (re.compile(r"([a-h,j-z])\."), insert_groups_and_capitalize_1),
            (re.compile(r"\._"), r" "),
            (re.compile(r"_(\w)"), insert_groups_and_capitalize_1),
            (re.compile(r"(\w)\.s"), insert_groups_and_capitalize_1s),
            # NOTE(review): text is lowercased before these run, so the [A-Z]
            # pattern can never match; presumably dead — confirm and remove.
            (re.compile(r"([A-Z])\'s"), insert_groups_and_capitalize_1s),
            (re.compile(r"(\s\w\b|^\w\b)"), insert_groups_and_capitalize_1),
            # words between apostrophes
            (re.compile(r"'(\S*?)'"), r"\1"),
            # dangling dashes (2 passes)
            (re.compile(r"\s-\s"), r" "),
            (re.compile(r"\s-\s"), r" "),
            # special symbol with trailing dash
            (re.compile(r"(\[.*?\])-"), r"\1"),
            # Just remove all dashes
            (re.compile(r"-"), r" "),
        ]

        # unwanted symbols in the transcripts
        self.remove_regexp_after = re.compile(
            r"|".join([
                # remaining punctuation
                r"\.",
                r",",
                r"\?",
                r"{",
                r"}",
                r"~",
                r"_\d",
            ])
        )

        # Collapses any whitespace run to a single space.
        self.whitespace_regexp = re.compile(r"\s+")

    def normalize(self, text: str) -> str:
        """Return the normalized form of one transcript string.

        Pipeline: lowercase -> delete pre-patterns -> apply ordered
        replacements -> delete residual punctuation -> squeeze whitespace.
        The replacement order matters (e.g. three-letter abbreviations must
        be handled before two-letter ones).
        """
        text = text.lower()

        # first remove
        text = self.remove_regexp_before.sub("", text)

        # then replace
        for pattern, sub in self.replace_regexps:
            text = pattern.sub(sub, text)

        # then remove
        text = self.remove_regexp_after.sub("", text)

        # then clean up whitespace
        text = self.whitespace_regexp.sub(" ", text).strip()

        return text
# fmt: on
|
||||||
|
|
||||||
|
|
||||||
|
def keep(sup: SupervisionSegment) -> bool:
    """Decide whether a supervision segment is retained.

    Drops segments whose raw text contains "((" (unintelligible-speech
    marker) or a "<german" language tag; keeps everything else.
    """
    text = sup.text
    return "((" not in text and "<german" not in text
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """CLI entry point: normalize supervisions and write the kept ones out."""
    args = get_args()
    sups = load_manifest_lazy_or_eager(args.input_sups)
    assert isinstance(sups, SupervisionSet)

    normalizer = FisherSwbdNormalizer()

    # Counters are tracked but not reported here (kept for parity/debugging).
    tot, skip = 0, 0
    with SupervisionSet.open_writer(args.output_sups) as writer:
        for sup in tqdm(sups, desc="Normalizing supervisions"):
            tot += 1

            # Drop segments flagged as unintelligible or foreign-language.
            if not keep(sup):
                skip += 1
                continue

            sup.text = normalizer.normalize(sup.text)
            # Normalization can strip the text entirely; skip empty segments.
            if not sup.text:
                skip += 1
                continue

            writer.write(sup)
|
||||||
|
|
||||||
|
|
||||||
|
def test():
    """Print before/after pairs for tricky transcripts (manual smoke check)."""
    normalizer = FisherSwbdNormalizer()
    samples = [
        "[laughterr] [SILENCE]",
        "[laugh] oh this is great [silence] <B_ASIDE> yes",
        "[laugh] oh this is [laught] this is great [silence] <B_ASIDE> yes",
        "i don't kn- - know A.B.C's",
        "so x. corp is good?",
        "'absolutely yes",
        "absolutely' yes",
        "'absolutely' yes",
        "'absolutely' yes 'aight",
        "ABSOLUTE[LY]",
        "ABSOLUTE[LY]-",
        "[AN]Y",
        "[AN]Y-",
        "[ADV]AN[TAGE]",
        "[ADV]AN[TAGE]-",
        "-[ADV]AN[TAGE]",
        "-[ADV]AN[TAGE]-",
        "[WEA[SONABLE]-/REASONABLE]",
        "[VOCALIZED-NOISE]-",
        "~BULL",
        "Frank E Peretti P E R E T T I",
        "yeah yeah like Double O Seven he’s supposed to do it",
        "P A P E R paper",
    ]
    for sample in samples:
        print(sample)
        print(normalizer.normalize(sample))
        print()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Run the real CLI entry point: the prepare script invokes this file with
    # input/output manifest paths, which test() would silently ignore.
    # (Call test() manually for an eyeball check of the normalizer rules.)
    main()
|
@ -1,5 +1,5 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
# Copyright 2023 (authors: Nagendra Goel https://github.com/ngoel17)
|
# Copyright 2023 (authors: Nagendra Goel https://github.com/ngoel17)
|
||||||
#
|
#
|
||||||
# See ../../../../LICENSE for clarification regarding multiple authors
|
# See ../../../../LICENSE for clarification regarding multiple authors
|
||||||
#
|
#
|
||||||
|
@ -68,7 +68,12 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
|
|||||||
# to respective dirs
|
# to respective dirs
|
||||||
mkdir -p data/manifests
|
mkdir -p data/manifests
|
||||||
if [ ! -e data/manifests/.swbd.done ]; then
|
if [ ! -e data/manifests/.swbd.done ]; then
|
||||||
lhotse prepare switchboard --absolute-paths True $swbd1_dir data/manifests_train
|
lhotse prepare switchboard --absolute-paths 1 --omit-silence $swbd1_dir data/manifests/swbd
|
||||||
|
./local/normalize_and_filter_supervisions.py \
|
||||||
|
data/manifests/swbd/swbd_supervisions.jsonl \
|
||||||
|
data/manifests/swbd/swbd_supervisions_norm.jsonl
|
||||||
|
cp data/manifests/swbd/swbd_recordings.jsonl data/manifests/recordings_swbd.jsonl
|
||||||
|
|
||||||
./local/swbd1_prepare_dict.sh
|
./local/swbd1_prepare_dict.sh
|
||||||
./local/swbd1_data_prep.sh $swbd1_dir
|
./local/swbd1_data_prep.sh $swbd1_dir
|
||||||
lhotse kaldi import data/local/train 8000 data/manifests_train
|
lhotse kaldi import data/local/train 8000 data/manifests_train
|
||||||
@ -78,7 +83,7 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
|
|||||||
lhotse prepare $eval2000_dir data/manifests_eval2000
|
lhotse prepare $eval2000_dir data/manifests_eval2000
|
||||||
./local/normalize_eval2000.py \
|
./local/normalize_eval2000.py \
|
||||||
data/manifests_eval2000/eval2000_supervisions_unnorm.jsonl.gz \
|
data/manifests_eval2000/eval2000_supervisions_unnorm.jsonl.gz \
|
||||||
data/manifests_eval2000/eval2000_supervisions.jsonl.gz
|
data/manifests_eval2000/eval2000_supervisions_norm.jsonl.gz
|
||||||
|
|
||||||
./local/rt03_data_prep.sh $rt03_dir
|
./local/rt03_data_prep.sh $rt03_dir
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user