From 84759a2244d4bcb829baeb0d3effbce60d9203de Mon Sep 17 00:00:00 2001
From: Your Name
Date: Sat, 16 Nov 2024 11:22:14 -0800
Subject: [PATCH] update text normalization

---
 egs/libriheavy/TTS/local/prepare_manifest.py | 241 ++++++++++++++++++-
 egs/libriheavy/TTS/prepare.sh                |  27 +--
 2 files changed, 241 insertions(+), 27 deletions(-)

diff --git a/egs/libriheavy/TTS/local/prepare_manifest.py b/egs/libriheavy/TTS/local/prepare_manifest.py
index f6e8f5a74..65d442f3a 100755
--- a/egs/libriheavy/TTS/local/prepare_manifest.py
+++ b/egs/libriheavy/TTS/local/prepare_manifest.py
@@ -28,24 +28,251 @@ from tqdm import tqdm
 
 from icefall.utils import str2bool
 
+units = [
+    "second",
+    "minute",
+    "quarter",
+    "hour",
+    "day",
+    "week",
+    "month",
+    "year",
+    "decade",
+    "century",
+    "millisecond",
+    "microsecond",
+    "nanosecond",
+    "am",
+    "a.m.",
+    "pm",
+    "p.m.",
+    "ad",
+    "a.d.",
+    "b.c.",
+    "bc",
+    "monday",
+    "mon",
+    "tuesday",
+    "tue",
+    "wednesday",
+    "wed",
+    "thursday",
+    "thu",
+    "friday",
+    "fri",
+    "saturday",
+    "sat",
+    "sunday",
+    "sun",
+    "january",
+    "jan",
+    "february",
+    "feb",
+    "march",
+    "mar",
+    "april",
+    "apr",
+    "may",
+    "jun",
+    "june",
+    "july",
+    "jul",
+    "august",
+    "aug",
+    "september",
+    "sep",
+    "october",
+    "oct",
+    "november",
+    "nov",
+    "december",
+    "dec",
+    "metre",
+    "meter",
+    "kilometer",
+    "centimeter",
+    "millimeter",
+    "micrometer",
+    "nanometer",
+    "inch",
+    "foot",
+    "feet",
+    "yard",
+    "mile",
+    "kilogram",
+    "gram",
+    "milligram",
+    "microgram",
+    "tonne",
+    "ton",
+    "pound",
+    "ounce",
+    "stone",
+    "carat",
+    "grain",
+    "cent",
+    "dollar",
+    "euro",
+    "pound",
+    "yen",
+    "celsius",
+    "fahrenheit",
+    "kelvin",
+    "square",
+    "acre",
+    "hectare",
+    "cubic",
+    "liter",
+    "milliliter",
+    "gallon",
+    "quart",
+    "pint",
+    "degree",
+    "radian",
+    "rad",
+    "percent",
+    "south",
+    "north",
+    "east",
+    "west",
+    "vote",
+    "passenger",
+    "fathom",
+    "intermediate",
+    "people",
+    "button",
+    "line",
+    "stitch",
+    "edge",
+    "time",
+    "vols.",
+]
+
+pre_units = [
+    "$",
+    "€",
+    "£",
+    "¥",
+]
+
+post_units = [
+    "°",
+    "%",
+    "s",
+    "ns",
+    "ms",
+    "min",
+    "h",
+    "d",
+    "wk",
+    "mo",
+    "yr",
+    "dec",
+    "cent",
+    "m",
+    "km",
+    "cm",
+    "mm",
+    "nm",
+    "in",
+    "ft",
+    "yd",
+    "mi",
+    "ly",
+    "kg",
+    "g",
+    "mg",
+    "t",
+    "tn",
+    "lb",
+    "oz",
+    "st",
+    "ct",
+    "gr",
+    "ha",
+    "ac",
+    "l",
+    "ml",
+    "gal",
+    "qt",
+    "pt",
+    "cc",
+    "°c",
+    "°f",
+    "k",
+    "hz",
+]
+
+del_start_phrases = [
+    "footnote",
+    "note",
+    "illustration",
+    "sidenote",
+    "page",
+]
+
+del_mid_phrases = [
+    "p.",
+    "page",
+    "Page",
+    "volumes",
+    "vol.",
+    "Vol.",
+    "edition",
+    "ed.",
+    "Edition",
+    "Ed.",
+]
+
+
 class TextNormalizer:
     def __init__(self):
         self.en_tn_model = EnNormalizer(cache_dir="/tmp/tn", overwrite_cache=False)
-        self.table = str.maketrans("’‘,。;?!():-《》、“”【】", "'',.;?!(): <>/\"\"[]")
+        self.table = str.maketrans("’‘,。;?!():-《》、“”【】_", "'',.;?!(): <>/\"\"[] ")
 
     def __call__(self, cut):
         text = cut["supervisions"][0]["custom"]["texts"][0]
-        # Apply mappings
         text = text.translate(self.table)
 
-        # Process brackets
-        text = re.sub(r"\([^)]*\)|\{[^}]*\}|\[[^\]]*\]|<[^>]*>", " ", text)
+        text = re.sub(r"\(\d+\)|\{\d+\}|\[\d+\]|<\d+>", " ", text)
 
-        # Process backslash
-        text = re.sub(r"\\.", "", text)
+        text = re.sub(r"\[FN#\d+\]", " ", text)
+
+        del_start_pattern = rf"(?i)[\{{\[<\(]\s*({'|'.join(del_start_phrases)})\b.*?[\}}>\]\)]|[\{{\[<\(]\s*({'|'.join(del_start_phrases)})\b.*?$"
+        text = re.sub(del_start_pattern, " ", text)
+
+        pattern = r"\([^\)]*?\d+[^\)]*?\)|\{[^\}]*?\d+[^\}]*?\}|\[[^\]]*?\d+[^\]]*?\]|<[^>]*?\d+[^>]*?>"
r"\([^\)]*?\d+[^\)]*?\)|\{[^\}]*?\d+[^\}]*?\}|\[[^\]]*?\d+[^\]]*?\]|<[^>]*?\d+[^>]*?>" + del_mid_pattern = ( + r"(?:(?:^|\s)(?:" + "|".join(map(re.escape, del_mid_phrases)) + r")\b)" + ) + unit_pattern = ( + r"(?i)\b(" + + "|".join([re.escape(unit) + r"(?:s|es)?" for unit in units]) + + r")\b" + ) + pre_units_pattern = r"(?i)(" + "|".join(map(re.escape, pre_units)) + r")\d+" + post_units_pattern = r"(?i)\d+(" + "|".join(map(re.escape, post_units)) + r")" + + if (match := re.search(pattern, text)) is not None: + content = match.group(0) + if re.search(del_mid_pattern, content) or not ( + re.search(unit_pattern, content) + or re.search(pre_units_pattern, content) + or re.search(post_units_pattern, content) + ): + text = text.replace(content, " ") + + text = re.sub(r"\b\d+:\d{3,}\b", "", text) + text = re.sub(r"\b\d+:\b\d+:\d{3,}\b", "", text) + + text = re.sub(r"\\\"", "", text) + text = re.sub(r"\\\'", "", text) + text = re.sub(r"\\", "", text) + + text = re.sub(r"\.{3,}", "…", text) + text = re.sub(r"[^\w\s.,!?;:…\'']", " ", text) - # Remove extra spaces text = re.sub(r"\s+", " ", text).strip() if len(text) == 0: diff --git a/egs/libriheavy/TTS/prepare.sh b/egs/libriheavy/TTS/prepare.sh index d0cceab4f..9cda6a721 100755 --- a/egs/libriheavy/TTS/prepare.sh +++ b/egs/libriheavy/TTS/prepare.sh @@ -92,22 +92,9 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then done fi -num_per_split=200000 -if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then - log "Stage 2: Split medium and large subsets." - for subset in medium large; do - log "Spliting subset : $subset" - split_dir=$manifests_dir/libriheavy_${subset}_split - mkdir -p $split_dir - if [ ! -e $split_dir/.split_completed ]; then - lhotse split-lazy $manifests_dir/libriheavy_cuts_${subset}.jsonl.gz $split_dir $num_per_split - touch $split_dir/.split_completed - fi - done -fi -if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then - log "Stage 3: Train BPE model for normalized text" +if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then + log "Stage 2: Train BPE model for normalized text" if [ ! -f data/texts ]; then gunzip -c $manifests_dir/libriheavy_cuts_medium.jsonl.gz \ @@ -130,8 +117,8 @@ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then done fi -if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then - log "Stage 4: Extract speech tokens." +if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then + log "Stage 3: Extract speech tokens." mkdir -p $tokens_dir for subset in small medium large; do if [ ! -e $tokens_dir/libriheavy_${subset}.jsonl.gz ]; then @@ -149,13 +136,13 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then --batch_size 32 \ --model "speech_tokenizer_v1" - cat $output_dir/part* | gzip > $output_dir/libriheavy_${subset}.jsonl.gz && rm -rf $output_dir + cat $output_dir/part* | gzip > $tokens_dir/libriheavy_${subset}.jsonl.gz && rm -rf $output_dir fi done fi -if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then - log "Stage 5: Attach speech tokens." +if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then + log "Stage 4: Attach speech tokens." for subset in small medium large; do log "Attach speech tokens for subset: $subset" if [ ! -e $tokens_dir/libriheavy_cuts_${subset}.jsonl.gz ]; then