update text normalization

update

fix

fix

fix
Your Name 2024-11-16 11:22:14 -08:00
parent 49150ff9ab
commit 84759a2244
2 changed files with 241 additions and 27 deletions

@@ -28,24 +28,251 @@ from tqdm import tqdm
from icefall.utils import str2bool
units = [
"second",
"minute",
"quarter",
"hour",
"day",
"week",
"month",
"year",
"decade",
"century",
"millisecond",
"microsecond",
"nanosecond",
"am",
"a.m.",
"pm",
"p.m.",
"a.d.",
"a.d.",
"b.c.",
"bc",
"monday",
"mon",
"tuesday",
"tue",
"wednesday",
"wed",
"thursday",
"thu",
"friday",
"fri",
"saturday",
"sat",
"sunday",
"sun",
"january",
"jan",
"february",
"feb",
"march",
"mar",
"april",
"apr",
"may",
"jun",
"june",
"july",
"jul",
"august",
"aug",
"september",
"sep",
"october",
"oct",
"november",
"nov",
"december",
"dec",
"metre",
"meter",
"kilometer",
"centimeter",
"millimeter",
"micrometer",
"nanometer",
"inch",
"foot",
"feet",
"yard",
"mile",
"kilogram",
"gram",
"milligram",
"microgram",
"tonne",
"ton",
"pound",
"ounce",
"stone",
"carat",
"grain",
"cent",
"dollar",
"euro",
"pound",
"yen",
"celsius",
"fahrenheit",
"kelvin",
"square",
"acre",
"hectare",
"cubic",
"liter",
"milliliter",
"gallon",
"quart",
"pint",
"degree",
"radian",
"rad",
"percent",
"south",
"north",
"east",
"west",
"vote",
"passenger",
"fathom",
"intermediate",
"people",
"button",
"line",
"stitch",
"edge",
"time",
"vols.",
]
pre_units = [
"$",
"",
"£",
"¥",
]
post_units = [
"°",
"%",
"s",
"ns",
"ms",
"min",
"h",
"d",
"wk",
"mo",
"yr",
"dec",
"cent",
"m",
"km",
"cm",
"mm",
"nm",
"in",
"ft",
"yd",
"mi",
"ly",
"kg",
"g",
"mg",
"t",
"tn",
"lb",
"oz",
"st",
"ct",
"gr",
"ha",
"ac",
"l",
"ml",
"gal",
"qt",
"pt",
"cc",
"°c",
"°f",
"k",
"hz",
]
del_start_phrases = [
"footnote",
"note",
"illustration",
"sidenote",
"page",
]
del_mid_phrases = [
"p.",
"page",
"Page",
"volumes",
"vol.",
"Vol.",
"edition",
"ed.",
"Edition",
"Ed.",
]
class TextNormalizer:
    def __init__(self):
        self.en_tn_model = EnNormalizer(cache_dir="/tmp/tn", overwrite_cache=False)
-        self.table = str.maketrans("’‘,。;?!():-《》、“”【】", "'',.;?!(): <>/\"\"[]")
+        self.table = str.maketrans("’‘,。;?!():-《》、“”【】_", "'',.;?!(): <>/\"\"[] ")

    def __call__(self, cut):
        text = cut["supervisions"][0]["custom"]["texts"][0]
        # Apply character mappings
        text = text.translate(self.table)
        # Process brackets
        text = re.sub(r"\([^)]*\)|\{[^}]*\}|\[[^\]]*\]|<[^>]*>", " ", text)
        text = re.sub(r"\(\d+\)|\{\d+\}|\[\d+\]|<\d+>", " ", text)
        # Process backslash escapes and footnote markers
        text = re.sub(r"\\.", "", text)
        text = re.sub(r"\[FN#\d+\]", " ", text)
        del_start_pattern = rf"(?i)[\{{\[<\(]\s*({'|'.join(del_start_phrases)})\b.*?[\}}>\]\)]|[\{{\[<\(]\s*({'|'.join(del_start_phrases)})\b.*?$"
        text = re.sub(del_start_pattern, " ", text)
        pattern = r"\([^\)]*?\d+[^\)]*?\)|\{[^\}]*?\d+[^\}]*?\}|\[[^\]]*?\d+[^\]]*?\]|<[^>]*?\d+[^>]*?>"
        del_mid_pattern = (
            r"(?:(?:^|\s)(?:" + "|".join(map(re.escape, del_mid_phrases)) + r")\b)"
        )
        unit_pattern = (
            r"(?i)\b("
            + "|".join([re.escape(unit) + r"(?:s|es)?" for unit in units])
            + r")\b"
        )
        pre_units_pattern = r"(?i)(" + "|".join(map(re.escape, pre_units)) + r")\d+"
        post_units_pattern = r"(?i)\d+(" + "|".join(map(re.escape, post_units)) + r")"
        if (match := re.search(pattern, text)) is not None:
            content = match.group(0)
            if re.search(del_mid_pattern, content) or not (
                re.search(unit_pattern, content)
                or re.search(pre_units_pattern, content)
                or re.search(post_units_pattern, content)
            ):
                text = text.replace(content, " ")
        text = re.sub(r"\b\d+:\d{3,}\b", "", text)
        text = re.sub(r"\b\d+:\b\d+:\d{3,}\b", "", text)
        text = re.sub(r"\\\"", "", text)
        text = re.sub(r"\\\'", "", text)
        text = re.sub(r"\\", "", text)
        text = re.sub(r"\.{3,}", "", text)
        text = re.sub(r"[^\w\s.,!?;:…\'']", " ", text)
        # Remove extra spaces
        text = re.sub(r"\s+", " ", text).strip()
        if len(text) == 0:
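The hunk is truncated here, mid-function. As a sanity check on the bracket rules above, the following minimal sketch (not part of the commit) isolates the unit-protection step: a bracketed span containing digits is deleted unless it mentions a recognizable unit. The two-entry stand-in lists and the sample strings are hypothetical.

import re

# Abbreviated stand-ins for the full lists defined above (hypothetical).
units = ["mile", "km"]
del_mid_phrases = ["p.", "page"]

pattern = r"\[[^\]]*?\d+[^\]]*?\]"
del_mid_pattern = r"(?:(?:^|\s)(?:" + "|".join(map(re.escape, del_mid_phrases)) + r")\b)"
unit_pattern = r"(?i)\b(" + "|".join(re.escape(u) + r"(?:s|es)?" for u in units) + r")\b"

for text in ["He walked [12 miles] home.", "See [p. 123] for details."]:
    if (match := re.search(pattern, text)) is not None:
        content = match.group(0)
        # Delete the span if it looks like a page/volume reference,
        # or if it contains no recognizable unit.
        if re.search(del_mid_pattern, content) or not re.search(unit_pattern, content):
            text = text.replace(content, " ")
    print(text)
# He walked [12 miles] home.   (unit present, span kept)
# See   for details.           (no unit, span dropped)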

@@ -92,22 +92,9 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
  done
fi
-num_per_split=200000
-if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
-  log "Stage 2: Split medium and large subsets."
-  for subset in medium large; do
-    log "Spliting subset : $subset"
-    split_dir=$manifests_dir/libriheavy_${subset}_split
-    mkdir -p $split_dir
-    if [ ! -e $split_dir/.split_completed ]; then
-      lhotse split-lazy $manifests_dir/libriheavy_cuts_${subset}.jsonl.gz $split_dir $num_per_split
-      touch $split_dir/.split_completed
-    fi
-  done
-fi
-if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
-  log "Stage 3: Train BPE model for normalized text"
+if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
+  log "Stage 2: Train BPE model for normalized text"
  if [ ! -f data/texts ]; then
    gunzip -c $manifests_dir/libriheavy_cuts_medium.jsonl.gz \
@@ -130,8 +117,8 @@ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
  done
fi
-if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
-  log "Stage 4: Extract speech tokens."
+if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
+  log "Stage 3: Extract speech tokens."
  mkdir -p $tokens_dir
  for subset in small medium large; do
    if [ ! -e $tokens_dir/libriheavy_${subset}.jsonl.gz ]; then
@@ -149,13 +136,13 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
        --batch_size 32 \
        --model "speech_tokenizer_v1"
-      cat $output_dir/part* | gzip > $output_dir/libriheavy_${subset}.jsonl.gz && rm -rf $output_dir
+      cat $output_dir/part* | gzip > $tokens_dir/libriheavy_${subset}.jsonl.gz && rm -rf $output_dir
    fi
  done
fi
-if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
-  log "Stage 5: Attach speech tokens."
+if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
+  log "Stage 4: Attach speech tokens."
  for subset in small medium large; do
    log "Attach speech tokens for subset: $subset"
    if [ ! -e $tokens_dir/libriheavy_cuts_${subset}.jsonl.gz ]; then
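The diff is cut off inside the attach stage. For intuition only, here is a minimal sketch of what attaching speech tokens to cut manifests typically amounts to: joining the extracted token records to the cuts by id and writing the tokens into each cut's custom field. The file names and record keys ("key", "code", "speech_tokens") are assumptions, not taken from this commit.

import gzip
import json

# Hypothetical file names and field names, for illustration only.
with gzip.open("tokens/libriheavy_small.jsonl.gz", "rt") as f:
    tokens = {rec["key"]: rec["code"] for rec in map(json.loads, f)}

with gzip.open("manifests/libriheavy_cuts_small.jsonl.gz", "rt") as fin, gzip.open(
    "tokens/libriheavy_cuts_small.jsonl.gz", "wt"
) as fout:
    for line in fin:
        cut = json.loads(line)
        custom = cut.setdefault("custom", {})
        # Look up this cut's token sequence by id; skip cuts without tokens.
        if cut["id"] in tokens:
            custom["speech_tokens"] = tokens[cut["id"]]
            fout.write(json.dumps(cut) + "\n")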