minor updates

This commit is contained in:
JinZr 2023-08-18 11:19:08 +08:00
parent 58d9088010
commit ab07e58613
5 changed files with 131 additions and 75 deletions

View File

@@ -53,7 +53,6 @@ def insert_groups_and_capitalize_1s(match):
     return f"{match.group(1)}".upper() + "'s"
-# fmt: off
 class FisherSwbdNormalizer:
     """Note: the functions "normalize" and "keep" implement the logic
     similar to Kaldi's data prep scripts for Fisher and SWBD: One
@@ -62,18 +61,21 @@ class FisherSwbdNormalizer:
     normalization from Kaldi (hopefully won't make too much
     difference).
     """
-    def __init__(self) -> None:
+    def __init__(self) -> None:
         self.remove_regexp_before = re.compile(
-            r"|".join([
-                # special symbols
-                r"\[\[skip.*\]\]",
-                r"\[skip.*\]",
-                r"\[pause.*\]",
-                r"\[silence\]",
-                r"<b_aside>",
-                r"<e_aside>",
-            ])
+            r"|".join(
+                [
+                    # special symbols
+                    r"\[\[skip.*\]\]",
+                    r"\[skip.*\]",
+                    r"\[pause.*\]",
+                    r"\[silence\]",
+                    r"<b_aside>",
+                    r"<e_aside>",
+                    r"_1",
+                ]
+            )
         )
         # tuples of (pattern, replacement)
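The widened pre-filter now also strips the literal "_1" suffix that appears on some SWBD tokens (e.g. "okay_1"). A quick standalone sanity check of the alternation above (the sample string is an assumption; leftover double spaces are collapsed later by whitespace_regexp):

    import re

    remove_before = re.compile(
        r"|".join(
            [
                r"\[\[skip.*\]\]",
                r"\[skip.*\]",
                r"\[pause.*\]",
                r"\[silence\]",
                r"<b_aside>",
                r"<e_aside>",
                r"_1",
            ]
        )
    )

    # "_1" and "[silence]" are deleted; bracketed noise tags like "[noise]" survive.
    print(remove_before.sub("", "[noise] okay_1 [silence] um"))  # -> "[noise] okay  um"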
@@ -102,14 +104,54 @@ class FisherSwbdNormalizer:
             (re.compile(r"\[lipsmack.*?\]"), r"[lipsmack]"),
             (re.compile(r"\[sneeze.*?\]"), r"[sneeze]"),
             # abbreviations
-            (re.compile(r"(\w)\.(\w)\.(\w)",), insert_groups_and_capitalize_3),
-            (re.compile(r"(\w)\.(\w)",), insert_groups_and_capitalize_2),
-            (re.compile(r"([a-h,j-z])\.",), insert_groups_and_capitalize_1),
-            (re.compile(r"\._",), r" "),
-            (re.compile(r"_(\w)",), insert_groups_and_capitalize_1),
-            (re.compile(r"(\w)\.s",), insert_groups_and_capitalize_1s),
-            (re.compile(r"([A-Z])\'s",), insert_groups_and_capitalize_1s),
-            (re.compile(r"(\s\w\b|^\w\b)",), insert_groups_and_capitalize_1),
+            (
+                re.compile(
+                    r"(\w)\.(\w)\.(\w)",
+                ),
+                insert_groups_and_capitalize_3,
+            ),
+            (
+                re.compile(
+                    r"(\w)\.(\w)",
+                ),
+                insert_groups_and_capitalize_2,
+            ),
+            (
+                re.compile(
+                    r"([a-h,j-z])\.",
+                ),
+                insert_groups_and_capitalize_1,
+            ),
+            (
+                re.compile(
+                    r"\._",
+                ),
+                r" ",
+            ),
+            (
+                re.compile(
+                    r"_(\w)",
+                ),
+                insert_groups_and_capitalize_1,
+            ),
+            (
+                re.compile(
+                    r"(\w)\.s",
+                ),
+                insert_groups_and_capitalize_1s,
+            ),
+            (
+                re.compile(
+                    r"([A-Z])\'s",
+                ),
+                insert_groups_and_capitalize_1s,
+            ),
+            (
+                re.compile(
+                    r"(\s\w\b|^\w\b)",
+                ),
+                insert_groups_and_capitalize_1,
+            ),
             # words between apostrophes
             (re.compile(r"'(\S*?)'"), r"\1"),
             # dangling dashes (2 passes)
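The reflowed abbreviation rules all dispatch to small callbacks. Only insert_groups_and_capitalize_1s is visible at the top of this diff; by analogy with it, the other helpers presumably join the captured groups with spaces and uppercase them (the bodies below are assumptions, not code from this commit):

    import re

    def insert_groups_and_capitalize_3(match):
        return f"{match.group(1)} {match.group(2)} {match.group(3)}".upper()

    def insert_groups_and_capitalize_2(match):
        return f"{match.group(1)} {match.group(2)}".upper()

    def insert_groups_and_capitalize_1(match):
        return f"{match.group(1)}".upper()

    # Three-letter abbreviations become spaced capitals:
    print(re.compile(r"(\w)\.(\w)\.(\w)").sub(insert_groups_and_capitalize_3, "u.s.a"))  # "U S A"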
@@ -119,25 +161,29 @@ class FisherSwbdNormalizer:
             (re.compile(r"(\[.*?\])-"), r"\1"),
             # Just remove all dashes
             (re.compile(r"-"), r" "),
-            # Fix an issue related to [vocalized-noise]
-            (re.compile(r"\[vocalized noise\]"), r"\[vocalized-noise\]"),
         ]
         # unwanted symbols in the transcripts
         self.remove_regexp_after = re.compile(
-            r"|".join([
-                # remaining punctuation
-                r"\.",
-                r",",
-                r"\?",
-                r"{",
-                r"}",
-                r"~",
-                r"_\d",
-            ])
+            r"|".join(
+                [
+                    # remaining punctuation
+                    r"\.",
+                    r",",
+                    r"\?",
+                    r"{",
+                    r"}",
+                    r"~",
+                    r"_\d",
+                ]
+            )
         )
+        self.post_fixes = [
+            # Fix an issue related to [VOCALIZED NOISE] after dash removal
+            (re.compile(r"\[vocalized noise\]"), "[vocalized-noise]"),
+        ]
         self.whitespace_regexp = re.compile(r"\s+")

     def normalize(self, text: str) -> str:
@@ -153,11 +199,14 @@ class FisherSwbdNormalizer:
         # then remove
         text = self.remove_regexp_after.sub("", text)
+        # post fixes
+        for pattern, sub in self.post_fixes:
+            text = pattern.sub(sub, text)
         # then clean up whitespace
         text = self.whitespace_regexp.sub(" ", text).strip()
         return text.upper()
-# fmt: on
 def keep(sup: SupervisionSegment) -> bool:
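Moving the [vocalized noise] repair into self.post_fixes matters because the blanket dash removal lives in the same replacement list that used to contain it: once r"-" -> r" " has run, the tag reads "[vocalized noise]", and it can only be restored afterwards. The new replacement string also drops the escaped backslashes of the old one, which re.sub would have emitted literally. A minimal sketch of the ordering:

    import re

    dash_removal = (re.compile(r"-"), r" ")
    post_fix = (re.compile(r"\[vocalized noise\]"), "[vocalized-noise]")

    text = "[vocalized-noise]- yeah"
    text = dash_removal[0].sub(dash_removal[1], text)  # "[vocalized noise]  yeah"
    text = post_fix[0].sub(post_fix[1], text)          # "[vocalized-noise]  yeah"
    print(text.upper())                                # "[VOCALIZED-NOISE]  YEAH"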
@@ -186,7 +235,7 @@ def main():
             skip += 1
             continue
-        sup.text = normalizer.normalize(sup.text)
+        sup.text = normalizer.normalize(sup.text).upper()
         if not sup.text:
             skip += 1
             continue
@@ -219,8 +268,9 @@ def test():
         "[VOCALIZED-NOISE]-",
         "~BULL",
         "Frank E Peretti P E R E T T I",
-        "yeah yeah like Double O Seven hes supposed to do it",
+        "yeah yeah like Double O Seven he's supposed to do it",
         "P A P E R paper",
+        "[noise] okay_1 um let me see [laughter] i've been sitting here awhile",
     ]:
         print(text)
         print(normalizer.normalize(text))
@@ -228,5 +278,6 @@ def test():
 if __name__ == "__main__":
     # test(); exit()
     test()
+    # exit()
     main()
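Given the rules above, the newly added test sentence exercises both the "_1" pre-filter and the tag handling; a rough expectation (assumed, not output captured from this commit):

    normalizer = FisherSwbdNormalizer()
    print(normalizer.normalize(
        "[noise] okay_1 um let me see [laughter] i've been sitting here awhile"
    ))
    # expected: [NOISE] OKAY UM LET ME SEE [LAUGHTER] I'VE BEEN SITTING HERE AWHILE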

View File

@@ -83,9 +83,7 @@ def replace_silphone(text: str) -> str:
     text = text.replace("[[HIGH PITCHED SQUEAKY VOICE]]", " ")
     text = text.replace("[[IN THE LAUGH]]", "[LAUGHTER]")
     text = text.replace("[[LAST WORD SPOKEN WITH A LAUGH]]", "[LAUGHTER]")
-    text = text.replace(
-        "[[PART OF FIRST SYLLABLE OF PREVIOUS WORD CUT OFF]]", " "
-    )
+    text = text.replace("[[PART OF FIRST SYLLABLE OF PREVIOUS WORD CUT OFF]]", " ")
     text = text.replace("[[PREVIOUS WORD SPOKEN WITH A LAUGH]]", " ")
     text = text.replace("[[PREVIOUS TWO WORDS SPOKEN WHILE LAUGHING]]", " ")
     text = text.replace("[[PROLONGED]]", " ")
@@ -181,6 +179,7 @@ def replace_silphone(text: str) -> str:
     text = text.replace("[LAUGHTER]", " ")
     text = text.replace("[NOISE]", " ")
     text = text.replace("[VOCALIZED-NOISE]", " ")
+    text = text.replace("-", " ")
     return text
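The new blanket dash removal is appended after all bracketed-tag replacements, so compound tags such as [VOCALIZED-NOISE] are blanked whole before their inner dash could be orphaned, while hyphenated words split into separate tokens. A condensed sketch of that tail of the function (the name replace_silphone_tail is hypothetical):

    def replace_silphone_tail(text: str) -> str:
        # tags first, then the blanket dash removal added in this commit
        text = text.replace("[VOCALIZED-NOISE]", " ")
        text = text.replace("-", " ")
        return text

    print(replace_silphone_tail("UH [VOCALIZED-NOISE] FORTY-FIVE"))  # "UH   FORTY FIVE"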
@@ -231,4 +230,4 @@ def main():
 if __name__ == "__main__":
-    main()
+    main()

View File

@@ -210,15 +210,15 @@ def main():
     excluded = [
         "<eps>",
-        "!sil",
-        "<spoken_noise>",
+        "!SIL",
+        "<SPOKEN_NOISE>",
         args.oov,
         "#0",
         "<s>",
         "</s>",
-        "[vocalized-noise]",
-        "[noise]",
-        "[laughter]",
+        "[VOCALIZED-NOISE]",
+        "[NOISE]",
+        "[LAUGHTER]",
     ]
     for w in excluded:
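The excluded entries are uppercased here to stay in step with the now-uppercase lexicon and transcripts; the membership test that consumes this list (outside the hunk) is case-sensitive, so mismatched case would silently re-admit the special symbols. A toy illustration (args.oov is assumed to be <UNK>):

    excluded = ["<eps>", "!SIL", "<SPOKEN_NOISE>", "<UNK>", "#0", "<s>", "</s>",
                "[VOCALIZED-NOISE]", "[NOISE]", "[LAUGHTER]"]

    words = ["[NOISE]", "A", "AARDVARK", "<s>"]
    print([w for w in words if w not in excluded])  # ['A', 'AARDVARK']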

View File

@@ -46,11 +46,11 @@ cp local/MSU_single_letter.txt $dir/
 # The original swbd lexicon does not have a precise single-letter lexicon
 # e.g. it does not have an entry for W
 (
-  echo '!sil sil'
-  echo '[vocalized-noise] spn'
-  echo '[noise] nsn'
-  echo '[laughter] lau'
-  echo '<unk> spn'
+  echo '!SIL SIL'
+  echo '[VOCALIZED-NOISE] spn'
+  echo '[NOISE] nsn'
+  echo '[LAUGHTER] lau'
+  echo '<UNK> spn'
 ) |
   cat - $dir/lexicon1.txt $dir/MSU_single_letter.txt >$dir/lexicon2.txt || exit 1
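The pipeline prepends the special entries to lexicon1.txt and the MSU single-letter list to form lexicon2.txt. A Python equivalent for readers less at home with the echo/cat idiom (file names are the script's $dir-relative names and otherwise assumptions):

    special = [
        ("!SIL", "SIL"),
        ("[VOCALIZED-NOISE]", "spn"),
        ("[NOISE]", "nsn"),
        ("[LAUGHTER]", "lau"),
        ("<UNK>", "spn"),
    ]
    with open("lexicon2.txt", "w") as out:
        for word, phones in special:
            print(word, phones, file=out)
        for path in ("lexicon1.txt", "MSU_single_letter.txt"):
            with open(path) as f:
                out.write(f.read())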

View File

@@ -43,9 +43,9 @@ fisher_dir="/export/corpora3/LDC/LDC2004T19"
 # It will generate data/lang_bpe_xxx,
 # data/lang_bpe_yyy if the array contains xxx, yyy
 vocab_sizes=(
-  5000
-  2000
-  1000
+  # 5000
+  # 2000
+  # 1000
   500
 )
@@ -73,6 +73,7 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
   ./local/normalize_and_filter_supervisions.py \
     data/manifests/swbd/swbd_supervisions_all.jsonl.gz \
     data/manifests/swbd/swbd_supervisions_all_norm.jsonl.gz
+  mv data/manifests/swbd/swbd_supervisions_all.jsonl.gz data/manifests/swbd/swbd_supervisions_orig.jsonl.gz
   mv data/manifests/swbd/swbd_supervisions_all_norm.jsonl.gz data/manifests/swbd/swbd_supervisions_all.jsonl.gz

   lhotse prepare eval2000 --absolute-paths 1 $eval2000_dir data/manifests/eval2000
@@ -149,8 +150,8 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
       | jq '.text' | sed 's/"//g' > $lang_dir/text
   fi
-  log "prepare dict"
-  ./local/swbd1_prepare_dict.sh $swbd1_dir
+  log "Prepare dict"
+  ./local/swbd1_prepare_dict.sh
   cut -f 2- -d" " $lang_dir/text >${lang_dir}/input.txt
   # [noise] nsn
   # !sil sil
@@ -336,6 +337,10 @@ if [ $stage -le 11 ] && [ $stop_stage -ge 11 ]; then
     out_dir=data/lm_training_bpe_${vocab_size}
     mkdir -p $out_dir
+    if [ ! -f $out_dir/train.txt ]; then
+      tail -n 250000 data/lang_phone/input.txt > $out_dir/train.txt
+    fi
     ./local/prepare_lm_training_data.py \
       --bpe-model $lang_dir/bpe.model \
       --lm-data data/lang_phone/input.txt \
@@ -343,29 +348,29 @@ if [ $stage -le 11 ] && [ $stop_stage -ge 11 ]; then
   done
 fi
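Stage 11 now materializes a train.txt holding the last 250000 lines of the normalized text. In Python terms (a sketch under the assumption that input.txt holds one utterance per line):

    with open("data/lang_phone/input.txt") as f:
        lines = f.readlines()

    with open("train.txt", "w") as f:
        f.writelines(lines[-250000:])  # tail -n 250000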
-# if [ $stage -le 12 ] && [ $stop_stage -ge 12 ]; then
-#   log "Stage 12: Generate LM validation data"
+if [ $stage -le 12 ] && [ $stop_stage -ge 12 ]; then
+  log "Stage 12: Generate LM validation data"
-#   for vocab_size in ${vocab_sizes[@]}; do
-#     log "Processing vocab_size == ${vocab_size}"
-#     out_dir=data/lm_training_bpe_${vocab_size}
-#     mkdir -p $out_dir
+  for vocab_size in ${vocab_sizes[@]}; do
+    log "Processing vocab_size == ${vocab_size}"
+    out_dir=data/lm_training_bpe_${vocab_size}
+    mkdir -p $out_dir
-#     if [ ! -f $out_dir/valid.txt ]; then
-#       TODO: generate valid.txt
-#     fi
+    if [ ! -f $out_dir/valid.txt ]; then
+      head -n 14332 data/lang_phone/input.txt > $out_dir/valid.txt
+    fi
-#     lang_dir=data/lang_bpe_${vocab_size}
-#     ./local/prepare_lm_training_data.py \
-#       --bpe-model $lang_dir/bpe.model \
-#       --lm-data $out_dir/valid.txt \
-#       --lm-archive $out_dir/lm_data-valid.pt
-#   done
-# fi
+    lang_dir=data/lang_bpe_${vocab_size}
+    ./local/prepare_lm_training_data.py \
+      --bpe-model $lang_dir/bpe.model \
+      --lm-data $out_dir/valid.txt \
+      --lm-archive $out_dir/lm_data-valid.pt
+  done
+fi
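The previously stubbed Stage 12 is enabled: the first 14332 lines of the same input.txt become valid.txt, complementing the tail-based train split above. Provided input.txt has more than 14332 + 250000 lines, the two slices are disjoint. A sketch:

    from itertools import islice

    with open("data/lang_phone/input.txt") as f:
        valid = list(islice(f, 14332))  # head -n 14332

    with open("valid.txt", "w") as f:
        f.writelines(valid)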
 if [ $stage -le 13 ] && [ $stop_stage -ge 13 ]; then
   log "Stage 13: Generate LM test data"
-  testsets=(eval2000 rt03)
+  testsets=(eval2000)

   for testset in ${testsets[@]}; do
     for vocab_size in ${vocab_sizes[@]}; do
@@ -373,8 +378,9 @@ if [ $stage -le 13 ] && [ $stop_stage -ge 13 ]; then
       out_dir=data/lm_training_bpe_${vocab_size}
       mkdir -p $out_dir
-      if [ ! -f $out_dir/test.txt ]; then
-        cat data/local/${testset}/text | cut -d " " -f 2- >$out_dir/${testset}.txt
+      if [ ! -f $out_dir/${testset}.txt ]; then
+        gunzip -c data/manifests/${testset}/eval2000_supervisions_all.jsonl.gz \
+          | jq '.text' | sed 's/"//g' > $out_dir/${testset}.txt
       fi

       lang_dir=data/lang_bpe_${vocab_size}
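Stage 13 now pulls test text straight from the Lhotse supervision manifest instead of the Kaldi-style data/local/${testset}/text. The gunzip | jq '.text' | sed pipeline is the shell spelling of "read JSONL, take the text field, strip quotes"; in Python, json.loads does the unquoting in one step (paths mirror the script and are otherwise assumptions):

    import gzip
    import json

    path = "data/manifests/eval2000/eval2000_supervisions_all.jsonl.gz"
    with gzip.open(path, "rt") as f:
        texts = [json.loads(line)["text"] for line in f]

    with open("eval2000.txt", "w") as f:
        f.write("\n".join(texts) + "\n")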
@@ -388,7 +394,7 @@ fi
 if [ $stage -le 14 ] && [ $stop_stage -ge 14 ]; then
   log "Stage 14: Sort LM training data"
-  testsets=(eval2000 rt03)
+  testsets=(eval2000)

   # Sort LM training data by sentence length in descending order
   # for ease of training.
   #