minor updates

JinZr 2023-08-18 11:19:08 +08:00
parent 58d9088010
commit ab07e58613
5 changed files with 131 additions and 75 deletions

View File

@@ -53,7 +53,6 @@ def insert_groups_and_capitalize_1s(match):
     return f"{match.group(1)}".upper() + "'s"


-# fmt: off
 class FisherSwbdNormalizer:
     """Note: the functions "normalize" and "keep" implement the logic
     similar to Kaldi's data prep scripts for Fisher and SWBD: One
@@ -62,10 +61,11 @@ class FisherSwbdNormalizer:
     normalization from Kaldi (hopefully won't make too much
     difference).
     """
+
     def __init__(self) -> None:
         self.remove_regexp_before = re.compile(
-            r"|".join([
+            r"|".join(
+                [
                     # special symbols
                     r"\[\[skip.*\]\]",
                     r"\[skip.*\]",
@@ -73,7 +73,9 @@ class FisherSwbdNormalizer:
                     r"\[silence\]",
                     r"<b_aside>",
                     r"<e_aside>",
-            ])
+                    r"_1",
+                ]
+            )
         )

         # tuples of (pattern, replacement)
@@ -102,14 +104,54 @@ class FisherSwbdNormalizer:
             (re.compile(r"\[lipsmack.*?\]"), r"[lipsmack]"),
             (re.compile(r"\[sneeze.*?\]"), r"[sneeze]"),
             # abbreviations
-            (re.compile(r"(\w)\.(\w)\.(\w)",), insert_groups_and_capitalize_3),
-            (re.compile(r"(\w)\.(\w)",), insert_groups_and_capitalize_2),
-            (re.compile(r"([a-h,j-z])\.",), insert_groups_and_capitalize_1),
-            (re.compile(r"\._",), r" "),
-            (re.compile(r"_(\w)",), insert_groups_and_capitalize_1),
-            (re.compile(r"(\w)\.s",), insert_groups_and_capitalize_1s),
-            (re.compile(r"([A-Z])\'s",), insert_groups_and_capitalize_1s),
-            (re.compile(r"(\s\w\b|^\w\b)",), insert_groups_and_capitalize_1),
+            (
+                re.compile(
+                    r"(\w)\.(\w)\.(\w)",
+                ),
+                insert_groups_and_capitalize_3,
+            ),
+            (
+                re.compile(
+                    r"(\w)\.(\w)",
+                ),
+                insert_groups_and_capitalize_2,
+            ),
+            (
+                re.compile(
+                    r"([a-h,j-z])\.",
+                ),
+                insert_groups_and_capitalize_1,
+            ),
+            (
+                re.compile(
+                    r"\._",
+                ),
+                r" ",
+            ),
+            (
+                re.compile(
+                    r"_(\w)",
+                ),
+                insert_groups_and_capitalize_1,
+            ),
+            (
+                re.compile(
+                    r"(\w)\.s",
+                ),
+                insert_groups_and_capitalize_1s,
+            ),
+            (
+                re.compile(
+                    r"([A-Z])\'s",
+                ),
+                insert_groups_and_capitalize_1s,
+            ),
+            (
+                re.compile(
+                    r"(\s\w\b|^\w\b)",
+                ),
+                insert_groups_and_capitalize_1,
+            ),
             # words between apostrophes
             (re.compile(r"'(\S*?)'"), r"\1"),
             # dangling dashes (2 passes)
@@ -119,14 +161,12 @@ class FisherSwbdNormalizer:
             (re.compile(r"(\[.*?\])-"), r"\1"),
             # Just remove all dashes
             (re.compile(r"-"), r" "),
-            # Fix an issue related to [vocalized-noise]
-            (re.compile(r"\[vocalized noise\]"), r"\[vocalized-noise\]"),
         ]

         # unwanted symbols in the transcripts
         self.remove_regexp_after = re.compile(
-            r"|".join([
+            r"|".join(
+                [
                     # remaining punctuation
                     r"\.",
                     r",",
@@ -135,8 +175,14 @@ class FisherSwbdNormalizer:
                     r"}",
                     r"~",
                     r"_\d",
-            ])
+                ]
+            )
         )

+        self.post_fixes = [
+            # Fix an issue related to [VOCALIZED NOISE] after dash removal
+            (re.compile(r"\[vocalized noise\]"), "[vocalized-noise]"),
+        ]
+
         self.whitespace_regexp = re.compile(r"\s+")
@@ -153,11 +199,14 @@ class FisherSwbdNormalizer:
         # then remove
         text = self.remove_regexp_after.sub("", text)

+        # post fixes
+        for pattern, sub in self.post_fixes:
+            text = pattern.sub(sub, text)
+
         # then clean up whitespace
         text = self.whitespace_regexp.sub(" ", text).strip()

         return text.upper()
-# fmt: on


 def keep(sup: SupervisionSegment) -> bool:
@@ -186,7 +235,7 @@ def main():
             skip += 1
             continue

-        sup.text = normalizer.normalize(sup.text)
+        sup.text = normalizer.normalize(sup.text).upper()
         if not sup.text:
             skip += 1
             continue
@@ -219,8 +268,9 @@ def test():
         "[VOCALIZED-NOISE]-",
         "~BULL",
         "Frank E Peretti P E R E T T I",
-        "yeah yeah like Double O Seven hes supposed to do it",
+        "yeah yeah like Double O Seven he's supposed to do it",
         "P A P E R paper",
+        "[noise] okay_1 um let me see [laughter] i've been sitting here awhile",
     ]:
         print(text)
         print(normalizer.normalize(text))
@@ -228,5 +278,6 @@ def test():

 if __name__ == "__main__":
-    # test(); exit()
+    test()
+    # exit()
     main()
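The diff above moves the [vocalized noise] repair into a dedicated post_fixes pass because the main substitution list now ends with a global dash removal, which rewrites "[vocalized-noise]" to "[vocalized noise]" before any later pattern in the same list could see it. A trimmed-down, runnable sketch of that ordering (not the recipe's full FisherSwbdNormalizer; only a few patterns are kept, purely for illustration):

import re

# Trimmed-down sketch: just enough patterns to show why the repair
# has to run as a separate pass after the global dash removal.
remove_before = re.compile(r"|".join([r"\[silence\]", r"_1"]))
substitutions = [
    # the global dash removal also turns "[vocalized-noise]"
    # into "[vocalized noise]" ...
    (re.compile(r"-"), r" "),
]
post_fixes = [
    # ... which is why this repair must run afterwards, on its own
    (re.compile(r"\[vocalized noise\]"), "[vocalized-noise]"),
]
whitespace = re.compile(r"\s+")

def normalize(text: str) -> str:
    text = remove_before.sub("", text)
    for pattern, sub in substitutions:
        text = pattern.sub(sub, text)
    for pattern, sub in post_fixes:
        text = pattern.sub(sub, text)
    return whitespace.sub(" ", text).strip().upper()

print(normalize("okay_1 [vocalized-noise] all right"))
# -> OKAY [VOCALIZED-NOISE] ALL RIGHT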

View File

@@ -83,9 +83,7 @@ def replace_silphone(text: str) -> str:
     text = text.replace("[[HIGH PITCHED SQUEAKY VOICE]]", " ")
     text = text.replace("[[IN THE LAUGH]]", "[LAUGHTER]")
     text = text.replace("[[LAST WORD SPOKEN WITH A LAUGH]]", "[LAUGHTER]")
-    text = text.replace(
-        "[[PART OF FIRST SYLLABLE OF PREVIOUS WORD CUT OFF]]", " "
-    )
+    text = text.replace("[[PART OF FIRST SYLLABLE OF PREVIOUS WORD CUT OFF]]", " ")
     text = text.replace("[[PREVIOUS WORD SPOKEN WITH A LAUGH]]", " ")
     text = text.replace("[[PREVIOUS TWO WORDS SPOKEN WHILE LAUGHING]]", " ")
     text = text.replace("[[PROLONGED]]", " ")
@@ -181,6 +179,7 @@ def replace_silphone(text: str) -> str:
     text = text.replace("[LAUGHTER]", " ")
     text = text.replace("[NOISE]", " ")
     text = text.replace("[VOCALIZED-NOISE]", " ")
+    text = text.replace("-", " ")

     return text
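The new global text.replace("-", " ") is appended after the bracketed-tag replacements, so it only ever touches leftover dashes such as word fragments; run earlier, it would break "[VOCALIZED-NOISE]" before its own replacement could match. A minimal sketch of that tail (the real function performs many more replacements first):

# Rough sketch of the tail of replace_silphone: tags are rewritten
# first, then the newly added global dash removal cleans up the rest.
def replace_silphone(text: str) -> str:
    text = text.replace("[VOCALIZED-NOISE]", " ")
    text = text.replace("-", " ")
    return text

print(replace_silphone("[VOCALIZED-NOISE] BECAU- YEAH"))
# -> "  BECAU  YEAH" (extra whitespace is assumed to be squeezed downstream)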

View File

@@ -210,15 +210,15 @@ def main():
     excluded = [
         "<eps>",
-        "!sil",
-        "<spoken_noise>",
+        "!SIL",
+        "<SPOKEN_NOISE>",
         args.oov,
         "#0",
         "<s>",
         "</s>",
-        "[vocalized-noise]",
-        "[noise]",
-        "[laughter]",
+        "[VOCALIZED-NOISE]",
+        "[NOISE]",
+        "[LAUGHTER]",
     ]

     for w in excluded:
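Since the normalized transcripts are now uppercase, the exclusion list must use the uppercase token forms or the special symbols would survive into the vocabulary. A minimal sketch of the filtering step; `words` is a made-up stand-in for the vocabulary read earlier in main(), and args.oov is elided:

# Entries are uppercase to match the uppercased transcripts.
excluded = [
    "<eps>", "!SIL", "<SPOKEN_NOISE>", "#0", "<s>", "</s>",
    "[VOCALIZED-NOISE]", "[NOISE]", "[LAUGHTER]",
]
words = ["<eps>", "[NOISE]", "HELLO", "WORLD"]  # made-up sample
words = [w for w in words if w not in excluded]
print(words)  # -> ['HELLO', 'WORLD']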

View File

@@ -46,11 +46,11 @@ cp local/MSU_single_letter.txt $dir/
 # The original swbd lexicon does not have precise single letter lexicion
 # e.g. it does not have entry of W
 (
-  echo '!sil sil'
-  echo '[vocalized-noise] spn'
-  echo '[noise] nsn'
-  echo '[laughter] lau'
-  echo '<unk> spn'
+  echo '!SIL SIL'
+  echo '[VOCALIZED-NOISE] spn'
+  echo '[NOISE] nsn'
+  echo '[LAUGHTER] lau'
+  echo '<UNK> spn'
 ) |
   cat - $dir/lexicon1.txt $dir/MSU_single_letter.txt >$dir/lexicon2.txt || exit 1
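The special lexicon entries are uppercased here to agree with the normalized transcripts. As a cross-check of what this pipeline produces, a rough Python equivalent of the shell block above (same file names, assuming lexicon1.txt and MSU_single_letter.txt already exist in the working directory):

# Prepend the special entries, then append the two lexicon files,
# mirroring: (echo ...) | cat - lexicon1.txt MSU_single_letter.txt
special_entries = [
    "!SIL SIL",
    "[VOCALIZED-NOISE] spn",
    "[NOISE] nsn",
    "[LAUGHTER] lau",
    "<UNK> spn",
]
with open("lexicon2.txt", "w") as out:
    out.write("\n".join(special_entries) + "\n")
    for name in ("lexicon1.txt", "MSU_single_letter.txt"):
        with open(name) as f:
            out.write(f.read())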

View File

@@ -43,9 +43,9 @@ fisher_dir="/export/corpora3/LDC/LDC2004T19"
 # It will generate data/lang_bpe_xxx,
 # data/lang_bpe_yyy if the array contains xxx, yyy
 vocab_sizes=(
-  5000
-  2000
-  1000
+  # 5000
+  # 2000
+  # 1000
   500
 )
@@ -73,6 +73,7 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
   ./local/normalize_and_filter_supervisions.py \
     data/manifests/swbd/swbd_supervisions_all.jsonl.gz \
     data/manifests/swbd/swbd_supervisions_all_norm.jsonl.gz
+  mv data/manifests/swbd/swbd_supervisions_all.jsonl.gz data/manifests/swbd/swbd_supervisions_orig.jsonl.gz
   mv data/manifests/swbd/swbd_supervisions_all_norm.jsonl.gz data/manifests/swbd/swbd_supervisions_all.jsonl.gz

   lhotse prepare eval2000 --absolute-paths 1 $eval2000_dir data/manifests/eval2000
@@ -149,8 +150,8 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
       | jq '.text' | sed 's/"//g' > $lang_dir/text
   fi

-  log "prepare dict"
-  ./local/swbd1_prepare_dict.sh $swbd1_dir
+  log "Prepare dict"
+  ./local/swbd1_prepare_dict.sh
   cut -f 2- -d" " $lang_dir/text >${lang_dir}/input.txt
   # [noise] nsn
   # !sil sil
@@ -336,6 +337,10 @@ if [ $stage -le 11 ] && [ $stop_stage -ge 11 ]; then
     out_dir=data/lm_training_bpe_${vocab_size}
     mkdir -p $out_dir

+    if [ ! -f $out_dir/train.txt ]; then
+      tail -n 250000 data/lang_phone/input.txt > $out_dir/train.txt
+    fi
+
     ./local/prepare_lm_training_data.py \
       --bpe-model $lang_dir/bpe.model \
       --lm-data data/lang_phone/input.txt \
@@ -343,29 +348,29 @@ if [ $stage -le 11 ] && [ $stop_stage -ge 11 ]; then
   done
 fi

-# if [ $stage -le 12 ] && [ $stop_stage -ge 12 ]; then
-#   log "Stage 12: Generate LM validation data"
-#   for vocab_size in ${vocab_sizes[@]}; do
-#     log "Processing vocab_size == ${vocab_size}"
-#     out_dir=data/lm_training_bpe_${vocab_size}
-#     mkdir -p $out_dir
-#     if [ ! -f $out_dir/valid.txt ]; then
-#       TODO: generate valid.txt
-#     fi
-#     lang_dir=data/lang_bpe_${vocab_size}
-#     ./local/prepare_lm_training_data.py \
-#       --bpe-model $lang_dir/bpe.model \
-#       --lm-data $out_dir/valid.txt \
-#       --lm-archive $out_dir/lm_data-valid.pt
-#   done
-# fi
+if [ $stage -le 12 ] && [ $stop_stage -ge 12 ]; then
+  log "Stage 12: Generate LM validation data"
+  for vocab_size in ${vocab_sizes[@]}; do
+    log "Processing vocab_size == ${vocab_size}"
+    out_dir=data/lm_training_bpe_${vocab_size}
+    mkdir -p $out_dir
+    if [ ! -f $out_dir/valid.txt ]; then
+      head -n 14332 data/lang_phone/input.txt > $out_dir/valid.txt
+    fi
+    lang_dir=data/lang_bpe_${vocab_size}
+    ./local/prepare_lm_training_data.py \
+      --bpe-model $lang_dir/bpe.model \
+      --lm-data $out_dir/valid.txt \
+      --lm-archive $out_dir/lm_data-valid.pt
+  done
+fi

 if [ $stage -le 13 ] && [ $stop_stage -ge 13 ]; then
   log "Stage 13: Generate LM test data"
-  testsets=(eval2000 rt03)
+  testsets=(eval2000)

   for testset in ${testsets[@]}; do
     for vocab_size in ${vocab_sizes[@]}; do
@@ -373,8 +378,9 @@ if [ $stage -le 13 ] && [ $stop_stage -ge 13 ]; then
       out_dir=data/lm_training_bpe_${vocab_size}
       mkdir -p $out_dir

-      if [ ! -f $out_dir/test.txt ]; then
-        cat data/local/${testset}/text | cut -d " " -f 2- >$out_dir/${testset}.txt
+      if [ ! -f $out_dir/${testset}.txt ]; then
+        gunzip -c data/manifests/${testset}/eval2000_supervisions_all.jsonl.gz \
+          | jq '.text' | sed 's/"//g' > $out_dir/${testset}.txt
       fi

       lang_dir=data/lang_bpe_${vocab_size}
@@ -388,7 +394,7 @@ fi
 if [ $stage -le 14 ] && [ $stop_stage -ge 14 ]; then
   log "Stage 14: Sort LM training data"
-  testsets=(eval2000 rt03)
+  testsets=(eval2000)

   # Sort LM training data by sentence length in descending order
   # for ease of training.
   #
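Stages 12 and 13 now carve the LM validation set out of data/lang_phone/input.txt with head/tail and pull test text straight from the eval2000 manifest instead of a Kaldi-style data/local file. A hedged Python rendering of the new stage 13 extraction, assuming the lhotse supervisions manifest is JSON lines with a "text" field (which is what the gunzip/jq/sed pipeline relies on); the vocab-size-500 output path is just an example:

import gzip
import json

# Python rendering of: gunzip -c ... | jq '.text' | sed 's/"//g'
src = "data/manifests/eval2000/eval2000_supervisions_all.jsonl.gz"
dst = "data/lm_training_bpe_500/eval2000.txt"  # example vocab size

with gzip.open(src, "rt") as f, open(dst, "w") as out:
    for line in f:
        out.write(json.loads(line)["text"] + "\n")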