https://github.com/k2-fsa/icefall.git
minor updates

commit ab07e58613 (parent 58d9088010)
@@ -53,7 +53,6 @@ def insert_groups_and_capitalize_1s(match):
     return f"{match.group(1)}".upper() + "'s"
 
 
-# fmt: off
 class FisherSwbdNormalizer:
     """Note: the functions "normalize" and "keep" implement the logic
     similar to Kaldi's data prep scripts for Fisher and SWBD: One
@@ -62,18 +61,21 @@ class FisherSwbdNormalizer:
     normalization from Kaldi (hopefully won't make too much
     difference).
     """
-    def __init__(self) -> None:
+
+    def __init__(self) -> None:
         self.remove_regexp_before = re.compile(
-            r"|".join([
-                # special symbols
-                r"\[\[skip.*\]\]",
-                r"\[skip.*\]",
-                r"\[pause.*\]",
-                r"\[silence\]",
-                r"<b_aside>",
-                r"<e_aside>",
-            ])
+            r"|".join(
+                [
+                    # special symbols
+                    r"\[\[skip.*\]\]",
+                    r"\[skip.*\]",
+                    r"\[pause.*\]",
+                    r"\[silence\]",
+                    r"<b_aside>",
+                    r"<e_aside>",
+                    r"_1",
+                ]
+            )
         )
 
         # tuples of (pattern, replacement)
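
Side note for reviewers: the removal pattern above is a single alternation, and the newly added r"_1" alternative strips markers like "okay_1". A minimal standalone sketch (not part of the diff; the sample utterance is made up):

    import re

    # Reconstruction of remove_regexp_before from the hunk above.
    remove_before = re.compile(
        r"|".join(
            [
                r"\[\[skip.*\]\]",
                r"\[skip.*\]",
                r"\[pause.*\]",
                r"\[silence\]",
                r"<b_aside>",
                r"<e_aside>",
                r"_1",
            ]
        )
    )

    print(remove_before.sub("", "okay_1 um [silence] let me see <b_aside>"))
    # -> "okay um  let me see "  (extra spaces are squashed later by whitespace_regexp)
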
@@ -102,14 +104,54 @@ class FisherSwbdNormalizer:
             (re.compile(r"\[lipsmack.*?\]"), r"[lipsmack]"),
             (re.compile(r"\[sneeze.*?\]"), r"[sneeze]"),
             # abbreviations
-            (re.compile(r"(\w)\.(\w)\.(\w)",), insert_groups_and_capitalize_3),
-            (re.compile(r"(\w)\.(\w)",), insert_groups_and_capitalize_2),
-            (re.compile(r"([a-h,j-z])\.",), insert_groups_and_capitalize_1),
-            (re.compile(r"\._",), r" "),
-            (re.compile(r"_(\w)",), insert_groups_and_capitalize_1),
-            (re.compile(r"(\w)\.s",), insert_groups_and_capitalize_1s),
-            (re.compile(r"([A-Z])\'s",), insert_groups_and_capitalize_1s),
-            (re.compile(r"(\s\w\b|^\w\b)",), insert_groups_and_capitalize_1),
+            (
+                re.compile(
+                    r"(\w)\.(\w)\.(\w)",
+                ),
+                insert_groups_and_capitalize_3,
+            ),
+            (
+                re.compile(
+                    r"(\w)\.(\w)",
+                ),
+                insert_groups_and_capitalize_2,
+            ),
+            (
+                re.compile(
+                    r"([a-h,j-z])\.",
+                ),
+                insert_groups_and_capitalize_1,
+            ),
+            (
+                re.compile(
+                    r"\._",
+                ),
+                r" ",
+            ),
+            (
+                re.compile(
+                    r"_(\w)",
+                ),
+                insert_groups_and_capitalize_1,
+            ),
+            (
+                re.compile(
+                    r"(\w)\.s",
+                ),
+                insert_groups_and_capitalize_1s,
+            ),
+            (
+                re.compile(
+                    r"([A-Z])\'s",
+                ),
+                insert_groups_and_capitalize_1s,
+            ),
+            (
+                re.compile(
+                    r"(\s\w\b|^\w\b)",
+                ),
+                insert_groups_and_capitalize_1,
+            ),
             # words between apostrophes
             (re.compile(r"'(\S*?)'"), r"\1"),
             # dangling dashes (2 passes)
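
This view shows the body of only one of the abbreviation helpers (insert_groups_and_capitalize_1s, in the first hunk). The sketch below infers the others from their names and call sites; treat every body except the last one as an assumption, not the repository's exact code:

    import re

    def insert_groups_and_capitalize_1(match: re.Match) -> str:
        # Assumed: uppercase the single captured group.
        return f"{match.group(1)}".upper()

    def insert_groups_and_capitalize_2(match: re.Match) -> str:
        # Assumed: e.g. "u.s" -> "U S"
        return f"{match.group(1)} {match.group(2)}".upper()

    def insert_groups_and_capitalize_3(match: re.Match) -> str:
        # Assumed: e.g. "y.m.c" -> "Y M C"
        return f"{match.group(1)} {match.group(2)} {match.group(3)}".upper()

    def insert_groups_and_capitalize_1s(match: re.Match) -> str:
        # Verbatim from the context line in the first hunk of this diff.
        return f"{match.group(1)}".upper() + "'s"
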
@@ -119,25 +161,29 @@ class FisherSwbdNormalizer:
             (re.compile(r"(\[.*?\])-"), r"\1"),
             # Just remove all dashes
             (re.compile(r"-"), r" "),
-
-            # Fix an issue related to [vocalized-noise]
-            (re.compile(r"\[vocalized noise\]"), r"\[vocalized-noise\]"),
         ]
 
         # unwanted symbols in the transcripts
         self.remove_regexp_after = re.compile(
-            r"|".join([
-                # remaining punctuation
-                r"\.",
-                r",",
-                r"\?",
-                r"{",
-                r"}",
-                r"~",
-                r"_\d",
-            ])
+            r"|".join(
+                [
+                    # remaining punctuation
+                    r"\.",
+                    r",",
+                    r"\?",
+                    r"{",
+                    r"}",
+                    r"~",
+                    r"_\d",
+                ]
+            )
         )
 
+        self.post_fixes = [
+            # Fix an issue related to [VOCALIZED NOISE] after dash removal
+            (re.compile(r"\[vocalized noise\]"), "[vocalized-noise]"),
+        ]
+
         self.whitespace_regexp = re.compile(r"\s+")
 
     def normalize(self, text: str) -> str:
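
Why post_fixes is needed at all: the blanket dash removal in the replacement list also rewrites the special token [vocalized-noise], and the post-fix restores it afterwards. A tiny standalone check (the sample text is hypothetical):

    import re

    text = "[vocalized-noise] uh-huh"
    text = re.sub(r"-", " ", text)  # the dash rule also hits the token
    assert text == "[vocalized noise] uh huh"
    text = re.sub(r"\[vocalized noise\]", "[vocalized-noise]", text)
    assert text == "[vocalized-noise] uh huh"
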
@@ -153,11 +199,14 @@ class FisherSwbdNormalizer:
         # then remove
         text = self.remove_regexp_after.sub("", text)
 
+        # post fixes
+        for pattern, sub in self.post_fixes:
+            text = pattern.sub(sub, text)
+
         # then clean up whitespace
         text = self.whitespace_regexp.sub(" ", text).strip()
 
         return text.upper()
-        # fmt: on
 
 
 def keep(sup: SupervisionSegment) -> bool:
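
Putting the hunks together, normalize() now applies its stages in a fixed order. A condensed sketch of the resulting flow; the attribute name replace_regexps for the pattern/replacement tuples is an assumption, since the diff only shows the list's contents:

    def normalize(self, text: str) -> str:
        # first remove
        text = self.remove_regexp_before.sub("", text)
        # then replace
        for pattern, sub in self.replace_regexps:  # attribute name assumed
            text = pattern.sub(sub, text)
        # then remove
        text = self.remove_regexp_after.sub("", text)
        # post fixes
        for pattern, sub in self.post_fixes:
            text = pattern.sub(sub, text)
        # then clean up whitespace
        text = self.whitespace_regexp.sub(" ", text).strip()
        return text.upper()
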
@@ -186,7 +235,7 @@ def main():
             skip += 1
             continue
 
-        sup.text = normalizer.normalize(sup.text)
+        sup.text = normalizer.normalize(sup.text).upper()
         if not sup.text:
             skip += 1
             continue
@@ -219,8 +268,9 @@ def test():
         "[VOCALIZED-NOISE]-",
         "~BULL",
         "Frank E Peretti P E R E T T I",
-        "yeah yeah like Double O Seven he’s supposed to do it",
+        "yeah yeah like Double O Seven he's supposed to do it",
         "P A P E R paper",
+        "[noise] okay_1 um let me see [laughter] i've been sitting here awhile",
     ]:
         print(text)
         print(normalizer.normalize(text))
@@ -228,5 +278,6 @@ def test():
 
 
 if __name__ == "__main__":
-    # test(); exit()
+    test()
+    # exit()
     main()
@@ -83,9 +83,7 @@ def replace_silphone(text: str) -> str:
     text = text.replace("[[HIGH PITCHED SQUEAKY VOICE]]", " ")
     text = text.replace("[[IN THE LAUGH]]", "[LAUGHTER]")
     text = text.replace("[[LAST WORD SPOKEN WITH A LAUGH]]", "[LAUGHTER]")
-    text = text.replace(
-        "[[PART OF FIRST SYLLABLE OF PREVIOUS WORD CUT OFF]]", " "
-    )
+    text = text.replace("[[PART OF FIRST SYLLABLE OF PREVIOUS WORD CUT OFF]]", " ")
     text = text.replace("[[PREVIOUS WORD SPOKEN WITH A LAUGH]]", " ")
     text = text.replace("[[PREVIOUS TWO WORDS SPOKEN WHILE LAUGHING]]", " ")
     text = text.replace("[[PROLONGED]]", " ")
@@ -181,6 +179,7 @@ def replace_silphone(text: str) -> str:
     text = text.replace("[LAUGHTER]", " ")
     text = text.replace("[NOISE]", " ")
     text = text.replace("[VOCALIZED-NOISE]", " ")
+    text = text.replace("-", " ")
     return text
 
 
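
Note that the new dash replacement has to come after the bracketed-token replacements: replacing "-" first would break matching of "[VOCALIZED-NOISE]". A runnable abbreviation covering only the lines from the hunk above:

    def replace_silphone(text: str) -> str:
        # Abbreviated sketch; the full function has many more replacements.
        text = text.replace("[LAUGHTER]", " ")
        text = text.replace("[NOISE]", " ")
        text = text.replace("[VOCALIZED-NOISE]", " ")
        text = text.replace("-", " ")
        return text

    print(replace_silphone("[VOCALIZED-NOISE] UH-HUH"))  # -> "  UH HUH"
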
@@ -231,4 +230,4 @@ def main():
 
 
 if __name__ == "__main__":
-    main()
+    main()
@@ -210,15 +210,15 @@ def main():
 
     excluded = [
         "<eps>",
-        "!sil",
-        "<spoken_noise>",
+        "!SIL",
+        "<SPOKEN_NOISE>",
         args.oov,
         "#0",
        "<s>",
         "</s>",
-        "[vocalized-noise]",
-        "[noise]",
-        "[laughter]",
+        "[VOCALIZED-NOISE]",
+        "[NOISE]",
+        "[LAUGHTER]",
     ]
 
     for w in excluded:
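
For context, excluded lists like this are typically used to drop special symbols before building the word table. The loop body is not shown in this view, so the usage below is purely illustrative:

    excluded = [
        "<eps>",
        "!SIL",
        "<SPOKEN_NOISE>",
        "<unk>",  # stands in for args.oov here
        "#0",
        "<s>",
        "</s>",
        "[VOCALIZED-NOISE]",
        "[NOISE]",
        "[LAUGHTER]",
    ]

    words = {"<eps>": 0, "HELLO": 1, "[NOISE]": 2, "WORLD": 3}
    for w in excluded:
        words.pop(w, None)  # illustrative; the real loop body is not in this view
    print(sorted(words))  # -> ['HELLO', 'WORLD']
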
@@ -46,11 +46,11 @@ cp local/MSU_single_letter.txt $dir/
 # The original swbd lexicon does not have precise single letter lexicion
 # e.g. it does not have entry of W
 (
-  echo '!sil sil'
-  echo '[vocalized-noise] spn'
-  echo '[noise] nsn'
-  echo '[laughter] lau'
-  echo '<unk> spn'
+  echo '!SIL SIL'
+  echo '[VOCALIZED-NOISE] spn'
+  echo '[NOISE] nsn'
+  echo '[LAUGHTER] lau'
+  echo '<UNK> spn'
 ) |
   cat - $dir/lexicon1.txt $dir/MSU_single_letter.txt >$dir/lexicon2.txt || exit 1
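
Each echoed line is a Kaldi-style lexicon entry of the form "WORD phone [phone ...]", prepended to lexicon1.txt and the single-letter entries. A small Python sketch of parsing the resulting file (the concrete path depends on what $dir resolves to):

    lexicon = {}
    with open("lexicon2.txt") as f:  # i.e. $dir/lexicon2.txt, path assumed
        for line in f:
            parts = line.split()
            if not parts:
                continue
            word, *phones = parts
            lexicon.setdefault(word, []).append(phones)

    print(lexicon["!SIL"])  # -> [['SIL']]
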
@@ -43,9 +43,9 @@ fisher_dir="/export/corpora3/LDC/LDC2004T19"
 # It will generate data/lang_bpe_xxx,
 # data/lang_bpe_yyy if the array contains xxx, yyy
 vocab_sizes=(
-  5000
-  2000
-  1000
+  # 5000
+  # 2000
+  # 1000
   500
 )
@@ -73,6 +73,7 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
   ./local/normalize_and_filter_supervisions.py \
     data/manifests/swbd/swbd_supervisions_all.jsonl.gz \
     data/manifests/swbd/swbd_supervisions_all_norm.jsonl.gz
+  mv data/manifests/swbd/swbd_supervisions_all.jsonl.gz data/manifests/swbd/swbd_supervisions_orig.jsonl.gz
   mv data/manifests/swbd/swbd_supervisions_all_norm.jsonl.gz data/manifests/swbd/swbd_supervisions_all.jsonl.gz
 
   lhotse prepare eval2000 --absolute-paths 1 $eval2000_dir data/manifests/eval2000
@@ -149,8 +150,8 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
       | jq '.text' | sed 's/"//g' > $lang_dir/text
   fi
 
-  log "prepare dict"
-  ./local/swbd1_prepare_dict.sh $swbd1_dir
+  log "Prepare dict"
+  ./local/swbd1_prepare_dict.sh
   cut -f 2- -d" " $lang_dir/text >${lang_dir}/input.txt
   # [noise] nsn
   # !sil sil
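
Stage 5 pulls the "text" field out of gzipped JSONL supervisions with jq and strips the quotes with sed. A hedged Python equivalent of that one-liner (the manifest path is assumed from Stage 1 above):

    import gzip
    import json

    path = "data/manifests/swbd/swbd_supervisions_all.jsonl.gz"  # path assumed
    with gzip.open(path, "rt") as f:
        texts = [json.loads(line)["text"] for line in f if line.strip()]
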
@@ -336,6 +337,10 @@ if [ $stage -le 11 ] && [ $stop_stage -ge 11 ]; then
     out_dir=data/lm_training_bpe_${vocab_size}
     mkdir -p $out_dir
 
+    if [ ! -f $out_dir/train.txt ]; then
+      tail -n 250000 data/lang_phone/input.txt > $out_dir/train.txt
+    fi
+
     ./local/prepare_lm_training_data.py \
       --bpe-model $lang_dir/bpe.model \
       --lm-data data/lang_phone/input.txt \
@@ -343,29 +348,29 @@ if [ $stage -le 11 ] && [ $stop_stage -ge 11 ]; then
   done
 fi
 
-# if [ $stage -le 12 ] && [ $stop_stage -ge 12 ]; then
-#   log "Stage 12: Generate LM validation data"
+if [ $stage -le 12 ] && [ $stop_stage -ge 12 ]; then
+  log "Stage 12: Generate LM validation data"
 
-#   for vocab_size in ${vocab_sizes[@]}; do
-#     log "Processing vocab_size == ${vocab_size}"
-#     out_dir=data/lm_training_bpe_${vocab_size}
-#     mkdir -p $out_dir
+  for vocab_size in ${vocab_sizes[@]}; do
+    log "Processing vocab_size == ${vocab_size}"
+    out_dir=data/lm_training_bpe_${vocab_size}
+    mkdir -p $out_dir
 
-#   if [ ! -f $out_dir/valid.txt ]; then
-#     TODO: generate valid.txt
-#   fi
+    if [ ! -f $out_dir/valid.txt ]; then
+      head -n 14332 data/lang_phone/input.txt > $out_dir/valid.txt
+    fi
 
-#     lang_dir=data/lang_bpe_${vocab_size}
-#     ./local/prepare_lm_training_data.py \
-#       --bpe-model $lang_dir/bpe.model \
-#       --lm-data $out_dir/valid.txt \
-#       --lm-archive $out_dir/lm_data-valid.pt
-#   done
-# fi
+    lang_dir=data/lang_bpe_${vocab_size}
+    ./local/prepare_lm_training_data.py \
+      --bpe-model $lang_dir/bpe.model \
+      --lm-data $out_dir/valid.txt \
+      --lm-archive $out_dir/lm_data-valid.pt
+  done
+fi
 
 if [ $stage -le 13 ] && [ $stop_stage -ge 13 ]; then
   log "Stage 13: Generate LM test data"
-  testsets=(eval2000 rt03)
+  testsets=(eval2000)
 
   for testset in ${testsets[@]}; do
     for vocab_size in ${vocab_sizes[@]}; do
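
Taken together with Stage 11 above, the now-uncommented Stage 12 splits data/lang_phone/input.txt positionally: the last 250000 lines become train.txt and the first 14332 become valid.txt. A hedged Python rendering of the two shell snippets (note the two slices can overlap if the file has fewer than 264332 lines):

    from pathlib import Path

    lines = Path("data/lang_phone/input.txt").read_text().splitlines(keepends=True)
    out_dir = Path("data/lm_training_bpe_500")  # one vocab_size, for illustration
    out_dir.mkdir(parents=True, exist_ok=True)
    (out_dir / "train.txt").write_text("".join(lines[-250000:]))
    (out_dir / "valid.txt").write_text("".join(lines[:14332]))
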
@@ -373,8 +378,9 @@ if [ $stage -le 13 ] && [ $stop_stage -ge 13 ]; then
       out_dir=data/lm_training_bpe_${vocab_size}
       mkdir -p $out_dir
 
-      if [ ! -f $out_dir/test.txt ]; then
-        cat data/local/${testset}/text | cut -d " " -f 2- >$out_dir/${testset}.txt
+      if [ ! -f $out_dir/${testset}.txt ]; then
+        gunzip -c data/manifests/${testset}/eval2000_supervisions_all.jsonl.gz \
+          | jq '.text' | sed 's/"//g' > $out_dir/${testset}.txt
       fi
 
       lang_dir=data/lang_bpe_${vocab_size}
@@ -388,7 +394,7 @@ fi
 
 if [ $stage -le 14 ] && [ $stop_stage -ge 14 ]; then
   log "Stage 14: Sort LM training data"
-  testsets=(eval2000 rt03)
+  testsets=(eval2000)
   # Sort LM training data by sentence length in descending order
   # for ease of training.
   #