mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-09-09 00:54:18 +00:00
minor updates
This commit is contained in:
parent
58d9088010
commit
ab07e58613
@ -53,7 +53,6 @@ def insert_groups_and_capitalize_1s(match):
|
|||||||
return f"{match.group(1)}".upper() + "'s"
|
return f"{match.group(1)}".upper() + "'s"
|
||||||
|
|
||||||
|
|
||||||
# fmt: off
|
|
||||||
class FisherSwbdNormalizer:
|
class FisherSwbdNormalizer:
|
||||||
"""Note: the functions "normalize" and "keep" implement the logic
|
"""Note: the functions "normalize" and "keep" implement the logic
|
||||||
similar to Kaldi's data prep scripts for Fisher and SWBD: One
|
similar to Kaldi's data prep scripts for Fisher and SWBD: One
|
||||||
@ -62,18 +61,21 @@ class FisherSwbdNormalizer:
|
|||||||
normalization from Kaldi (hopefully won't make too much
|
normalization from Kaldi (hopefully won't make too much
|
||||||
difference).
|
difference).
|
||||||
"""
|
"""
|
||||||
def __init__(self) -> None:
|
|
||||||
|
|
||||||
|
def __init__(self) -> None:
|
||||||
self.remove_regexp_before = re.compile(
|
self.remove_regexp_before = re.compile(
|
||||||
r"|".join([
|
r"|".join(
|
||||||
# special symbols
|
[
|
||||||
r"\[\[skip.*\]\]",
|
# special symbols
|
||||||
r"\[skip.*\]",
|
r"\[\[skip.*\]\]",
|
||||||
r"\[pause.*\]",
|
r"\[skip.*\]",
|
||||||
r"\[silence\]",
|
r"\[pause.*\]",
|
||||||
r"<b_aside>",
|
r"\[silence\]",
|
||||||
r"<e_aside>",
|
r"<b_aside>",
|
||||||
])
|
r"<e_aside>",
|
||||||
|
r"_1",
|
||||||
|
]
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
# tuples of (pattern, replacement)
|
# tuples of (pattern, replacement)
|
||||||
@ -102,14 +104,54 @@ class FisherSwbdNormalizer:
|
|||||||
(re.compile(r"\[lipsmack.*?\]"), r"[lipsmack]"),
|
(re.compile(r"\[lipsmack.*?\]"), r"[lipsmack]"),
|
||||||
(re.compile(r"\[sneeze.*?\]"), r"[sneeze]"),
|
(re.compile(r"\[sneeze.*?\]"), r"[sneeze]"),
|
||||||
# abbreviations
|
# abbreviations
|
||||||
(re.compile(r"(\w)\.(\w)\.(\w)",), insert_groups_and_capitalize_3),
|
(
|
||||||
(re.compile(r"(\w)\.(\w)",), insert_groups_and_capitalize_2),
|
re.compile(
|
||||||
(re.compile(r"([a-h,j-z])\.",), insert_groups_and_capitalize_1),
|
r"(\w)\.(\w)\.(\w)",
|
||||||
(re.compile(r"\._",), r" "),
|
),
|
||||||
(re.compile(r"_(\w)",), insert_groups_and_capitalize_1),
|
insert_groups_and_capitalize_3,
|
||||||
(re.compile(r"(\w)\.s",), insert_groups_and_capitalize_1s),
|
),
|
||||||
(re.compile(r"([A-Z])\'s",), insert_groups_and_capitalize_1s),
|
(
|
||||||
(re.compile(r"(\s\w\b|^\w\b)",), insert_groups_and_capitalize_1),
|
re.compile(
|
||||||
|
r"(\w)\.(\w)",
|
||||||
|
),
|
||||||
|
insert_groups_and_capitalize_2,
|
||||||
|
),
|
||||||
|
(
|
||||||
|
re.compile(
|
||||||
|
r"([a-h,j-z])\.",
|
||||||
|
),
|
||||||
|
insert_groups_and_capitalize_1,
|
||||||
|
),
|
||||||
|
(
|
||||||
|
re.compile(
|
||||||
|
r"\._",
|
||||||
|
),
|
||||||
|
r" ",
|
||||||
|
),
|
||||||
|
(
|
||||||
|
re.compile(
|
||||||
|
r"_(\w)",
|
||||||
|
),
|
||||||
|
insert_groups_and_capitalize_1,
|
||||||
|
),
|
||||||
|
(
|
||||||
|
re.compile(
|
||||||
|
r"(\w)\.s",
|
||||||
|
),
|
||||||
|
insert_groups_and_capitalize_1s,
|
||||||
|
),
|
||||||
|
(
|
||||||
|
re.compile(
|
||||||
|
r"([A-Z])\'s",
|
||||||
|
),
|
||||||
|
insert_groups_and_capitalize_1s,
|
||||||
|
),
|
||||||
|
(
|
||||||
|
re.compile(
|
||||||
|
r"(\s\w\b|^\w\b)",
|
||||||
|
),
|
||||||
|
insert_groups_and_capitalize_1,
|
||||||
|
),
|
||||||
# words between apostrophes
|
# words between apostrophes
|
||||||
(re.compile(r"'(\S*?)'"), r"\1"),
|
(re.compile(r"'(\S*?)'"), r"\1"),
|
||||||
# dangling dashes (2 passes)
|
# dangling dashes (2 passes)
|
||||||
@ -119,25 +161,29 @@ class FisherSwbdNormalizer:
|
|||||||
(re.compile(r"(\[.*?\])-"), r"\1"),
|
(re.compile(r"(\[.*?\])-"), r"\1"),
|
||||||
# Just remove all dashes
|
# Just remove all dashes
|
||||||
(re.compile(r"-"), r" "),
|
(re.compile(r"-"), r" "),
|
||||||
|
|
||||||
# Fix an issue related to [vocalized-noise]
|
|
||||||
(re.compile(r"\[vocalized noise\]"), r"\[vocalized-noise\]"),
|
|
||||||
]
|
]
|
||||||
|
|
||||||
# unwanted symbols in the transcripts
|
# unwanted symbols in the transcripts
|
||||||
self.remove_regexp_after = re.compile(
|
self.remove_regexp_after = re.compile(
|
||||||
r"|".join([
|
r"|".join(
|
||||||
# remaining punctuation
|
[
|
||||||
r"\.",
|
# remaining punctuation
|
||||||
r",",
|
r"\.",
|
||||||
r"\?",
|
r",",
|
||||||
r"{",
|
r"\?",
|
||||||
r"}",
|
r"{",
|
||||||
r"~",
|
r"}",
|
||||||
r"_\d",
|
r"~",
|
||||||
])
|
r"_\d",
|
||||||
|
]
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
self.post_fixes = [
|
||||||
|
# Fix an issue related to [VOCALIZED NOISE] after dash removal
|
||||||
|
(re.compile(r"\[vocalized noise\]"), "[vocalized-noise]"),
|
||||||
|
]
|
||||||
|
|
||||||
self.whitespace_regexp = re.compile(r"\s+")
|
self.whitespace_regexp = re.compile(r"\s+")
|
||||||
|
|
||||||
def normalize(self, text: str) -> str:
|
def normalize(self, text: str) -> str:
|
||||||
@ -153,11 +199,14 @@ class FisherSwbdNormalizer:
|
|||||||
# then remove
|
# then remove
|
||||||
text = self.remove_regexp_after.sub("", text)
|
text = self.remove_regexp_after.sub("", text)
|
||||||
|
|
||||||
|
# post fixes
|
||||||
|
for pattern, sub in self.post_fixes:
|
||||||
|
text = pattern.sub(sub, text)
|
||||||
|
|
||||||
# then clean up whitespace
|
# then clean up whitespace
|
||||||
text = self.whitespace_regexp.sub(" ", text).strip()
|
text = self.whitespace_regexp.sub(" ", text).strip()
|
||||||
|
|
||||||
return text.upper()
|
return text.upper()
|
||||||
# fmt: on
|
|
||||||
|
|
||||||
|
|
||||||
def keep(sup: SupervisionSegment) -> bool:
|
def keep(sup: SupervisionSegment) -> bool:
|
||||||
@ -186,7 +235,7 @@ def main():
|
|||||||
skip += 1
|
skip += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
sup.text = normalizer.normalize(sup.text)
|
sup.text = normalizer.normalize(sup.text).upper()
|
||||||
if not sup.text:
|
if not sup.text:
|
||||||
skip += 1
|
skip += 1
|
||||||
continue
|
continue
|
||||||
@ -219,8 +268,9 @@ def test():
|
|||||||
"[VOCALIZED-NOISE]-",
|
"[VOCALIZED-NOISE]-",
|
||||||
"~BULL",
|
"~BULL",
|
||||||
"Frank E Peretti P E R E T T I",
|
"Frank E Peretti P E R E T T I",
|
||||||
"yeah yeah like Double O Seven he’s supposed to do it",
|
"yeah yeah like Double O Seven he's supposed to do it",
|
||||||
"P A P E R paper",
|
"P A P E R paper",
|
||||||
|
"[noise] okay_1 um let me see [laughter] i've been sitting here awhile",
|
||||||
]:
|
]:
|
||||||
print(text)
|
print(text)
|
||||||
print(normalizer.normalize(text))
|
print(normalizer.normalize(text))
|
||||||
@ -228,5 +278,6 @@ def test():
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
# test(); exit()
|
test()
|
||||||
|
# exit()
|
||||||
main()
|
main()
|
||||||
|
@ -83,9 +83,7 @@ def replace_silphone(text: str) -> str:
|
|||||||
text = text.replace("[[HIGH PITCHED SQUEAKY VOICE]]", " ")
|
text = text.replace("[[HIGH PITCHED SQUEAKY VOICE]]", " ")
|
||||||
text = text.replace("[[IN THE LAUGH]]", "[LAUGHTER]")
|
text = text.replace("[[IN THE LAUGH]]", "[LAUGHTER]")
|
||||||
text = text.replace("[[LAST WORD SPOKEN WITH A LAUGH]]", "[LAUGHTER]")
|
text = text.replace("[[LAST WORD SPOKEN WITH A LAUGH]]", "[LAUGHTER]")
|
||||||
text = text.replace(
|
text = text.replace("[[PART OF FIRST SYLLABLE OF PREVIOUS WORD CUT OFF]]", " ")
|
||||||
"[[PART OF FIRST SYLLABLE OF PREVIOUS WORD CUT OFF]]", " "
|
|
||||||
)
|
|
||||||
text = text.replace("[[PREVIOUS WORD SPOKEN WITH A LAUGH]]", " ")
|
text = text.replace("[[PREVIOUS WORD SPOKEN WITH A LAUGH]]", " ")
|
||||||
text = text.replace("[[PREVIOUS TWO WORDS SPOKEN WHILE LAUGHING]]", " ")
|
text = text.replace("[[PREVIOUS TWO WORDS SPOKEN WHILE LAUGHING]]", " ")
|
||||||
text = text.replace("[[PROLONGED]]", " ")
|
text = text.replace("[[PROLONGED]]", " ")
|
||||||
@ -181,6 +179,7 @@ def replace_silphone(text: str) -> str:
|
|||||||
text = text.replace("[LAUGHTER]", " ")
|
text = text.replace("[LAUGHTER]", " ")
|
||||||
text = text.replace("[NOISE]", " ")
|
text = text.replace("[NOISE]", " ")
|
||||||
text = text.replace("[VOCALIZED-NOISE]", " ")
|
text = text.replace("[VOCALIZED-NOISE]", " ")
|
||||||
|
text = text.replace("-", " ")
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
|
||||||
@ -231,4 +230,4 @@ def main():
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
||||||
|
@ -210,15 +210,15 @@ def main():
|
|||||||
|
|
||||||
excluded = [
|
excluded = [
|
||||||
"<eps>",
|
"<eps>",
|
||||||
"!sil",
|
"!SIL",
|
||||||
"<spoken_noise>",
|
"<SPOKEN_NOISE>",
|
||||||
args.oov,
|
args.oov,
|
||||||
"#0",
|
"#0",
|
||||||
"<s>",
|
"<s>",
|
||||||
"</s>",
|
"</s>",
|
||||||
"[vocalized-noise]",
|
"[VOCALIZED-NOISE]",
|
||||||
"[noise]",
|
"[NOISE]",
|
||||||
"[laughter]",
|
"[LAUGHTER]",
|
||||||
]
|
]
|
||||||
|
|
||||||
for w in excluded:
|
for w in excluded:
|
||||||
|
@ -46,11 +46,11 @@ cp local/MSU_single_letter.txt $dir/
|
|||||||
# The original swbd lexicon does not have precise single letter lexicion
|
# The original swbd lexicon does not have precise single letter lexicion
|
||||||
# e.g. it does not have entry of W
|
# e.g. it does not have entry of W
|
||||||
(
|
(
|
||||||
echo '!sil sil'
|
echo '!SIL SIL'
|
||||||
echo '[vocalized-noise] spn'
|
echo '[VOCALIZED-NOISE] spn'
|
||||||
echo '[noise] nsn'
|
echo '[NOISE] nsn'
|
||||||
echo '[laughter] lau'
|
echo '[LAUGHTER] lau'
|
||||||
echo '<unk> spn'
|
echo '<UNK> spn'
|
||||||
) |
|
) |
|
||||||
cat - $dir/lexicon1.txt $dir/MSU_single_letter.txt >$dir/lexicon2.txt || exit 1
|
cat - $dir/lexicon1.txt $dir/MSU_single_letter.txt >$dir/lexicon2.txt || exit 1
|
||||||
|
|
||||||
|
@ -43,9 +43,9 @@ fisher_dir="/export/corpora3/LDC/LDC2004T19"
|
|||||||
# It will generate data/lang_bpe_xxx,
|
# It will generate data/lang_bpe_xxx,
|
||||||
# data/lang_bpe_yyy if the array contains xxx, yyy
|
# data/lang_bpe_yyy if the array contains xxx, yyy
|
||||||
vocab_sizes=(
|
vocab_sizes=(
|
||||||
5000
|
# 5000
|
||||||
2000
|
# 2000
|
||||||
1000
|
# 1000
|
||||||
500
|
500
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -73,6 +73,7 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
|
|||||||
./local/normalize_and_filter_supervisions.py \
|
./local/normalize_and_filter_supervisions.py \
|
||||||
data/manifests/swbd/swbd_supervisions_all.jsonl.gz \
|
data/manifests/swbd/swbd_supervisions_all.jsonl.gz \
|
||||||
data/manifests/swbd/swbd_supervisions_all_norm.jsonl.gz
|
data/manifests/swbd/swbd_supervisions_all_norm.jsonl.gz
|
||||||
|
mv data/manifests/swbd/swbd_supervisions_all.jsonl.gz data/manifests/swbd/swbd_supervisions_orig.jsonl.gz
|
||||||
mv data/manifests/swbd/swbd_supervisions_all_norm.jsonl.gz data/manifests/swbd/swbd_supervisions_all.jsonl.gz
|
mv data/manifests/swbd/swbd_supervisions_all_norm.jsonl.gz data/manifests/swbd/swbd_supervisions_all.jsonl.gz
|
||||||
|
|
||||||
lhotse prepare eval2000 --absolute-paths 1 $eval2000_dir data/manifests/eval2000
|
lhotse prepare eval2000 --absolute-paths 1 $eval2000_dir data/manifests/eval2000
|
||||||
@ -149,8 +150,8 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
|
|||||||
| jq '.text' | sed 's/"//g' > $lang_dir/text
|
| jq '.text' | sed 's/"//g' > $lang_dir/text
|
||||||
fi
|
fi
|
||||||
|
|
||||||
log "prepare dict"
|
log "Prepare dict"
|
||||||
./local/swbd1_prepare_dict.sh $swbd1_dir
|
./local/swbd1_prepare_dict.sh
|
||||||
cut -f 2- -d" " $lang_dir/text >${lang_dir}/input.txt
|
cut -f 2- -d" " $lang_dir/text >${lang_dir}/input.txt
|
||||||
# [noise] nsn
|
# [noise] nsn
|
||||||
# !sil sil
|
# !sil sil
|
||||||
@ -336,6 +337,10 @@ if [ $stage -le 11 ] && [ $stop_stage -ge 11 ]; then
|
|||||||
out_dir=data/lm_training_bpe_${vocab_size}
|
out_dir=data/lm_training_bpe_${vocab_size}
|
||||||
mkdir -p $out_dir
|
mkdir -p $out_dir
|
||||||
|
|
||||||
|
if [ ! -f $out_dir/train.txt ]; then
|
||||||
|
tail -n 250000 data/lang_phone/input.txt > $out_dir/train.txt
|
||||||
|
fi
|
||||||
|
|
||||||
./local/prepare_lm_training_data.py \
|
./local/prepare_lm_training_data.py \
|
||||||
--bpe-model $lang_dir/bpe.model \
|
--bpe-model $lang_dir/bpe.model \
|
||||||
--lm-data data/lang_phone/input.txt \
|
--lm-data data/lang_phone/input.txt \
|
||||||
@ -343,29 +348,29 @@ if [ $stage -le 11 ] && [ $stop_stage -ge 11 ]; then
|
|||||||
done
|
done
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# if [ $stage -le 12 ] && [ $stop_stage -ge 12 ]; then
|
if [ $stage -le 12 ] && [ $stop_stage -ge 12 ]; then
|
||||||
# log "Stage 12: Generate LM validation data"
|
log "Stage 12: Generate LM validation data"
|
||||||
|
|
||||||
# for vocab_size in ${vocab_sizes[@]}; do
|
for vocab_size in ${vocab_sizes[@]}; do
|
||||||
# log "Processing vocab_size == ${vocab_size}"
|
log "Processing vocab_size == ${vocab_size}"
|
||||||
# out_dir=data/lm_training_bpe_${vocab_size}
|
out_dir=data/lm_training_bpe_${vocab_size}
|
||||||
# mkdir -p $out_dir
|
mkdir -p $out_dir
|
||||||
|
|
||||||
# if [ ! -f $out_dir/valid.txt ]; then
|
if [ ! -f $out_dir/valid.txt ]; then
|
||||||
# TODO: generate valid.txt
|
head -n 14332 data/lang_phone/input.txt > $out_dir/valid.txt
|
||||||
# fi
|
fi
|
||||||
|
|
||||||
# lang_dir=data/lang_bpe_${vocab_size}
|
lang_dir=data/lang_bpe_${vocab_size}
|
||||||
# ./local/prepare_lm_training_data.py \
|
./local/prepare_lm_training_data.py \
|
||||||
# --bpe-model $lang_dir/bpe.model \
|
--bpe-model $lang_dir/bpe.model \
|
||||||
# --lm-data $out_dir/valid.txt \
|
--lm-data $out_dir/valid.txt \
|
||||||
# --lm-archive $out_dir/lm_data-valid.pt
|
--lm-archive $out_dir/lm_data-valid.pt
|
||||||
# done
|
done
|
||||||
# fi
|
fi
|
||||||
|
|
||||||
if [ $stage -le 13 ] && [ $stop_stage -ge 13 ]; then
|
if [ $stage -le 13 ] && [ $stop_stage -ge 13 ]; then
|
||||||
log "Stage 13: Generate LM test data"
|
log "Stage 13: Generate LM test data"
|
||||||
testsets=(eval2000 rt03)
|
testsets=(eval2000)
|
||||||
|
|
||||||
for testset in ${testsets[@]}; do
|
for testset in ${testsets[@]}; do
|
||||||
for vocab_size in ${vocab_sizes[@]}; do
|
for vocab_size in ${vocab_sizes[@]}; do
|
||||||
@ -373,8 +378,9 @@ if [ $stage -le 13 ] && [ $stop_stage -ge 13 ]; then
|
|||||||
out_dir=data/lm_training_bpe_${vocab_size}
|
out_dir=data/lm_training_bpe_${vocab_size}
|
||||||
mkdir -p $out_dir
|
mkdir -p $out_dir
|
||||||
|
|
||||||
if [ ! -f $out_dir/test.txt ]; then
|
if [ ! -f $out_dir/${testset}.txt ]; then
|
||||||
cat data/local/${testset}/text | cut -d " " -f 2- >$out_dir/${testset}.txt
|
gunzip -c data/manifests/${testset}/eval2000_supervisions_all.jsonl.gz \
|
||||||
|
| jq '.text' | sed 's/"//g' > $out_dir/${testset}.txt
|
||||||
fi
|
fi
|
||||||
|
|
||||||
lang_dir=data/lang_bpe_${vocab_size}
|
lang_dir=data/lang_bpe_${vocab_size}
|
||||||
@ -388,7 +394,7 @@ fi
|
|||||||
|
|
||||||
if [ $stage -le 14 ] && [ $stop_stage -ge 14 ]; then
|
if [ $stage -le 14 ] && [ $stop_stage -ge 14 ]; then
|
||||||
log "Stage 14: Sort LM training data"
|
log "Stage 14: Sort LM training data"
|
||||||
testsets=(eval2000 rt03)
|
testsets=(eval2000)
|
||||||
# Sort LM training data by sentence length in descending order
|
# Sort LM training data by sentence length in descending order
|
||||||
# for ease of training.
|
# for ease of training.
|
||||||
#
|
#
|
||||||
|
Loading…
x
Reference in New Issue
Block a user