diff --git a/egs/swbd/ASR/local/normalize_and_filter_supervisions.py b/egs/swbd/ASR/local/normalize_and_filter_supervisions.py
index 9970e112a..20ab90caf 100755
--- a/egs/swbd/ASR/local/normalize_and_filter_supervisions.py
+++ b/egs/swbd/ASR/local/normalize_and_filter_supervisions.py
@@ -53,7 +53,6 @@ def insert_groups_and_capitalize_1s(match):
     return f"{match.group(1)}".upper() + "'s"
 
 
-# fmt: off
 class FisherSwbdNormalizer:
     """Note: the functions "normalize" and "keep" implement the logic
     similar to Kaldi's data prep scripts for Fisher and SWBD: One
@@ -62,18 +61,21 @@ class FisherSwbdNormalizer:
     normalization from Kaldi (hopefully won't make too much difference).
     """
 
-    def __init__(self) -> None: 
+    def __init__(self) -> None:
         self.remove_regexp_before = re.compile(
-            r"|".join([
-                # special symbols
-                r"\[\[skip.*\]\]",
-                r"\[skip.*\]",
-                r"\[pause.*\]",
-                r"\[silence\]",
-                r"<b_aside>",
-                r"<e_aside>",
-            ])
+            r"|".join(
+                [
+                    # special symbols
+                    r"\[\[skip.*\]\]",
+                    r"\[skip.*\]",
+                    r"\[pause.*\]",
+                    r"\[silence\]",
+                    r"<b_aside>",
+                    r"<e_aside>",
+                    r"_1",
+                ]
+            )
         )
 
         # tuples of (pattern, replacement)
@@ -102,14 +104,54 @@ class FisherSwbdNormalizer:
             (re.compile(r"\[lipsmack.*?\]"), r"[lipsmack]"),
             (re.compile(r"\[sneeze.*?\]"), r"[sneeze]"),
             # abbreviations
-            (re.compile(r"(\w)\.(\w)\.(\w)",), insert_groups_and_capitalize_3),
-            (re.compile(r"(\w)\.(\w)",), insert_groups_and_capitalize_2),
-            (re.compile(r"([a-h,j-z])\.",), insert_groups_and_capitalize_1),
-            (re.compile(r"\._",), r" "),
-            (re.compile(r"_(\w)",), insert_groups_and_capitalize_1),
-            (re.compile(r"(\w)\.s",), insert_groups_and_capitalize_1s),
-            (re.compile(r"([A-Z])\'s",), insert_groups_and_capitalize_1s),
-            (re.compile(r"(\s\w\b|^\w\b)",), insert_groups_and_capitalize_1),
+            (
+                re.compile(
+                    r"(\w)\.(\w)\.(\w)",
+                ),
+                insert_groups_and_capitalize_3,
+            ),
+            (
+                re.compile(
+                    r"(\w)\.(\w)",
+                ),
+                insert_groups_and_capitalize_2,
+            ),
+            (
+                re.compile(
+                    r"([a-h,j-z])\.",
+                ),
+                insert_groups_and_capitalize_1,
+            ),
+            (
+                re.compile(
+                    r"\._",
+                ),
+                r" ",
+            ),
+            (
+                re.compile(
+                    r"_(\w)",
+                ),
+                insert_groups_and_capitalize_1,
+            ),
+            (
+                re.compile(
+                    r"(\w)\.s",
+                ),
+                insert_groups_and_capitalize_1s,
+            ),
+            (
+                re.compile(
+                    r"([A-Z])\'s",
+                ),
+                insert_groups_and_capitalize_1s,
+            ),
+            (
+                re.compile(
+                    r"(\s\w\b|^\w\b)",
+                ),
+                insert_groups_and_capitalize_1,
+            ),
             # words between apostrophes
             (re.compile(r"'(\S*?)'"), r"\1"),
             # dangling dashes (2 passes)
@@ -119,25 +161,29 @@ class FisherSwbdNormalizer:
             (re.compile(r"(\[.*?\])-"), r"\1"),
             # Just remove all dashes
             (re.compile(r"-"), r" "),
-
-            # Fix an issue related to [vocalized-noise]
-            (re.compile(r"\[vocalized noise\]"), r"\[vocalized-noise\]"),
         ]
 
         # unwanted symbols in the transcripts
         self.remove_regexp_after = re.compile(
-            r"|".join([
-                # remaining punctuation
-                r"\.",
-                r",",
-                r"\?",
-                r"{",
-                r"}",
-                r"~",
-                r"_\d",
-            ])
+            r"|".join(
+                [
+                    # remaining punctuation
+                    r"\.",
+                    r",",
+                    r"\?",
+                    r"{",
+                    r"}",
+                    r"~",
+                    r"_\d",
+                ]
+            )
         )
 
+        self.post_fixes = [
+            # Fix an issue related to [VOCALIZED NOISE] after dash removal
+            (re.compile(r"\[vocalized noise\]"), "[vocalized-noise]"),
+        ]
+
         self.whitespace_regexp = re.compile(r"\s+")
 
     def normalize(self, text: str) -> str:
@@ -153,11 +199,14 @@ class FisherSwbdNormalizer:
 
         # then remove
         text = self.remove_regexp_after.sub("", text)
 
+        # post fixes
+        for pattern, sub in self.post_fixes:
+            text = pattern.sub(sub, text)
+
         # then clean up whitespace
         text = self.whitespace_regexp.sub(" ", text).strip()
 
         return text.upper()
-# fmt: on
 
 def keep(sup: SupervisionSegment) -> bool:
@@ -186,7 +235,7 @@ def main():
             skip += 1
             continue
 
-        sup.text = normalizer.normalize(sup.text)
+        sup.text = normalizer.normalize(sup.text).upper()
 
         if not sup.text:
             skip += 1
             continue
@@ -219,8 +268,9 @@ def test():
         "[VOCALIZED-NOISE]-",
         "~BULL",
         "Frank E Peretti P E R E T T I",
-        "yeah yeah like Double O Seven he’s supposed to do it",
+        "yeah yeah like Double O Seven he's supposed to do it",
         "P A P E R paper",
+        "[noise] okay_1 um let me see [laughter] i've been sitting here awhile",
     ]:
         print(text)
         print(normalizer.normalize(text))
@@ -228,5 +278,6 @@ def test():
 
 
 if __name__ == "__main__":
-    # test(); exit()
+    test()
+    # exit()
     main()
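Note on pass ordering in `FisherSwbdNormalizer`: the blanket dash removal also turns the special token `[VOCALIZED-NOISE]` into `[VOCALIZED NOISE]`, which is why the fix moved into a separate `post_fixes` pass that runs after `remove_regexp_after`. A minimal standalone sketch of that ordering (not the full class; it assumes the input is lowercased first, as `normalize` does):

```python
import re

# Sketch of the pass ordering: dashes are stripped first, which mangles
# [vocalized-noise]; the post-fix pass then restores the hyphen before
# whitespace cleanup and the final upper-casing.
dash_removal = re.compile(r"-")
post_fixes = [
    (re.compile(r"\[vocalized noise\]"), "[vocalized-noise]"),
]
whitespace = re.compile(r"\s+")


def normalize_sketch(text: str) -> str:
    text = text.lower()
    text = dash_removal.sub(" ", text)
    for pattern, sub in post_fixes:
        text = pattern.sub(sub, text)
    return whitespace.sub(" ", text).strip().upper()


assert normalize_sketch("[VOCALIZED-NOISE]- uh-huh") == "[VOCALIZED-NOISE] UH HUH"
```

Running the fix this late means no subsequent pass can split the token again: `remove_regexp_after` has already deleted the leftover punctuation by the time the hyphen is reassembled.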
diff --git a/egs/swbd/ASR/local/normalize_eval2000.py b/egs/swbd/ASR/local/normalize_eval2000.py
index 2e4a3c07f..38451f506 100755
--- a/egs/swbd/ASR/local/normalize_eval2000.py
+++ b/egs/swbd/ASR/local/normalize_eval2000.py
@@ -83,9 +83,7 @@ def replace_silphone(text: str) -> str:
     text = text.replace("[[HIGH PITCHED SQUEAKY VOICE]]", " ")
     text = text.replace("[[IN THE LAUGH]]", "[LAUGHTER]")
     text = text.replace("[[LAST WORD SPOKEN WITH A LAUGH]]", "[LAUGHTER]")
-    text = text.replace(
-        "[[PART OF FIRST SYLLABLE OF PREVIOUS WORD CUT OFF]]", " "
-    )
+    text = text.replace("[[PART OF FIRST SYLLABLE OF PREVIOUS WORD CUT OFF]]", " ")
     text = text.replace("[[PREVIOUS WORD SPOKEN WITH A LAUGH]]", " ")
     text = text.replace("[[PREVIOUS TWO WORDS SPOKEN WHILE LAUGHING]]", " ")
     text = text.replace("[[PROLONGED]]", " ")
@@ -181,6 +179,7 @@ def replace_silphone(text: str) -> str:
     text = text.replace("[LAUGHTER]", " ")
     text = text.replace("[NOISE]", " ")
     text = text.replace("[VOCALIZED-NOISE]", " ")
+    text = text.replace("-", " ")
 
     return text
 
@@ -231,4 +230,4 @@ def main():
 
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
diff --git a/egs/swbd/ASR/local/prepare_lang_bpe.py b/egs/swbd/ASR/local/prepare_lang_bpe.py
index db5e42b05..0fd937bc9 100755
--- a/egs/swbd/ASR/local/prepare_lang_bpe.py
+++ b/egs/swbd/ASR/local/prepare_lang_bpe.py
@@ -210,15 +210,15 @@ def main():
 
     excluded = [
         "<eps>",
-        "!sil",
-        "<spoken_noise>",
+        "!SIL",
+        "<SPOKEN_NOISE>",
         args.oov,
         "#0",
         "<s>",
         "</s>",
-        "[vocalized-noise]",
-        "[noise]",
-        "[laughter]",
+        "[VOCALIZED-NOISE]",
+        "[NOISE]",
+        "[LAUGHTER]",
     ]
 
     for w in excluded:
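The casing change in `prepare_lang_bpe.py` matters because the membership test (`for w in excluded: ...`) is case-sensitive: once the lexicon moved to uppercase entries, the old lowercase strings would silently stop filtering anything. A quick illustration, with a made-up word list standing in for the real words.txt:

```python
# Made-up word list for illustration; the real one comes from words.txt.
words = ["<eps>", "!SIL", "[NOISE]", "[LAUGHTER]", "A", "AARON"]

excluded_old = ["[noise]", "[laughter]"]  # lowercase: filters nothing
excluded_new = ["[NOISE]", "[LAUGHTER]"]  # matches the uppercase lexicon

print([w for w in words if w not in excluded_old])
# ['<eps>', '!SIL', '[NOISE]', '[LAUGHTER]', 'A', 'AARON']
print([w for w in words if w not in excluded_new])
# ['<eps>', '!SIL', 'A', 'AARON']
```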
diff --git a/egs/swbd/ASR/local/swbd1_prepare_dict.sh b/egs/swbd/ASR/local/swbd1_prepare_dict.sh
index 0bb98903f..eff5fb5f1 100755
--- a/egs/swbd/ASR/local/swbd1_prepare_dict.sh
+++ b/egs/swbd/ASR/local/swbd1_prepare_dict.sh
@@ -46,11 +46,11 @@ cp local/MSU_single_letter.txt $dir/
 # The original swbd lexicon does not have precise single letter lexicion
 # e.g. it does not have entry of W
 (
-  echo '!sil sil'
-  echo '[vocalized-noise] spn'
-  echo '[noise] nsn'
-  echo '[laughter] lau'
-  echo '<unk> spn'
+  echo '!SIL SIL'
+  echo '[VOCALIZED-NOISE] spn'
+  echo '[NOISE] nsn'
+  echo '[LAUGHTER] lau'
+  echo '<UNK> spn'
) | cat - $dir/lexicon1.txt $dir/MSU_single_letter.txt >$dir/lexicon2.txt || exit 1
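With the transcripts now normalized to uppercase, the special lexicon entries above have to match that casing exactly. A hedged sanity check along these lines can catch a mismatch early; the lexicon path is an assumption (substitute whatever `$dir` resolves to in the script):

```python
# Sanity check: each special token the normalizer can emit should have an
# entry in the generated lexicon. The path below is an assumption.
lexicon_path = "data/local/dict_nosp/lexicon2.txt"

with open(lexicon_path) as f:
    lexicon_words = {line.split()[0] for line in f if line.strip()}

for token in ["!SIL", "[VOCALIZED-NOISE]", "[NOISE]", "[LAUGHTER]", "<UNK>"]:
    assert token in lexicon_words, f"missing lexicon entry: {token}"
```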
diff --git a/egs/swbd/ASR/prepare.sh b/egs/swbd/ASR/prepare.sh
index 4609f329a..1ac5e8a1e 100755
--- a/egs/swbd/ASR/prepare.sh
+++ b/egs/swbd/ASR/prepare.sh
@@ -43,9 +43,9 @@ fisher_dir="/export/corpora3/LDC/LDC2004T19"
 # It will generate data/lang_bpe_xxx,
 # data/lang_bpe_yyy if the array contains xxx, yyy
 vocab_sizes=(
-  5000
-  2000
-  1000
+  # 5000
+  # 2000
+  # 1000
   500
 )
 
@@ -73,6 +73,7 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
   ./local/normalize_and_filter_supervisions.py \
     data/manifests/swbd/swbd_supervisions_all.jsonl.gz \
     data/manifests/swbd/swbd_supervisions_all_norm.jsonl.gz
+  mv data/manifests/swbd/swbd_supervisions_all.jsonl.gz data/manifests/swbd/swbd_supervisions_orig.jsonl.gz
   mv data/manifests/swbd/swbd_supervisions_all_norm.jsonl.gz data/manifests/swbd/swbd_supervisions_all.jsonl.gz
 
   lhotse prepare eval2000 --absolute-paths 1 $eval2000_dir data/manifests/eval2000
@@ -149,8 +150,8 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
       | jq '.text' | sed 's/"//g' > $lang_dir/text
   fi
 
-  log "prepare dict"
-  ./local/swbd1_prepare_dict.sh $swbd1_dir
+  log "Prepare dict"
+  ./local/swbd1_prepare_dict.sh
 
   cut -f 2- -d" " $lang_dir/text >${lang_dir}/input.txt
   # [noise] nsn
   # !sil sil
@@ -336,6 +337,10 @@ if [ $stage -le 11 ] && [ $stop_stage -ge 11 ]; then
     out_dir=data/lm_training_bpe_${vocab_size}
     mkdir -p $out_dir
 
+    if [ ! -f $out_dir/train.txt ]; then
+      tail -n 250000 data/lang_phone/input.txt > $out_dir/train.txt
+    fi
+
     ./local/prepare_lm_training_data.py \
       --bpe-model $lang_dir/bpe.model \
       --lm-data data/lang_phone/input.txt \
@@ -343,29 +348,29 @@ if [ $stage -le 11 ] && [ $stop_stage -ge 11 ]; then
   done
 fi
 
-# if [ $stage -le 12 ] && [ $stop_stage -ge 12 ]; then
-#   log "Stage 12: Generate LM validation data"
+if [ $stage -le 12 ] && [ $stop_stage -ge 12 ]; then
+  log "Stage 12: Generate LM validation data"
 
-#   for vocab_size in ${vocab_sizes[@]}; do
-#     log "Processing vocab_size == ${vocab_size}"
-#     out_dir=data/lm_training_bpe_${vocab_size}
-#     mkdir -p $out_dir
+  for vocab_size in ${vocab_sizes[@]}; do
+    log "Processing vocab_size == ${vocab_size}"
+    out_dir=data/lm_training_bpe_${vocab_size}
+    mkdir -p $out_dir
 
-#     if [ ! -f $out_dir/valid.txt ]; then
-#       TODO: generate valid.txt
-#     fi
+    if [ ! -f $out_dir/valid.txt ]; then
+      head -n 14332 data/lang_phone/input.txt > $out_dir/valid.txt
+    fi
 
-#     lang_dir=data/lang_bpe_${vocab_size}
-#     ./local/prepare_lm_training_data.py \
-#       --bpe-model $lang_dir/bpe.model \
-#       --lm-data $out_dir/valid.txt \
-#       --lm-archive $out_dir/lm_data-valid.pt
-#   done
-# fi
+    lang_dir=data/lang_bpe_${vocab_size}
+    ./local/prepare_lm_training_data.py \
+      --bpe-model $lang_dir/bpe.model \
+      --lm-data $out_dir/valid.txt \
+      --lm-archive $out_dir/lm_data-valid.pt
+  done
+fi
 
 if [ $stage -le 13 ] && [ $stop_stage -ge 13 ]; then
   log "Stage 13: Generate LM test data"
 
-  testsets=(eval2000 rt03)
+  testsets=(eval2000)
 
   for testset in ${testsets[@]}; do
     for vocab_size in ${vocab_sizes[@]}; do
@@ -373,8 +378,9 @@ if [ $stage -le 13 ] && [ $stop_stage -ge 13 ]; then
      out_dir=data/lm_training_bpe_${vocab_size}
      mkdir -p $out_dir
 
-      if [ ! -f $out_dir/test.txt ]; then
-        cat data/local/${testset}/text | cut -d " " -f 2- >$out_dir/${testset}.txt
+      if [ ! -f $out_dir/${testset}.txt ]; then
+        gunzip -c data/manifests/${testset}/eval2000_supervisions_all.jsonl.gz \
+          | jq '.text' | sed 's/"//g' > $out_dir/${testset}.txt
      fi
 
      lang_dir=data/lang_bpe_${vocab_size}
@@ -388,7 +394,7 @@ fi
 if [ $stage -le 14 ] && [ $stop_stage -ge 14 ]; then
   log "Stage 14: Sort LM training data"
 
-  testsets=(eval2000 rt03)
+  testsets=(eval2000)
 
   # Sort LM training data by sentence length in descending order
   # for ease of training.
   #
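For reference, the `gunzip -c ... | jq '.text' | sed 's/"//g'` pipeline used in stages 5 and 13 has a straightforward Python equivalent; this sketch reads the eval2000 manifest referenced above and prints one transcript per line:

```python
import gzip
import json

# Python equivalent of: gunzip -c ... | jq '.text' | sed 's/"//g'
# json.loads returns the text field already unquoted, so no sed is needed.
path = "data/manifests/eval2000/eval2000_supervisions_all.jsonl.gz"
with gzip.open(path, "rt") as f:
    for line in f:
        print(json.loads(line)["text"])
```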