From a7666d864c96b9c3bdf1a980535232b8f2db088c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Piotr=20=C5=BBelasko?=
Date: Sat, 15 Jan 2022 01:27:31 +0000
Subject: [PATCH] Missing steps in prepare.sh

---
 .../normalize_and_filter_supervisions.py | 51 +++++++++++------
 egs/fisher_swbd/ASR/prepare.sh           | 55 ++++++++++++-------
 2 files changed, 68 insertions(+), 38 deletions(-)

diff --git a/egs/fisher_swbd/ASR/local/normalize_and_filter_supervisions.py b/egs/fisher_swbd/ASR/local/normalize_and_filter_supervisions.py
index be9715578..9933d3a4f 100644
--- a/egs/fisher_swbd/ASR/local/normalize_and_filter_supervisions.py
+++ b/egs/fisher_swbd/ASR/local/normalize_and_filter_supervisions.py
@@ -17,15 +17,34 @@ def get_args():
     return parser.parse_args()
 
 
-# Note: the functions "normalize" and "keep" implement the logic similar to
-# Kaldi's data prep scripts for Fisher:
-# https://github.com/kaldi-asr/kaldi/blob/master/egs/fisher_swbd/s5/local/fisher_data_prep.sh
-# and for SWBD:
-# https://github.com/kaldi-asr/kaldi/blob/master/egs/fisher_swbd/s5/local/swbd1_data_prep.sh
+class FisherSwbdNormalizer:
+    """
+    Note: the functions "normalize" and "keep" implement logic similar to
+    Kaldi's data prep scripts for Fisher:
+    https://github.com/kaldi-asr/kaldi/blob/master/egs/fisher_swbd/s5/local/fisher_data_prep.sh
+    and for SWBD:
+    https://github.com/kaldi-asr/kaldi/blob/master/egs/fisher_swbd/s5/local/swbd1_data_prep.sh
+
+    One notable difference is that we don't change [cough], [lipsmack], etc. to [noise].
+    We also don't implement all of Kaldi's normalization edge cases
+    (hopefully that won't make too much of a difference).
+    """
 
-class Normalizer:
     def __init__(self) -> None:
+
+        self.remove_regexp_before = re.compile(
+            r"|".join([
+                # special symbols
+                r"\[\[SKIP.*\]\]",
+                r"\[SKIP.*\]",
+                r"\[PAUSE.*\]",
+                r"\[SILENCE\]",
+                r"<B_ASIDE>",
+                r"<E_ASIDE>",
+            ])
+        )
+
         # tuples of (pattern, replacement)
         # note: Kaldi replaces sighs, coughs, etc with [noise].
         # We don't do that here.
@@ -63,19 +82,12 @@ class Normalizer:
             (re.compile(r"\s-\s"), r" "),
             (re.compile(r"\s-\s"), r" "),
             # special symbol with trailing dash
-            (re.compile(r"(\[\w+\])-"), r"\1"),
+            (re.compile(r"(\[.*?\])-"), r"\1"),
         ]
 
         # unwanted symbols in the transcripts
-        self.remove_regexp = re.compile(
+        self.remove_regexp_after = re.compile(
             r"|".join([
-                # special symbols
-                r"\[\[SKIP.*\]\]",
-                r"\[SKIP.*\]",
-                r"\[PAUSE.*\]",
-                r"\[SILENCE\]",
-                r"<B_ASIDE>",
-                r"<E_ASIDE>",
                 # remaining punctuation
                 r"\.",
                 r",",
@@ -92,12 +104,15 @@ class Normalizer:
     def normalize(self, text: str) -> str:
         text = text.upper()
 
-        # first replace
+        # first remove
+        text = self.remove_regexp_before.sub("", text)
+
+        # then replace
         for pattern, sub in self.replace_regexps:
             text = pattern.sub(sub, text)
 
         # then remove
-        text = self.remove_regexp.sub("", text)
+        text = self.remove_regexp_after.sub("", text)
 
         # then clean up whitespace
         text = self.whitespace_regexp.sub(" ", text).strip()
@@ -159,6 +174,8 @@ def test():
         "-[ADV]AN[TAGE]",
         "-[ADV]AN[TAGE]-",
         "[WEA[SONABLE]-/REASONABLE]",
+        "[VOCALIZED-NOISE]-",
+        "~BULL",
     ]:
         print(text)
         print(normalizer.normalize(text))
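For orientation, the normalizer above is applied inside the script roughly as sketched below. This is a sketch, not the script's exact main loop: it assumes the class is importable from local/normalize_and_filter_supervisions.py, uses lhotse's SupervisionSet API, and stands in a minimal "keep" predicate for the real filtering helper mentioned in the docstring.

    from lhotse import SupervisionSet

    from normalize_and_filter_supervisions import FisherSwbdNormalizer

    def keep(sup) -> bool:
        # Illustrative stand-in for the script's "keep": drop segments
        # whose text comes out empty after normalization.
        return len(sup.text.strip()) > 0

    normalizer = FisherSwbdNormalizer()
    sups = SupervisionSet.from_file(
        "data/manifests/fisher-swbd_supervisions.jsonl.gz"
    )
    kept = []
    for sup in sups:
        # Normalize the transcript in place, then filter.
        sup.text = normalizer.normalize(sup.text)
        if keep(sup):
            kept.append(sup)
    SupervisionSet.from_segments(kept).to_file(
        "data/manifests/fisher-swbd_supervisions_norm.jsonl.gz"
    )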
diff --git a/egs/fisher_swbd/ASR/prepare.sh b/egs/fisher_swbd/ASR/prepare.sh
index dc89dfdac..859e0d34e 100755
--- a/egs/fisher_swbd/ASR/prepare.sh
+++ b/egs/fisher_swbd/ASR/prepare.sh
@@ -56,12 +56,6 @@ log() {
 
 log "dl_dir: $dl_dir"
 
-if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
-  log "Stage -1: Download LM"
-  #[ ! -e $dl_dir/lm ] && mkdir -p $dl_dir/lm
-  #./local/download_lm.py --out-dir=$dl_dir/lm
-fi
-
 if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
   log "Stage 0: Download data"
 
@@ -116,35 +110,60 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
 
   set -x
 
+  # Combine Fisher and SWBD recordings and supervisions.
   lhotse combine \
     data/manifests/fisher/recordings.jsonl.gz \
     data/manifests/swbd/swbd_recordings.jsonl \
     data/manifests/fisher-swbd_recordings.jsonl.gz
-
   lhotse combine \
     data/manifests/fisher/supervisions.jsonl.gz \
     data/manifests/swbd/swbd_supervisions.jsonl \
     data/manifests/fisher-swbd_supervisions.jsonl.gz
 
+  # Normalize the text and remove supervisions that are not useful or hard to handle.
   python local/normalize_and_filter_supervisions.py \
     data/manifests/fisher-swbd_supervisions.jsonl.gz \
     data/manifests/fisher-swbd_supervisions_norm.jsonl.gz
 
+  # Create cuts that span whole recording sessions.
   lhotse cut simple \
     -r data/manifests/fisher-swbd_recordings.jsonl.gz \
     -s data/manifests/fisher-swbd_supervisions_norm.jsonl.gz \
     data/manifests/fisher-swbd_cuts_unshuf.jsonl.gz
 
+  # Shuffle the cuts (pure bash pipes are fast).
+  # We could technically skip this step, but it helps ensure
+  # that SWBD is not seen only towards the end of training.
   gunzip -c data/manifests/fisher-swbd_cuts_unshuf.jsonl.gz \
     | shuf \
     | gzip -c \
     > data/manifests/fisher-swbd_cuts.jsonl.gz
 
+  # Create the train/dev split -- 20 sessions for dev is roughly 2h, which should be enough.
+  num_cuts="$(gunzip -c data/manifests/fisher-swbd_cuts.jsonl.gz | wc -l)"
+  num_dev_sessions=20
+  lhotse subset --first $num_dev_sessions \
+    data/manifests/fisher-swbd_cuts.jsonl.gz \
+    data/manifests/dev_fisher-swbd_cuts.jsonl.gz
+  lhotse subset --last $((num_cuts-num_dev_sessions)) \
+    data/manifests/fisher-swbd_cuts.jsonl.gz \
+    data/manifests/train_fisher-swbd_cuts.jsonl.gz
+
+  # Finally, split the full-session cuts into one cut per supervision segment.
+  # If any segments overlap, we discard the information about the overlaps
+  # (overlaps are unlikely for this dataset, since each cut sees only one channel).
+  lhotse cut trim-to-supervisions \
+    --discard-overlapping \
+    data/manifests/train_fisher-swbd_cuts.jsonl.gz \
+    data/manifests/train_utterances_fisher-swbd_cuts.jsonl.gz
+  lhotse cut trim-to-supervisions \
+    --discard-overlapping \
+    data/manifests/dev_fisher-swbd_cuts.jsonl.gz \
+    data/manifests/dev_utterances_fisher-swbd_cuts.jsonl.gz
+
   set +x
 fi
 
-# TODO: optional stage 5, compute features
-
 if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
   log "Stage 6: Dump transcripts for LM training"
   mkdir -p data/lm
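The stage-4 pipeline above can also be expressed with lhotse's Python API, which is handy for inspecting the manifests interactively. A sketch, assuming the CutSet methods (from_manifests, shuffle, subset, trim_to_supervisions) mirror the CLI commands and flags used in the script; paths are the ones from prepare.sh.

    from lhotse import CutSet, RecordingSet, SupervisionSet

    recs = RecordingSet.from_file("data/manifests/fisher-swbd_recordings.jsonl.gz")
    sups = SupervisionSet.from_file("data/manifests/fisher-swbd_supervisions_norm.jsonl.gz")

    # One cut per recording session, then shuffle so SWBD is mixed into Fisher.
    cuts = CutSet.from_manifests(recordings=recs, supervisions=sups).shuffle()

    # 20 whole sessions for dev, the rest for training.
    dev = cuts.subset(first=20)
    train = cuts.subset(last=len(cuts) - 20)

    # One cut per supervision segment; dropping overlap info corresponds
    # to the --discard-overlapping flag of the CLI.
    dev.trim_to_supervisions(keep_overlapping=False).to_file(
        "data/manifests/dev_utterances_fisher-swbd_cuts.jsonl.gz"
    )
    train.trim_to_supervisions(keep_overlapping=False).to_file(
        "data/manifests/train_utterances_fisher-swbd_cuts.jsonl.gz"
    )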
@@ -154,18 +173,6 @@ if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
     > data/lm/transcript_words.txt
 fi
 
-#if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
-#  log "Stage 3: Compute fbank for librispeech"
-#  mkdir -p data/fbank
-#  ./local/compute_fbank_librispeech.py
-#fi
-#
-#if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
-#  log "Stage 4: Compute fbank for musan"
-#  mkdir -p data/fbank
-#  ./local/compute_fbank_musan.py
-#fi
-
 if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
   log "Stage 7: Prepare lexicon using g2p_en"
   lang_dir=data/lang_phone
@@ -186,6 +193,12 @@ if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
     | awk '{print $0,NR+2}' \
     >> $lang_dir/words.txt
 
+  # Add remaining special word symbols expected by LM scripts.
+  num_words=$(wc -l < $lang_dir/words.txt)
+  echo "<s> $((num_words))" >> $lang_dir/words.txt
+  echo "</s> $((num_words+1))" >> $lang_dir/words.txt
+  echo "#0 $((num_words+2))" >> $lang_dir/words.txt
+
   if [ ! -f $lang_dir/L_disambig.pt ]; then
     pip install g2p_en
     ./local/prepare_lang_g2pen.py --lang-dir $lang_dir
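As a quick sanity check for stage 7, g2p_en can be exercised directly before running prepare_lang_g2pen.py. A minimal sketch (the word list is illustrative; g2p_en emits ARPAbet phones, with stress digits on vowels):

    from g2p_en import G2p

    g2p = G2p()
    for word in ["BULL", "FISHER", "SWITCHBOARD"]:
        # G2p returns a token list in which blanks mark word boundaries,
        # so keep only the non-blank phone tokens.
        phones = [p for p in g2p(word) if p.strip()]
        print(word, " ".join(phones))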