Missing steps in prepare.sh

Piotr Żelasko 2022-01-15 01:27:31 +00:00
parent ccab93e8e2
commit a7666d864c
2 changed files with 68 additions and 38 deletions

File: local/normalize_and_filter_supervisions.py

@@ -17,15 +17,34 @@ def get_args():
return parser.parse_args()
# Note: the functions "normalize" and "keep" implement the logic similar to
# Kaldi's data prep scripts for Fisher:
# https://github.com/kaldi-asr/kaldi/blob/master/egs/fisher_swbd/s5/local/fisher_data_prep.sh
# and for SWBD:
# https://github.com/kaldi-asr/kaldi/blob/master/egs/fisher_swbd/s5/local/swbd1_data_prep.sh
class FisherSwbdNormalizer:
"""
Note: the functions "normalize" and "keep" implement logic similar to
Kaldi's data prep scripts for Fisher:
https://github.com/kaldi-asr/kaldi/blob/master/egs/fisher_swbd/s5/local/fisher_data_prep.sh
and for SWBD:
https://github.com/kaldi-asr/kaldi/blob/master/egs/fisher_swbd/s5/local/swbd1_data_prep.sh
One notable difference is that we don't change [cough], [lipsmack], etc. to [noise].
We also don't implement all of Kaldi's normalization edge cases
(hopefully this won't make too much difference).
"""
class Normalizer:
def __init__(self) -> None:
self.remove_regexp_before = re.compile(
r"|".join([
# special symbols
r"\[\[SKIP.*\]\]",
r"\[SKIP.*\]",
r"\[PAUSE.*\]",
r"\[SILENCE\]",
r"<B_ASIDE>",
r"<E_ASIDE>",
])
)
# tuples of (pattern, replacement)
# note: Kaldi replaces sighs, coughs, etc. with [noise].
# We don't do that here.
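For illustration, a minimal sketch of how one of these (pattern, replacement) tuples behaves when applied with re.sub; the sample string is made up:

import re

# strip a dash trailing a bracketed special symbol: "[NOISE]-" -> "[NOISE]"
pattern, sub = re.compile(r"(\[.*?\])-"), r"\1"
print(pattern.sub(sub, "[NOISE]- OKAY"))  # prints: [NOISE] OKAY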
@@ -63,19 +82,12 @@ class Normalizer:
(re.compile(r"\s-\s"), r" "),
(re.compile(r"\s-\s"), r" "),
# special symbol with trailing dash
(re.compile(r"(\[\w+\])-"), r"\1"),
(re.compile(r"(\[.*?\])-"), r"\1"),
]
# unwanted symbols in the transcripts
self.remove_regexp = re.compile(
self.remove_regexp_after = re.compile(
r"|".join([
# special symbols
r"\[\[SKIP.*\]\]",
r"\[SKIP.*\]",
r"\[PAUSE.*\]",
r"\[SILENCE\]",
r"<B_ASIDE>",
r"<E_ASIDE>",
# remaining punctuation
r"\.",
r",",
@@ -92,12 +104,15 @@ class Normalizer:
def normalize(self, text: str) -> str:
text = text.upper()
# first replace
# first remove
text = self.remove_regexp_before.sub("", text)
# then replace
for pattern, sub in self.replace_regexps:
text = pattern.sub(sub, text)
# then remove
text = self.remove_regexp.sub("", text)
text = self.remove_regexp_after.sub("", text)
# then clean up whitespace
text = self.whitespace_regexp.sub(" ", text).strip()
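A quick usage sketch (the input string is made up, and we assume none of the elided rules touch the words HELLO and WORLD):

normalizer = FisherSwbdNormalizer()
print(normalizer.normalize("hello [SILENCE] world"))
# prints: HELLO WORLD -- upper-cased, [SILENCE] removed, whitespace collapsed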
@@ -159,6 +174,8 @@ def test():
"-[ADV]AN[TAGE]",
"-[ADV]AN[TAGE]-",
"[WEA[SONABLE]-/REASONABLE]",
"[VOCALIZED-NOISE]-",
"~BULL",
]:
print(text)
print(normalizer.normalize(text))

File: prepare.sh

@@ -56,12 +56,6 @@ log() {
log "dl_dir: $dl_dir"
if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
log "Stage -1: Download LM"
#[ ! -e $dl_dir/lm ] && mkdir -p $dl_dir/lm
#./local/download_lm.py --out-dir=$dl_dir/lm
fi
if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
log "Stage 0: Download data"
@@ -116,35 +110,60 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
set -x
# Combine Fisher and SWBD recordings and supervisions
lhotse combine \
data/manifests/fisher/recordings.jsonl.gz \
data/manifests/swbd/swbd_recordings.jsonl \
data/manifests/fisher-swbd_recordings.jsonl.gz
lhotse combine \
data/manifests/fisher/supervisions.jsonl.gz \
data/manifests/swbd/swbd_supervisions.jsonl \
data/manifests/fisher-swbd_supervisions.jsonl.gz
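# (note: lhotse combine essentially concatenates the input manifests;
# it reads gzipped and plain jsonl alike, dispatching on the file extension.)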
# Normalize text and remove supervisions that are not useful or are hard to handle.
python local/normalize_and_filter_supervisions.py \
data/manifests/fisher-swbd_supervisions.jsonl.gz \
data/manifests/fisher-swbd_supervisions_norm.jsonl.gz
# Create cuts that span whole recording sessions.
lhotse cut simple \
-r data/manifests/fisher-swbd_recordings.jsonl.gz \
-s data/manifests/fisher-swbd_supervisions_norm.jsonl.gz \
data/manifests/fisher-swbd_cuts_unshuf.jsonl.gz
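# (this yields one cut per recording channel, spanning its full duration,
# with that channel's supervisions attached.)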
# Shuffle the cuts (pure bash pipes are fast).
# We could technically skip this step, but it helps ensure
# that SWBD is not seen only towards the end of training.
gunzip -c data/manifests/fisher-swbd_cuts_unshuf.jsonl.gz \
| shuf \
| gzip -c \
> data/manifests/fisher-swbd_cuts.jsonl.gz
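# (shuf is unseeded, so this shuffle -- and the dev/train split below --
# is not reproducible across runs; GNU shuf accepts --random-source
# if determinism is ever needed.)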
# Create train/dev split -- 20 sessions for dev is roughly 2h, which should be enough.
num_cuts="$(gunzip -c data/manifests/fisher-swbd_cuts.jsonl.gz | wc -l)"
num_dev_sessions=20
lhotse subset --first $num_dev_sessions \
data/manifests/fisher-swbd_cuts.jsonl.gz \
data/manifests/dev_fisher-swbd_cuts.jsonl.gz
lhotse subset --last $((num_cuts-num_dev_sessions)) \
data/manifests/fisher-swbd_cuts.jsonl.gz \
data/manifests/train_fisher-swbd_cuts.jsonl.gz
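# (illustrative arithmetic with made-up numbers: if there are 10020
# full-session cuts, dev takes the first 20 and train the remaining 10000.)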
# Finally, split the full-session cuts into one cut per supervision segment.
# If any segments overlap, we discard the information about the overlaps
# (overlaps are unlikely for this dataset because each cut sees only one channel).
lhotse cut trim-to-supervisions \
--discard-overlapping \
data/manifests/train_fisher-swbd_cuts.jsonl.gz \
data/manifests/train_utterances_fisher-swbd_cuts.jsonl.gz
lhotse cut trim-to-supervisions \
--discard-overlapping \
data/manifests/dev_fisher-swbd_cuts.jsonl.gz \
data/manifests/dev_utterances_fisher-swbd_cuts.jsonl.gz
set +x
fi
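For reference, a rough Python-API equivalent of the stage above (an untested sketch: it skips the text normalization step, and lhotse API details may differ between versions):

from lhotse import CutSet, RecordingSet, SupervisionSet

recs = RecordingSet.from_file(
    "data/manifests/fisher/recordings.jsonl.gz"
) + RecordingSet.from_file("data/manifests/swbd/swbd_recordings.jsonl")
sups = SupervisionSet.from_file(
    "data/manifests/fisher/supervisions.jsonl.gz"
) + SupervisionSet.from_file("data/manifests/swbd/swbd_supervisions.jsonl")
# One full-session cut per recording channel, shuffled, then split 20/rest.
cuts = CutSet.from_manifests(recordings=recs, supervisions=sups).to_eager().shuffle()
dev = cuts.subset(first=20)
train = cuts.subset(last=len(cuts) - 20)
# One cut per supervision segment, dropping overlap info.
dev.trim_to_supervisions(keep_overlapping=False).to_file(
    "data/manifests/dev_utterances_fisher-swbd_cuts.jsonl.gz"
)
train.trim_to_supervisions(keep_overlapping=False).to_file(
    "data/manifests/train_utterances_fisher-swbd_cuts.jsonl.gz"
)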
# TODO: optional stage 5, compute features
if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
log "Stage 6: Dump transcripts for LM training"
mkdir -p data/lm
@@ -154,18 +173,6 @@ if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
> data/lm/transcript_words.txt
fi
#if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
# log "Stage 3: Compute fbank for librispeech"
# mkdir -p data/fbank
# ./local/compute_fbank_librispeech.py
#fi
#
#if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
# log "Stage 4: Compute fbank for musan"
# mkdir -p data/fbank
# ./local/compute_fbank_musan.py
#fi
if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
log "Stage 7: Prepare lexicon using g2p_en"
lang_dir=data/lang_phone
@@ -186,6 +193,12 @@ if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
| awk '{print $0,NR+2}' \
>> $lang_dir/words.txt
# Add remaining special word symbols expected by LM scripts.
num_words=$(wc -l < $lang_dir/words.txt)
echo "<s> $((num_words))" >> $lang_dir/words.txt
echo "</s> $((num_words+1))" >> $lang_dir/words.txt
echo "#0 $((num_words+2))" >> $lang_dir/words.txt
if [ ! -f $lang_dir/L_disambig.pt ]; then
pip install g2p_en
./local/prepare_lang_g2pen.py --lang-dir $lang_dir