some bugfixes

This commit is contained in:
Nagendra Goel 2022-08-23 21:00:02 +00:00
parent 83e2b30a22
commit 7e1e9f8da3
2 changed files with 38 additions and 44 deletions

View File

@ -4,10 +4,9 @@ import argparse
import re import re
from typing import Tuple from typing import Tuple
from tqdm import tqdm from lhotse import SupervisionSegment, SupervisionSet
from lhotse import SupervisionSet, SupervisionSegment
from lhotse.serialization import load_manifest_lazy_or_eager from lhotse.serialization import load_manifest_lazy_or_eager
from tqdm import tqdm
def get_args(): def get_args():
@ -19,19 +18,14 @@ def get_args():
# fmt: off # fmt: off
class FisherSwbdNormalizer: class FisherSwbdNormalizer:
"""Note: the functions "normalize" and "keep" implement the logic
similar to Kaldi's data prep scripts for Fisher and SWBD: One
notable difference is that we don't change [cough], [lipsmack],
etc. to [noise]. We also don't implement all the edge cases of
normalization from Kaldi (hopefully won't make too much
difference).
""" """
Note: the functions "normalize" and "keep" implement the logic similar to
Kaldi's data prep scripts for Fisher:
https://github.com/kaldi-asr/kaldi/blob/master/egs/fisher_swbd/s5/local/fisher_data_prep.sh
and for SWBD:
https://github.com/kaldi-asr/kaldi/blob/master/egs/fisher_swbd/s5/local/swbd1_data_prep.sh
One notable difference is that we don't change [cough], [lipsmack], etc. to [noise].
We also don't implement all the edge cases of normalization from Kaldi
(hopefully won't make too much difference).
"""
def __init__(self) -> None: def __init__(self) -> None:
self.remove_regexp_before = re.compile( self.remove_regexp_before = re.compile(
@ -51,10 +45,10 @@ class FisherSwbdNormalizer:
# We don't do that here. # We don't do that here.
# We also uppercase the text as the first operation. # We also uppercase the text as the first operation.
self.replace_regexps: Tuple[re.Pattern, str] = [ self.replace_regexps: Tuple[re.Pattern, str] = [
# SWBD: # SWBD:
# [LAUGHTER-STORY] -> STORY # [LAUGHTER-STORY] -> STORY
(re.compile(r"\[LAUGHTER-(.*?)\]"), r"\1"), (re.compile(r"\[LAUGHTER-(.*?)\]"), r"\1"),
# [WEA[SONABLE]-/REASONABLE] # [WEA[SONABLE]-/REASONABLE]
(re.compile(r"\[\S+/(\S+)\]"), r"\1"), (re.compile(r"\[\S+/(\S+)\]"), r"\1"),
# -[ADV]AN[TAGE]- -> AN- # -[ADV]AN[TAGE]- -> AN-
(re.compile(r"-?\[.*?\](\w+)\[.*?\]-?"), r"\1-"), (re.compile(r"-?\[.*?\](\w+)\[.*?\]-?"), r"\1-"),

View File

@ -8,7 +8,7 @@ stop_stage=500
# We assume dl_dir (download dir) contains the following # We assume dl_dir (download dir) contains the following
# directories and files. Most of them can't be downloaded automatically # directories and files. Most of them can't be downloaded automatically
# as they are not publicly available and require a license purchased # as they are not publicly available and require a license purchased
# from the LDC. # from the LDC.
# #
# - $dl_dir/{LDC2004S13,LDC2004T19,LDC2005S13,LDC2005T19} # - $dl_dir/{LDC2004S13,LDC2004T19,LDC2005S13,LDC2005T19}
@ -28,8 +28,8 @@ stop_stage=500
# - noise # - noise
# - speech # - speech
dl_dir=$PWD/download dl_dir=/mnt/dsk2
mkdir -p $dl_dir #mkdir -p $dl_dir
. shared/parse_options.sh || exit 1 . shared/parse_options.sh || exit 1
@ -62,25 +62,25 @@ if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
# #
# TODO: remove # TODO: remove
LDC_ROOT=/nas/data4/DATA # LDC_ROOT=/nas/data4/DATA
for pkg in LDC2004S13 LDC2004T19 LDC2005S13 LDC2005T19 LDC97S62 LDC2002S09 LDC2002T43; do # for pkg in LDC2004S13 LDC2004T19 LDC2005S13 LDC2005T19 LDC97S62 LDC2002S09 LDC2002T43; do
ln -sfv $LDC_ROOT/$pkg $dl_dir/ # ln -sfv $LDC_ROOT/$pkg $dl_dir/
done # done
# If you have pre-downloaded it to /path/to/musan, # If you have pre-downloaded it to /path/to/musan,
# you can create a symlink # you can create a symlink
# #
# ln -sfv /path/to/musan $dl_dir/ # ln -sfv /path/to/musan $dl_dir/
# #
if [ ! -d $dl_dir/musan ]; then # if [ ! -d $dl_dir/musan ]; then
lhotse download musan $dl_dir # lhotse download musan $dl_dir
fi # fi
fi fi
if [ $stage -le 1 ] && [ $stop_stage -ge 1 ] ; then if [ $stage -le 1 ] && [ $stop_stage -ge 1 ] ; then
log "Stage 1: Prepare Fisher manifests" log "Stage 1: Prepare Fisher manifests"
mkdir -p data/manifests/fisher # mkdir -p data/manifests/fisher
lhotse prepare fisher-english --absolute-paths 1 $dl_dir data/manifests/fisher # lhotse prepare fisher-english --absolute-paths 1 $dl_dir data/manifests/fisher
local/normalize_and_filter_supervisions.py data/manifests/fisher/supervisions.jsonl.gz data/manifests/supervisions_fisher.jsonl.gz local/normalize_and_filter_supervisions.py data/manifests/fisher/supervisions.jsonl.gz data/manifests/supervisions_fisher.jsonl.gz
cp data/manifests/fisher/recordings.jsonl.gz data/manifests/recordings_fisher.jsonl.gz cp data/manifests/fisher/recordings.jsonl.gz data/manifests/recordings_fisher.jsonl.gz
gzip -d data/manifests/supervisions_fisher.jsonl.gz gzip -d data/manifests/supervisions_fisher.jsonl.gz
@ -91,10 +91,10 @@ if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
log "Stage 2: Prepare SWBD manifests" log "Stage 2: Prepare SWBD manifests"
mkdir -p data/manifests/swbd mkdir -p data/manifests/swbd
lhotse prepare switchboard --absolute-paths 1 --omit-silence $dl_dir/LDC97S62 data/manifests/swbd lhotse prepare switchboard --absolute-paths 1 --omit-silence $dl_dir/LDC97S62 data/manifests/swbd
python3 local/normalize_and_filter_supervisions.py data/manifests/swbd/swbd_supervisions_all.jsonl.gz data/manifests/supervisions_swbd.jsonl.gz python3 local/normalize_and_filter_supervisions.py data/manifests/swbd/swbd_supervisions.jsonl data/manifests/supervisions_swbd.jsonl
cp data/manifests/swbd/swbd_recordings_all.jsonl.gz data/manifests/recordings_swbd.jsonl.gz cp data/manifests/swbd/swbd_recordings.jsonl data/manifests/recordings_swbd.jsonl
gzip -d data/manifests/supervisions_swbd.jsonl.gz # gzip -d data/manifests/supervisions_swbd.jsonl.gz
gzip -d data/manifests/recordings_swbd.jsonl.gz # gzip -d data/manifests/recordings_swbd.jsonl.gz
fi fi
if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
@ -116,16 +116,16 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
##################################### #####################################
#fisher #fisher
##################################### #####################################
gzip -d data/fbank/cuts_fisher.json.gz gzip -d data/fbank/cuts_fisher.json.gz
jq -c '.[]' data/fbank/cuts_fisher.json > data/fbank/cuts_fisher.jsonl jq -c '.[]' data/fbank/cuts_fisher.json > data/fbank/cuts_fisher.jsonl
gzip -c data/fbank/cuts_fisher.jsonl > data/fbank/cuts_fisher.jsonl.gz gzip -c data/fbank/cuts_fisher.jsonl > data/fbank/cuts_fisher.jsonl.gz
# extract list of sph # extract list of sph
python3 local/extract_list_of_sph.py data/fbank/cuts_fisher.jsonl | sort | uniq > data/fbank/cuts_fisher_sph.list python3 local/extract_list_of_sph.py data/fbank/cuts_fisher.jsonl | sort | uniq > data/fbank/cuts_fisher_sph.list
num_fisher_total_session=$(wc -l <data/fbank/cuts_fisher_sph.list) num_fisher_total_session=$(wc -l <data/fbank/cuts_fisher_sph.list)
num_fisher_dev_session=10 num_fisher_dev_session=10
num_fisher_train_session=$(($num_fisher_total_session - $num_fisher_dev_session)) num_fisher_train_session=$(($num_fisher_total_session - $num_fisher_dev_session))
head -n $num_fisher_dev_session data/fbank/cuts_fisher_sph.list >data/fbank/cuts_fisher_sph_dev.list head -n $num_fisher_dev_session data/fbank/cuts_fisher_sph.list >data/fbank/cuts_fisher_sph_dev.list
tail -n $num_fisher_train_session data/fbank/cuts_fisher_sph.list >data/fbank/cuts_fisher_sph_train.list tail -n $num_fisher_train_session data/fbank/cuts_fisher_sph.list >data/fbank/cuts_fisher_sph_train.list
@ -137,7 +137,7 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
# extract train json # extract train json
python3 local/extract_json_cuts.py data/fbank/cuts_fisher_sph_train.list data/fbank/cuts_fisher.jsonl data/fbank/train_cuts_fisher.jsonl python3 local/extract_json_cuts.py data/fbank/cuts_fisher_sph_train.list data/fbank/cuts_fisher.jsonl data/fbank/train_cuts_fisher.jsonl
gzip -c data/fbank/train_cuts_fisher.jsonl > data/fbank/train_cuts_fisher.jsonl.gz gzip -c data/fbank/train_cuts_fisher.jsonl > data/fbank/train_cuts_fisher.jsonl.gz
# describe cut # describe cut
lhotse cut describe data/fbank/train_cuts_fisher.jsonl.gz lhotse cut describe data/fbank/train_cuts_fisher.jsonl.gz
lhotse cut describe data/fbank/dev_cuts_fisher.jsonl.gz lhotse cut describe data/fbank/dev_cuts_fisher.jsonl.gz
@ -145,7 +145,7 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
# extract dev supervision # extract dev supervision
python local/extract_json_supervision.py data/fbank/cuts_fisher_sph_dev.list data/manifests/supervisions_fisher.jsonl data/manifests/dev_supervisions_fisher.jsonl python local/extract_json_supervision.py data/fbank/cuts_fisher_sph_dev.list data/manifests/supervisions_fisher.jsonl data/manifests/dev_supervisions_fisher.jsonl
python local/extract_json_supervision.py data/fbank/cuts_fisher_sph_train.list data/manifests/supervisions_fisher.jsonl data/manifests/train_supervisions_fisher.jsonl python local/extract_json_supervision.py data/fbank/cuts_fisher_sph_train.list data/manifests/supervisions_fisher.jsonl data/manifests/train_supervisions_fisher.jsonl
###################################### ######################################
#swbd #swbd
###################################### ######################################
@ -156,7 +156,7 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
python3 local/extract_list_of_sph.py data/fbank/cuts_swbd.jsonl| sort | uniq > data/fbank/cuts_swbd_sph.list python3 local/extract_list_of_sph.py data/fbank/cuts_swbd.jsonl| sort | uniq > data/fbank/cuts_swbd_sph.list
num_swbd_total_session=$(wc -l <data/fbank/cuts_swbd_sph.list) num_swbd_total_session=$(wc -l <data/fbank/cuts_swbd_sph.list)
num_swbd_dev_session=10 num_swbd_dev_session=10
num_swbd_train_session=$(($num_swbd_total_session - $num_swbd_dev_session)) num_swbd_train_session=$(($num_swbd_total_session - $num_swbd_dev_session))
head -n $num_swbd_dev_session data/fbank/cuts_swbd_sph.list >data/fbank/cuts_swbd_sph_dev.list head -n $num_swbd_dev_session data/fbank/cuts_swbd_sph.list >data/fbank/cuts_swbd_sph_dev.list
@ -168,22 +168,22 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
python3 local/extract_json_cuts.py data/fbank/cuts_swbd_sph_train.list data/fbank/cuts_swbd.jsonl data/fbank/train_cuts_swbd.jsonl python3 local/extract_json_cuts.py data/fbank/cuts_swbd_sph_train.list data/fbank/cuts_swbd.jsonl data/fbank/train_cuts_swbd.jsonl
gzip -c data/fbank/train_cuts_swbd.jsonl > data/fbank/train_cuts_swbd.jsonl.gz gzip -c data/fbank/train_cuts_swbd.jsonl > data/fbank/train_cuts_swbd.jsonl.gz
# describe cut # describe cut
lhotse cut describe data/fbank/train_cuts_swbd.jsonl.gz lhotse cut describe data/fbank/train_cuts_swbd.jsonl.gz
lhotse cut describe data/fbank/dev_cuts_swbd.jsonl.gz lhotse cut describe data/fbank/dev_cuts_swbd.jsonl.gz
# extract dev supervision # extract dev supervision
python local/extract_json_supervision.py data/fbank/cuts_swbd_sph_dev.list data/manifests/supervisions_swbd.jsonl data/manifests/dev_supervisions_swbd.jsonl python local/extract_json_supervision.py data/fbank/cuts_swbd_sph_dev.list data/manifests/supervisions_swbd.jsonl data/manifests/dev_supervisions_swbd.jsonl
python local/extract_json_supervision.py data/fbank/cuts_swbd_sph_train.list data/manifests/supervisions_swbd.jsonl data/manifests/train_supervisions_swbd.jsonl python local/extract_json_supervision.py data/fbank/cuts_swbd_sph_train.list data/manifests/supervisions_swbd.jsonl data/manifests/train_supervisions_swbd.jsonl
fi fi
if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
log "Stage 3: Prepare musan manifest" log "Stage 3: Prepare musan manifest"
# We assume that you have downloaded the musan corpus # We assume that you have downloaded the musan corpus
# to $dl_dir/musan # to $dl_dir/musan
mkdir -p data/manifests/musan mkdir -p data/manifests/musan
lhotse prepare musan $dl_dir/musan data/manifests/musan lhotse prepare musan $dl_dir/musan data/manifests/musan
fi fi
if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then