mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-26 10:16:14 +00:00
some bugfixes
This commit is contained in:
parent
83e2b30a22
commit
7e1e9f8da3
28
egs/fisher_swbd/ASR/local/normalize_and_filter_supervisions.py
Normal file → Executable file
28
egs/fisher_swbd/ASR/local/normalize_and_filter_supervisions.py
Normal file → Executable file
@ -4,10 +4,9 @@ import argparse
|
|||||||
import re
|
import re
|
||||||
from typing import Tuple
|
from typing import Tuple
|
||||||
|
|
||||||
from tqdm import tqdm
|
from lhotse import SupervisionSegment, SupervisionSet
|
||||||
|
|
||||||
from lhotse import SupervisionSet, SupervisionSegment
|
|
||||||
from lhotse.serialization import load_manifest_lazy_or_eager
|
from lhotse.serialization import load_manifest_lazy_or_eager
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
|
||||||
def get_args():
|
def get_args():
|
||||||
@ -19,19 +18,14 @@ def get_args():
|
|||||||
|
|
||||||
# fmt: off
|
# fmt: off
|
||||||
class FisherSwbdNormalizer:
|
class FisherSwbdNormalizer:
|
||||||
|
"""Note: the functions "normalize" and "keep" implement the logic
|
||||||
|
similar to Kaldi's data prep scripts for Fisher and SWBD. One
|
||||||
|
notable difference is that we don't change [cough], [lipsmack],
|
||||||
|
etc. to [noise]. We also don't implement all the edge cases of
|
||||||
|
normalization from Kaldi (hopefully won't make too much
|
||||||
|
difference).
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Note: the functions "normalize" and "keep" implement the logic similar to
|
|
||||||
Kaldi's data prep scripts for Fisher:
|
|
||||||
https://github.com/kaldi-asr/kaldi/blob/master/egs/fisher_swbd/s5/local/fisher_data_prep.sh
|
|
||||||
and for SWBD:
|
|
||||||
https://github.com/kaldi-asr/kaldi/blob/master/egs/fisher_swbd/s5/local/swbd1_data_prep.sh
|
|
||||||
|
|
||||||
One notable difference is that we don't change [cough], [lipsmack], etc. to [noise].
|
|
||||||
We also don't implement all the edge cases of normalization from Kaldi
|
|
||||||
(hopefully won't make too much difference).
|
|
||||||
"""
|
|
||||||
|
|
||||||
|
|
||||||
def __init__(self) -> None:
|
def __init__(self) -> None:
|
||||||
|
|
||||||
self.remove_regexp_before = re.compile(
|
self.remove_regexp_before = re.compile(
|
||||||
@ -51,10 +45,10 @@ class FisherSwbdNormalizer:
|
|||||||
# We don't do that here.
|
# We don't do that here.
|
||||||
# We also uppercase the text as the first operation.
|
# We also uppercase the text as the first operation.
|
||||||
self.replace_regexps: Tuple[re.Pattern, str] = [
|
self.replace_regexps: Tuple[re.Pattern, str] = [
|
||||||
# SWBD:
|
# SWBD:
|
||||||
# [LAUGHTER-STORY] -> STORY
|
# [LAUGHTER-STORY] -> STORY
|
||||||
(re.compile(r"\[LAUGHTER-(.*?)\]"), r"\1"),
|
(re.compile(r"\[LAUGHTER-(.*?)\]"), r"\1"),
|
||||||
# [WEA[SONABLE]-/REASONABLE]
|
# [WEA[SONABLE]-/REASONABLE]
|
||||||
(re.compile(r"\[\S+/(\S+)\]"), r"\1"),
|
(re.compile(r"\[\S+/(\S+)\]"), r"\1"),
|
||||||
# -[ADV]AN[TAGE]- -> AN
|
# -[ADV]AN[TAGE]- -> AN
|
||||||
(re.compile(r"-?\[.*?\](\w+)\[.*?\]-?"), r"\1-"),
|
(re.compile(r"-?\[.*?\](\w+)\[.*?\]-?"), r"\1-"),
|
||||||
|
@ -8,7 +8,7 @@ stop_stage=500
|
|||||||
|
|
||||||
# We assume dl_dir (download dir) contains the following
|
# We assume dl_dir (download dir) contains the following
|
||||||
# directories and files. Most of them can't be downloaded automatically
|
# directories and files. Most of them can't be downloaded automatically
|
||||||
# as they are not publicly available and require a license purchased
|
# as they are not publicly available and require a license purchased
|
||||||
# from the LDC.
|
# from the LDC.
|
||||||
#
|
#
|
||||||
# - $dl_dir/{LDC2004S13,LDC2004T19,LDC2005S13,LDC2005T19}
|
# - $dl_dir/{LDC2004S13,LDC2004T19,LDC2005S13,LDC2005T19}
|
||||||
@ -28,8 +28,8 @@ stop_stage=500
|
|||||||
# - noise
|
# - noise
|
||||||
# - speech
|
# - speech
|
||||||
|
|
||||||
dl_dir=$PWD/download
|
dl_dir=/mnt/dsk2
|
||||||
mkdir -p $dl_dir
|
#mkdir -p $dl_dir
|
||||||
|
|
||||||
. shared/parse_options.sh || exit 1
|
. shared/parse_options.sh || exit 1
|
||||||
|
|
||||||
@ -62,25 +62,25 @@ if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
|
|||||||
#
|
#
|
||||||
|
|
||||||
# TODO: remove
|
# TODO: remove
|
||||||
LDC_ROOT=/nas/data4/DATA
|
# LDC_ROOT=/nas/data4/DATA
|
||||||
for pkg in LDC2004S13 LDC2004T19 LDC2005S13 LDC2005T19 LDC97S62 LDC2002S09 LDC2002T43; do
|
# for pkg in LDC2004S13 LDC2004T19 LDC2005S13 LDC2005T19 LDC97S62 LDC2002S09 LDC2002T43; do
|
||||||
ln -sfv $LDC_ROOT/$pkg $dl_dir/
|
# ln -sfv $LDC_ROOT/$pkg $dl_dir/
|
||||||
done
|
# done
|
||||||
|
|
||||||
# If you have pre-downloaded it to /path/to/musan,
|
# If you have pre-downloaded it to /path/to/musan,
|
||||||
# you can create a symlink
|
# you can create a symlink
|
||||||
#
|
#
|
||||||
# ln -sfv /path/to/musan $dl_dir/
|
# ln -sfv /path/to/musan $dl_dir/
|
||||||
#
|
#
|
||||||
if [ ! -d $dl_dir/musan ]; then
|
# if [ ! -d $dl_dir/musan ]; then
|
||||||
lhotse download musan $dl_dir
|
# lhotse download musan $dl_dir
|
||||||
fi
|
# fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ $stage -le 1 ] && [ $stop_stage -ge 1 ] ; then
|
if [ $stage -le 1 ] && [ $stop_stage -ge 1 ] ; then
|
||||||
log "Stage 1: Prepare Fisher manifests"
|
log "Stage 1: Prepare Fisher manifests"
|
||||||
mkdir -p data/manifests/fisher
|
# mkdir -p data/manifests/fisher
|
||||||
lhotse prepare fisher-english --absolute-paths 1 $dl_dir data/manifests/fisher
|
# lhotse prepare fisher-english --absolute-paths 1 $dl_dir data/manifests/fisher
|
||||||
local/normalize_and_filter_supervisions.py data/manifests/fisher/supervisions.jsonl.gz data/manifests/supervisions_fisher.jsonl.gz
|
local/normalize_and_filter_supervisions.py data/manifests/fisher/supervisions.jsonl.gz data/manifests/supervisions_fisher.jsonl.gz
|
||||||
cp data/manifests/fisher/recordings.jsonl.gz data/manifests/recordings_fisher.jsonl.gz
|
cp data/manifests/fisher/recordings.jsonl.gz data/manifests/recordings_fisher.jsonl.gz
|
||||||
gzip -d data/manifests/supervisions_fisher.jsonl.gz
|
gzip -d data/manifests/supervisions_fisher.jsonl.gz
|
||||||
@ -91,10 +91,10 @@ if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
|
|||||||
log "Stage 2: Prepare SWBD manifests"
|
log "Stage 2: Prepare SWBD manifests"
|
||||||
mkdir -p data/manifests/swbd
|
mkdir -p data/manifests/swbd
|
||||||
lhotse prepare switchboard --absolute-paths 1 --omit-silence $dl_dir/LDC97S62 data/manifests/swbd
|
lhotse prepare switchboard --absolute-paths 1 --omit-silence $dl_dir/LDC97S62 data/manifests/swbd
|
||||||
python3 local/normalize_and_filter_supervisions.py data/manifests/swbd/swbd_supervisions_all.jsonl.gz data/manifests/supervisions_swbd.jsonl.gz
|
python3 local/normalize_and_filter_supervisions.py data/manifests/swbd/swbd_supervisions.jsonl data/manifests/supervisions_swbd.jsonl
|
||||||
cp data/manifests/swbd/swbd_recordings_all.jsonl.gz data/manifests/recordings_swbd.jsonl.gz
|
cp data/manifests/swbd/swbd_recordings.jsonl data/manifests/recordings_swbd.jsonl
|
||||||
gzip -d data/manifests/supervisions_swbd.jsonl.gz
|
# gzip -d data/manifests/supervisions_swbd.jsonl.gz
|
||||||
gzip -d data/manifests/recordings_swbd.jsonl.gz
|
# gzip -d data/manifests/recordings_swbd.jsonl.gz
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
|
if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
|
||||||
@ -116,16 +116,16 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
|
|||||||
#####################################
|
#####################################
|
||||||
#fisher
|
#fisher
|
||||||
#####################################
|
#####################################
|
||||||
|
|
||||||
gzip -d data/fbank/cuts_fisher.json.gz
|
gzip -d data/fbank/cuts_fisher.json.gz
|
||||||
jq -c '.[]' data/fbank/cuts_fisher.json > data/fbank/cuts_fisher.jsonl
|
jq -c '.[]' data/fbank/cuts_fisher.json > data/fbank/cuts_fisher.jsonl
|
||||||
gzip -c data/fbank/cuts_fisher.jsonl > data/fbank/cuts_fisher.jsonl.gz
|
gzip -c data/fbank/cuts_fisher.jsonl > data/fbank/cuts_fisher.jsonl.gz
|
||||||
|
|
||||||
# extract list of sph
|
# extract list of sph
|
||||||
python3 local/extract_list_of_sph.py data/fbank/cuts_fisher.jsonl | sort | uniq > data/fbank/cuts_fisher_sph.list
|
python3 local/extract_list_of_sph.py data/fbank/cuts_fisher.jsonl | sort | uniq > data/fbank/cuts_fisher_sph.list
|
||||||
|
|
||||||
num_fisher_total_session=$(wc -l <data/fbank/cuts_fisher_sph.list)
|
num_fisher_total_session=$(wc -l <data/fbank/cuts_fisher_sph.list)
|
||||||
num_fisher_dev_session=10
|
num_fisher_dev_session=10
|
||||||
num_fisher_train_session=$(($num_fisher_total_session - $num_fisher_dev_session))
|
num_fisher_train_session=$(($num_fisher_total_session - $num_fisher_dev_session))
|
||||||
head -n $num_fisher_dev_session data/fbank/cuts_fisher_sph.list >data/fbank/cuts_fisher_sph_dev.list
|
head -n $num_fisher_dev_session data/fbank/cuts_fisher_sph.list >data/fbank/cuts_fisher_sph_dev.list
|
||||||
tail -n $num_fisher_train_session data/fbank/cuts_fisher_sph.list >data/fbank/cuts_fisher_sph_train.list
|
tail -n $num_fisher_train_session data/fbank/cuts_fisher_sph.list >data/fbank/cuts_fisher_sph_train.list
|
||||||
@ -137,7 +137,7 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
|
|||||||
# extract train json
|
# extract train json
|
||||||
python3 local/extract_json_cuts.py data/fbank/cuts_fisher_sph_train.list data/fbank/cuts_fisher.jsonl data/fbank/train_cuts_fisher.jsonl
|
python3 local/extract_json_cuts.py data/fbank/cuts_fisher_sph_train.list data/fbank/cuts_fisher.jsonl data/fbank/train_cuts_fisher.jsonl
|
||||||
gzip -c data/fbank/train_cuts_fisher.jsonl > data/fbank/train_cuts_fisher.jsonl.gz
|
gzip -c data/fbank/train_cuts_fisher.jsonl > data/fbank/train_cuts_fisher.jsonl.gz
|
||||||
|
|
||||||
# describe cut
|
# describe cut
|
||||||
lhotse cut describe data/fbank/train_cuts_fisher.jsonl.gz
|
lhotse cut describe data/fbank/train_cuts_fisher.jsonl.gz
|
||||||
lhotse cut describe data/fbank/dev_cuts_fisher.jsonl.gz
|
lhotse cut describe data/fbank/dev_cuts_fisher.jsonl.gz
|
||||||
@ -145,7 +145,7 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
|
|||||||
# extract dev supervision
|
# extract dev supervision
|
||||||
python local/extract_json_supervision.py data/fbank/cuts_fisher_sph_dev.list data/manifests/supervisions_fisher.jsonl data/manifests/dev_supervisions_fisher.jsonl
|
python local/extract_json_supervision.py data/fbank/cuts_fisher_sph_dev.list data/manifests/supervisions_fisher.jsonl data/manifests/dev_supervisions_fisher.jsonl
|
||||||
python local/extract_json_supervision.py data/fbank/cuts_fisher_sph_train.list data/manifests/supervisions_fisher.jsonl data/manifests/train_supervisions_fisher.jsonl
|
python local/extract_json_supervision.py data/fbank/cuts_fisher_sph_train.list data/manifests/supervisions_fisher.jsonl data/manifests/train_supervisions_fisher.jsonl
|
||||||
|
|
||||||
######################################
|
######################################
|
||||||
#swbd
|
#swbd
|
||||||
######################################
|
######################################
|
||||||
@ -156,7 +156,7 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
|
|||||||
|
|
||||||
python3 local/extract_list_of_sph.py data/fbank/cuts_swbd.jsonl| sort | uniq > data/fbank/cuts_swbd_sph.list
|
python3 local/extract_list_of_sph.py data/fbank/cuts_swbd.jsonl| sort | uniq > data/fbank/cuts_swbd_sph.list
|
||||||
num_swbd_total_session=$(wc -l <data/fbank/cuts_swbd_sph.list)
|
num_swbd_total_session=$(wc -l <data/fbank/cuts_swbd_sph.list)
|
||||||
num_swbd_dev_session=10
|
num_swbd_dev_session=10
|
||||||
num_swbd_train_session=$(($num_swbd_total_session - $num_swbd_dev_session))
|
num_swbd_train_session=$(($num_swbd_total_session - $num_swbd_dev_session))
|
||||||
|
|
||||||
head -n $num_swbd_dev_session data/fbank/cuts_swbd_sph.list >data/fbank/cuts_swbd_sph_dev.list
|
head -n $num_swbd_dev_session data/fbank/cuts_swbd_sph.list >data/fbank/cuts_swbd_sph_dev.list
|
||||||
@ -168,22 +168,22 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
|
|||||||
|
|
||||||
python3 local/extract_json_cuts.py data/fbank/cuts_swbd_sph_train.list data/fbank/cuts_swbd.jsonl data/fbank/train_cuts_swbd.jsonl
|
python3 local/extract_json_cuts.py data/fbank/cuts_swbd_sph_train.list data/fbank/cuts_swbd.jsonl data/fbank/train_cuts_swbd.jsonl
|
||||||
gzip -c data/fbank/train_cuts_swbd.jsonl > data/fbank/train_cuts_swbd.jsonl.gz
|
gzip -c data/fbank/train_cuts_swbd.jsonl > data/fbank/train_cuts_swbd.jsonl.gz
|
||||||
|
|
||||||
# describe cut
|
# describe cut
|
||||||
lhotse cut describe data/fbank/train_cuts_swbd.jsonl.gz
|
lhotse cut describe data/fbank/train_cuts_swbd.jsonl.gz
|
||||||
lhotse cut describe data/fbank/dev_cuts_swbd.jsonl.gz
|
lhotse cut describe data/fbank/dev_cuts_swbd.jsonl.gz
|
||||||
|
|
||||||
# extract dev supervision
|
# extract dev supervision
|
||||||
python local/extract_json_supervision.py data/fbank/cuts_swbd_sph_dev.list data/manifests/supervisions_swbd.jsonl data/manifests/dev_supervisions_swbd.jsonl
|
python local/extract_json_supervision.py data/fbank/cuts_swbd_sph_dev.list data/manifests/supervisions_swbd.jsonl data/manifests/dev_supervisions_swbd.jsonl
|
||||||
python local/extract_json_supervision.py data/fbank/cuts_swbd_sph_train.list data/manifests/supervisions_swbd.jsonl data/manifests/train_supervisions_swbd.jsonl
|
python local/extract_json_supervision.py data/fbank/cuts_swbd_sph_train.list data/manifests/supervisions_swbd.jsonl data/manifests/train_supervisions_swbd.jsonl
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
|
if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
|
||||||
log "Stage 3: Prepare musan manifest"
|
log "Stage 3: Prepare musan manifest"
|
||||||
# We assume that you have downloaded the musan corpus
|
# We assume that you have downloaded the musan corpus
|
||||||
# to data/musan
|
# to data/musan
|
||||||
mkdir -p data/manifests/musan
|
mkdir -p data/manifests/musan
|
||||||
lhotse prepare musan $dl_dir/musan data/manifests/musan
|
lhotse prepare musan $dl_dir/musan data/manifests/musan
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
|
if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
|
||||||
|
Loading…
x
Reference in New Issue
Block a user