mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-26 18:24:18 +00:00
some bugfixes
This commit is contained in:
parent
83e2b30a22
commit
7e1e9f8da3
24
egs/fisher_swbd/ASR/local/normalize_and_filter_supervisions.py
Normal file → Executable file
24
egs/fisher_swbd/ASR/local/normalize_and_filter_supervisions.py
Normal file → Executable file
@ -4,10 +4,9 @@ import argparse
|
||||
import re
|
||||
from typing import Tuple
|
||||
|
||||
from tqdm import tqdm
|
||||
|
||||
from lhotse import SupervisionSet, SupervisionSegment
|
||||
from lhotse import SupervisionSegment, SupervisionSet
|
||||
from lhotse.serialization import load_manifest_lazy_or_eager
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
def get_args():
|
||||
@ -19,19 +18,14 @@ def get_args():
|
||||
|
||||
# fmt: off
|
||||
class FisherSwbdNormalizer:
|
||||
"""Note: the functions "normalize" and "keep" implement the logic
|
||||
similar to Kaldi's data prep scripts for Fisher and SWBD: One
|
||||
notable difference is that we don't change [cough], [lipsmack],
|
||||
etc. to [noise]. We also don't implement all the edge cases of
|
||||
normalization from Kaldi (hopefully won't make too much
|
||||
difference).
|
||||
|
||||
"""
|
||||
Note: the functions "normalize" and "keep" implement the logic similar to
|
||||
Kaldi's data prep scripts for Fisher:
|
||||
https://github.com/kaldi-asr/kaldi/blob/master/egs/fisher_swbd/s5/local/fisher_data_prep.sh
|
||||
and for SWBD:
|
||||
https://github.com/kaldi-asr/kaldi/blob/master/egs/fisher_swbd/s5/local/swbd1_data_prep.sh
|
||||
|
||||
One notable difference is that we don't change [cough], [lipsmack], etc. to [noise].
|
||||
We also don't implement all the edge cases of normalization from Kaldi
|
||||
(hopefully won't make too much difference).
|
||||
"""
|
||||
|
||||
|
||||
def __init__(self) -> None:
|
||||
|
||||
self.remove_regexp_before = re.compile(
|
||||
|
@ -28,8 +28,8 @@ stop_stage=500
|
||||
# - noise
|
||||
# - speech
|
||||
|
||||
dl_dir=$PWD/download
|
||||
mkdir -p $dl_dir
|
||||
dl_dir=/mnt/dsk2
|
||||
#mkdir -p $dl_dir
|
||||
|
||||
. shared/parse_options.sh || exit 1
|
||||
|
||||
@ -62,25 +62,25 @@ if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
|
||||
#
|
||||
|
||||
# TODO: remove
|
||||
LDC_ROOT=/nas/data4/DATA
|
||||
for pkg in LDC2004S13 LDC2004T19 LDC2005S13 LDC2005T19 LDC97S62 LDC2002S09 LDC2002T43; do
|
||||
ln -sfv $LDC_ROOT/$pkg $dl_dir/
|
||||
done
|
||||
# LDC_ROOT=/nas/data4/DATA
|
||||
# for pkg in LDC2004S13 LDC2004T19 LDC2005S13 LDC2005T19 LDC97S62 LDC2002S09 LDC2002T43; do
|
||||
# ln -sfv $LDC_ROOT/$pkg $dl_dir/
|
||||
# done
|
||||
|
||||
# If you have pre-downloaded it to /path/to/musan,
|
||||
# you can create a symlink
|
||||
#
|
||||
# ln -sfv /path/to/musan $dl_dir/
|
||||
#
|
||||
if [ ! -d $dl_dir/musan ]; then
|
||||
lhotse download musan $dl_dir
|
||||
fi
|
||||
# if [ ! -d $dl_dir/musan ]; then
|
||||
# lhotse download musan $dl_dir
|
||||
# fi
|
||||
fi
|
||||
|
||||
if [ $stage -le 1 ] && [ $stop_stage -ge 1 ] ; then
|
||||
log "Stage 1: Prepare Fisher manifests"
|
||||
mkdir -p data/manifests/fisher
|
||||
lhotse prepare fisher-english --absolute-paths 1 $dl_dir data/manifests/fisher
|
||||
# mkdir -p data/manifests/fisher
|
||||
# lhotse prepare fisher-english --absolute-paths 1 $dl_dir data/manifests/fisher
|
||||
local/normalize_and_filter_supervisions.py data/manifests/fisher/supervisions.jsonl.gz data/manifests/supervisions_fisher.jsonl.gz
|
||||
cp data/manifests/fisher/recordings.jsonl.gz data/manifests/recordings_fisher.jsonl.gz
|
||||
gzip -d data/manifests/supervisions_fisher.jsonl.gz
|
||||
@ -91,10 +91,10 @@ if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
|
||||
log "Stage 2: Prepare SWBD manifests"
|
||||
mkdir -p data/manifests/swbd
|
||||
lhotse prepare switchboard --absolute-paths 1 --omit-silence $dl_dir/LDC97S62 data/manifests/swbd
|
||||
python3 local/normalize_and_filter_supervisions.py data/manifests/swbd/swbd_supervisions_all.jsonl.gz data/manifests/supervisions_swbd.jsonl.gz
|
||||
cp data/manifests/swbd/swbd_recordings_all.jsonl.gz data/manifests/recordings_swbd.jsonl.gz
|
||||
gzip -d data/manifests/supervisions_swbd.jsonl.gz
|
||||
gzip -d data/manifests/recordings_swbd.jsonl.gz
|
||||
python3 local/normalize_and_filter_supervisions.py data/manifests/swbd/swbd_supervisions.jsonl data/manifests/supervisions_swbd.jsonl
|
||||
cp data/manifests/swbd/swbd_recordings.jsonl data/manifests/recordings_swbd.jsonl
|
||||
# gzip -d data/manifests/supervisions_swbd.jsonl.gz
|
||||
# gzip -d data/manifests/recordings_swbd.jsonl.gz
|
||||
fi
|
||||
|
||||
if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
|
||||
|
Loading…
x
Reference in New Issue
Block a user