k2-fsa/icefall (mirror of https://github.com/k2-fsa/icefall.git)

commit 7cbd6d11ba (parent fb1e2ffdc1)

    Finish preparing training datasets.
egs/librispeech/ASR/prepare_giga_speech.sh (executable file, 109 added lines)
@@ -0,0 +1,109 @@
#!/usr/bin/env bash

set -eou pipefail

nj=15
stage=-1
stop_stage=100

# We assume dl_dir (download dir) contains the following
# directories and files. If not, they will be downloaded
# by this script automatically.
#
#  - $dl_dir/GigaSpeech
#      You can find audio, dict, GigaSpeech.json inside it.
#      You can apply for the download credentials by following
#      https://github.com/SpeechColab/GigaSpeech#download
#
# Number of hours for GigaSpeech subsets:
#  XL   10k hours
#  L    2.5k hours
#  M    1k hours
#  S    250 hours
#  XS   10 hours
#  DEV  12 hours
#  TEST 40 hours

dl_dir=$PWD/download

. shared/parse_options.sh || exit 1

# All files generated by this script are saved in "data".
# You can safely remove "data" and rerun this script to regenerate it.
mkdir -p data

log() {
  # This function is from espnet
  local fname=${BASH_SOURCE[1]##*/}
  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}

log "dl_dir: $dl_dir"

if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
  log "Stage 0: Download data"

  [ ! -e $dl_dir/GigaSpeech ] && mkdir -p $dl_dir/GigaSpeech

  # If you have pre-downloaded it to /path/to/GigaSpeech,
  # you can create a symlink
  #
  #   ln -sfv /path/to/GigaSpeech $dl_dir/GigaSpeech
  #
  if [ ! -d $dl_dir/GigaSpeech/audio ] && [ ! -f $dl_dir/GigaSpeech/GigaSpeech.json ]; then
    # Check credentials.
    if [ ! -f $dl_dir/password ]; then
      echo -n "$0: Please apply for the download credentials by following "
      echo -n "https://github.com/SpeechColab/GigaSpeech#dataset-download"
      echo " and save it to $dl_dir/password."
      exit 1
    fi
    PASSWORD=$(cat $dl_dir/password 2>/dev/null)
    if [ -z "$PASSWORD" ]; then
      echo "$0: Error, $dl_dir/password is empty."
      exit 1
    fi
    PASSWORD_MD5=$(echo $PASSWORD | md5sum | cut -d ' ' -f 1)
    if [[ $PASSWORD_MD5 != "dfbf0cde1a3ce23749d8d81e492741b8" ]]; then
      echo "$0: Error, invalid $dl_dir/password."
      exit 1
    fi
    # Download all subsets: XL, L, M, S, XS, DEV, and TEST.
    lhotse download gigaspeech \
      --subset XL \
      --subset L \
      --subset M \
      --subset S \
      --subset XS \
      --subset DEV \
      --subset TEST \
      --host tsinghua \
      $dl_dir/password $dl_dir/GigaSpeech
  fi
fi

if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
  log "Stage 1: Prepare GigaSpeech manifests (may take 30 minutes)"
  # We assume that you have downloaded the GigaSpeech corpus
  # to $dl_dir/GigaSpeech
  mkdir -p data/manifests
  lhotse prepare gigaspeech \
    --subset XL \
    --subset L \
    --subset M \
    --subset S \
    --subset XS \
    --subset DEV \
    --subset TEST \
    -j $nj \
    $dl_dir/GigaSpeech data/manifests
fi

if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
  log "Stage 2: Preprocess GigaSpeech manifests"
  if [ ! -f data/fbank/.preprocess_complete ]; then
    log "This stage may take 2 hours"
    python3 ./local/preprocess_gigaspeech.py
    touch data/fbank/.preprocess_complete
  fi
fi
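The credential check above compares the MD5 digest of the password against a hard-coded value. One subtlety: `echo $PASSWORD | md5sum` hashes the password followed by the trailing newline that `echo` appends. Below is a minimal Python sketch of the same check, not part of the commit; only the digest constant comes from the script, everything else is illustrative.

import hashlib

EXPECTED_MD5 = "dfbf0cde1a3ce23749d8d81e492741b8"  # hard-coded in the script above

def password_is_valid(password: str) -> bool:
    # `echo $PASSWORD | md5sum` hashes the password plus a trailing "\n",
    # so we must include the newline here as well.
    digest = hashlib.md5((password + "\n").encode("utf-8")).hexdigest()
    return digest == EXPECTED_MD5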
egs/librispeech/ASR/transducer_stateless_multi_datasets/asr_datamodule.py

@@ -16,12 +16,28 @@
 # limitations under the License.

 import argparse
 import logging
 from pathlib import Path
 from typing import Optional

+from lhotse import CutSet, Fbank, FbankConfig
+from lhotse.dataset import (
+    BucketingSampler,
+    CutMix,
+    DynamicBucketingSampler,
+    K2SpeechRecognitionDataset,
+    SpecAugment,
+)
+from lhotse.dataset.input_strategies import (
+    OnTheFlyFeatures,
+    PrecomputedFeatures,
+)
 from torch.utils.data import DataLoader

-from lhotse import CutSet
 from icefall.utils import str2bool


-class AsrDataset:
+class AsrDataModule:
     def __init__(self, args: argparse.Namespace):
         self.args = args
@@ -55,19 +71,11 @@ class AsrDataset:
             "--num-buckets",
             type=int,
             default=30,
-            help="The number of buckets for the BucketingSampler"
+            help="The number of buckets for the BucketingSampler "
+            "and DynamicBucketingSampler. "
             "(you might want to increase it for larger datasets).",
         )

-        group.add_argument(
-            "--on-the-fly-feats",
-            type=str2bool,
-            default=False,
-            help="When enabled, use on-the-fly cut mixing and feature "
-            "extraction. Will drop existing precomputed feature manifests "
-            "if available.",
-        )
-
         group.add_argument(
             "--shuffle",
             type=str2bool,
@@ -126,8 +134,25 @@ class AsrDataset:
         )

     def train_dataloaders(
-        self, cuts_train: CutSet, cuts_musan: Optional[CutSet] = None
+        self,
+        cuts_train: CutSet,
+        dynamic_bucketing: bool,
+        on_the_fly_feats: bool,
+        cuts_musan: Optional[CutSet] = None,
     ) -> DataLoader:
+        """
+        Args:
+          cuts_train:
+            Cuts for training.
+          dynamic_bucketing:
+            True to use DynamicBucketingSampler;
+            False to use BucketingSampler.
+          on_the_fly_feats:
+            True to use OnTheFlyFeatures;
+            False to use PrecomputedFeatures.
+          cuts_musan:
+            If not None, it is the cuts for mixing.
+        """
         transforms = []
         if cuts_musan is not None:
             logging.info("Enable MUSAN")
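With this signature, the feature strategy and sampler type are chosen per call rather than through the removed module-wide --on-the-fly-feats flag, so one data module can serve differently configured corpora. A usage sketch; the variable names mirror test_asr_datamodule.py added later in this commit:

# Precomputed features + static bucketing for LibriSpeech,
# on-the-fly features + dynamic bucketing for GigaSpeech.
libri_dl = asr_datamodule.train_dataloaders(
    train_clean_100,
    dynamic_bucketing=False,
    on_the_fly_feats=False,
    cuts_musan=cuts_musan,
)
giga_dl = asr_datamodule.train_dataloaders(
    train_S,
    dynamic_bucketing=True,
    on_the_fly_feats=True,
    cuts_musan=cuts_musan,
)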
@@ -177,21 +202,34 @@ class AsrDataset:
             # Drop feats to be on the safe side.
             train = K2SpeechRecognitionDataset(
                 cut_transforms=transforms,
-                input_strategy=OnTheFlyFeatures(
-                    Fbank(FbankConfig(num_mel_bins=80))
+                input_strategy=(
+                    OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80)))
+                    if on_the_fly_feats
+                    else PrecomputedFeatures()
                 ),
                 input_transforms=input_transforms,
                 return_cuts=self.args.return_cuts,
             )

-        logging.info("Using DynamicBucketingSampler.")
-        train_sampler = DynamicBucketingSampler(
-            cuts_train,
-            max_duration=self.args.max_duration,
-            shuffle=self.args.shuffle,
-            num_buckets=self.args.num_buckets,
-            drop_last=True,
-        )
+        if dynamic_bucketing:
+            logging.info("Using DynamicBucketingSampler.")
+            train_sampler = DynamicBucketingSampler(
+                cuts_train,
+                max_duration=self.args.max_duration,
+                shuffle=self.args.shuffle,
+                num_buckets=self.args.num_buckets,
+                drop_last=True,
+            )
+        else:
+            logging.info("Using BucketingSampler.")
+            train_sampler = BucketingSampler(
+                cuts_train,
+                max_duration=self.args.max_duration,
+                shuffle=self.args.shuffle,
+                num_buckets=self.args.num_buckets,
+                bucket_method="equal_duration",
+                drop_last=True,
+            )

         logging.info("About to create train dataloader")
         train_dl = DataLoader(
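The branch matters because BucketingSampler needs to see the whole cut list up front to assign buckets, while DynamicBucketingSampler estimates duration buckets from the cuts as they stream by, which is what makes it usable with lazily opened manifests such as GigaSpeech XL. A minimal sketch, assuming the raw XL manifest produced by prepare_giga_speech.sh exists at the path shown:

from lhotse import CutSet
from lhotse.dataset import DynamicBucketingSampler

# Lazy CutSet: cuts are read from disk on demand, never all in memory.
cuts_xl = CutSet.from_jsonl_lazy("data/manifests/cuts_XL_raw.jsonl.gz")

sampler = DynamicBucketingSampler(
    cuts_xl,
    max_duration=200.0,  # seconds of audio per batch, like --max-duration
    shuffle=True,
    num_buckets=30,
    drop_last=True,
)
for cuts_batch in sampler:
    ...  # each item is a batch of cuts whose total duration fits max_duration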
egs/librispeech/ASR/transducer_stateless_multi_datasets/gigaspeech.py

@@ -17,7 +17,7 @@

 import logging
-from typing import Path
+from pathlib import Path

 from lhotse import CutSet, load_manifest
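This import fix is not cosmetic: the typing module has never exported Path, so the old line fails the moment the module is imported (the same fix appears again below). A one-line check:

from pathlib import Path   # correct home of Path
# from typing import Path  # would raise ImportError at module load time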
@@ -29,29 +29,47 @@ class GigaSpeech:
           manifest_dir:
             It is expected to contain the following files::

-                - cuts_L.jsonl.gz
-                - cuts_XL.jsonl.gz
                 - cuts_TEST.jsonl.gz
                 - cuts_DEV.jsonl.gz
+                - cuts_XL_raw.jsonl.gz
+                - cuts_L_raw.jsonl.gz
+                - cuts_M_raw.jsonl.gz
+                - cuts_S_raw.jsonl.gz
+                - cuts_XS_raw.jsonl.gz
+                - cuts_DEV_raw.jsonl.gz
+                - cuts_TEST_raw.jsonl.gz
         """
         self.manifest_dir = Path(manifest_dir)

-    def train_L_cuts(self) -> CutSet:
-        f = self.manifest_dir / "cuts_L.json.gz"
-        logging.info(f"About to get train-L cuts from {f}")
-        return CutSet.from_jsonl_lazy(f)
-
     def train_XL_cuts(self) -> CutSet:
-        f = self.manifest_dir / "cuts_XL.json.gz"
+        f = self.manifest_dir / "cuts_XL_raw.jsonl.gz"
         logging.info(f"About to get train-XL cuts from {f}")
         return CutSet.from_jsonl_lazy(f)

+    def train_L_cuts(self) -> CutSet:
+        f = self.manifest_dir / "cuts_L_raw.jsonl.gz"
+        logging.info(f"About to get train-L cuts from {f}")
+        return CutSet.from_jsonl_lazy(f)
+
+    def train_M_cuts(self) -> CutSet:
+        f = self.manifest_dir / "cuts_M_raw.jsonl.gz"
+        logging.info(f"About to get train-M cuts from {f}")
+        return CutSet.from_jsonl_lazy(f)
+
+    def train_S_cuts(self) -> CutSet:
+        f = self.manifest_dir / "cuts_S_raw.jsonl.gz"
+        logging.info(f"About to get train-S cuts from {f}")
+        return CutSet.from_jsonl_lazy(f)
+
+    def train_XS_cuts(self) -> CutSet:
+        f = self.manifest_dir / "cuts_XS_raw.jsonl.gz"
+        logging.info(f"About to get train-XS cuts from {f}")
+        return CutSet.from_jsonl_lazy(f)
+
     def test_cuts(self) -> CutSet:
-        f = self.manifest_dir / "cuts_TEST.json.gz"
+        f = self.manifest_dir / "cuts_TEST.jsonl.gz"
         logging.info(f"About to get TEST cuts from {f}")
         return load_manifest(f)

     def dev_cuts(self) -> CutSet:
-        f = self.manifest_dir / "cuts_DEV.json.gz"
+        f = self.manifest_dir / "cuts_DEV.jsonl.gz"
         logging.info(f"About to get DEV cuts from {f}")
         return load_manifest(f)
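The split between CutSet.from_jsonl_lazy for the training subsets and load_manifest for DEV/TEST mirrors their sizes: the raw XL manifest describes roughly 10k hours and is better streamed from disk, while the 12-hour DEV and 40-hour TEST sets fit comfortably in memory. A small sketch of the difference; the paths assume the layout produced by prepare_giga_speech.sh:

from lhotse import CutSet, load_manifest

# Lazy: opens the gzipped JSONL and yields cuts as they are read.
cuts_xl = CutSet.from_jsonl_lazy("data/manifests/cuts_XL_raw.jsonl.gz")

# Eager: parses the whole manifest into memory at once.
cuts_dev = load_manifest("data/manifests/cuts_DEV.jsonl.gz")
print(len(cuts_dev))  # eager sets know their length; lazy ones may not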
egs/librispeech/ASR/transducer_stateless_multi_datasets/librispeech.py

@@ -16,7 +16,7 @@
 # limitations under the License.

 import logging
-from typing import Path
+from pathlib import Path

 from lhotse import CutSet, load_manifest
egs/librispeech/ASR/transducer_stateless_multi_datasets/test_asr_datamodule.py (executable file, 103 added lines)
@@ -0,0 +1,103 @@
#!/usr/bin/env python3
# Copyright      2022  Xiaomi Corp.        (authors: Fangjun Kuang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
To run this file, do:

    cd icefall/egs/librispeech/ASR
    python ./transducer_stateless_multi_datasets/test_asr_datamodule.py
"""

import argparse
import random
from pathlib import Path

from asr_datamodule import AsrDataModule
from gigaspeech import GigaSpeech
from lhotse import load_manifest
from librispeech import LibriSpeech


def test_dataset():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    AsrDataModule.add_arguments(parser)
    args = parser.parse_args()
    print(args)

    if args.enable_musan:
        cuts_musan = load_manifest(
            Path(args.manifest_dir) / "cuts_musan.json.gz"
        )
    else:
        cuts_musan = None

    librispeech = LibriSpeech(manifest_dir=args.manifest_dir)
    gigaspeech = GigaSpeech(manifest_dir=args.manifest_dir)

    train_clean_100 = librispeech.train_clean_100_cuts()
    train_S = gigaspeech.train_S_cuts()

    asr_datamodule = AsrDataModule(args)

    libri_train_dl = asr_datamodule.train_dataloaders(
        train_clean_100,
        dynamic_bucketing=False,
        on_the_fly_feats=False,
        cuts_musan=cuts_musan,
    )

    giga_train_dl = asr_datamodule.train_dataloaders(
        train_S,
        dynamic_bucketing=True,
        on_the_fly_feats=True,
        cuts_musan=cuts_musan,
    )

    seed = 20220216
    rng = random.Random(seed)

    for epoch in range(2):
        print("epoch", epoch)
        batch_idx = 0
        libri_train_dl.sampler.set_epoch(epoch)
        giga_train_dl.sampler.set_epoch(epoch)

        iter_libri = iter(libri_train_dl)
        iter_giga = iter(giga_train_dl)
        while True:
            # Draw LibriSpeech with probability 0.8, GigaSpeech with 0.2.
            idx = rng.choices((0, 1), weights=[0.8, 0.2], k=1)[0]
            dl = iter_libri if idx == 0 else iter_giga
            batch_idx += 1

            print("dl idx", idx, "batch_idx", batch_idx)
            batch = next(dl)
            cuts = batch["supervisions"]["cut"]
            for c in cuts:
                print(c.id)

            if batch_idx > 10:
                break


def main():
    test_dataset()


if __name__ == "__main__":
    main()
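The test loop interleaves the two dataloaders by weighted random choice and stops after 11 batches, so it never has to worry about either iterator running dry. A sketch of the same mixing pattern packaged as a reusable generator with explicit exhaustion handling for longer runs; the function name and the stop-on-StopIteration policy are illustrative, not part of the commit:

import random
from typing import Iterator

def mixed_batches(dl_a, dl_b, p_a: float = 0.8, seed: int = 20220216) -> Iterator:
    """Yield batches from dl_a with probability p_a, else from dl_b,
    until either dataloader is exhausted."""
    rng = random.Random(seed)
    it_a, it_b = iter(dl_a), iter(dl_b)
    while True:
        it = it_a if rng.random() < p_a else it_b
        try:
            yield next(it)
        except StopIteration:
            return  # one stream ran out; stop mixing rather than crash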