Finish preparing training datasets.
This commit is contained in:
parent fb1e2ffdc1
commit 7cbd6d11ba

egs/librispeech/ASR/prepare_giga_speech.sh (new executable file, 109 lines)
@@ -0,0 +1,109 @@
#!/usr/bin/env bash

set -eou pipefail

nj=15
stage=-1
stop_stage=100

# We assume dl_dir (download dir) contains the following
# directories and files. If not, they will be downloaded
# by this script automatically.
#
#  - $dl_dir/GigaSpeech
#      You can find audio, dict, GigaSpeech.json inside it.
#      You can apply for the download credentials by following
#      https://github.com/SpeechColab/GigaSpeech#download

# Number of hours for GigaSpeech subsets
#  XL:   10k hours
#  L:    2.5k hours
#  M:    1k hours
#  S:    250 hours
#  XS:   10 hours
#  DEV:  12 hours
#  TEST: 40 hours

dl_dir=$PWD/download

. shared/parse_options.sh || exit 1

# All files generated by this script are saved in "data".
# You can safely remove "data" and rerun this script to regenerate it.
mkdir -p data

log() {
  # This function is from espnet
  local fname=${BASH_SOURCE[1]##*/}
  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}

log "dl_dir: $dl_dir"

if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
  log "Stage 0: Download data"

  [ ! -e $dl_dir/GigaSpeech ] && mkdir -p $dl_dir/GigaSpeech

  # If you have pre-downloaded it to /path/to/GigaSpeech,
  # you can create a symlink
  #
  #   ln -sfv /path/to/GigaSpeech $dl_dir/GigaSpeech
  #
  if [ ! -d $dl_dir/GigaSpeech/audio ] && [ ! -f $dl_dir/GigaSpeech/GigaSpeech.json ]; then
    # Check credentials.
    if [ ! -f $dl_dir/password ]; then
      echo -n "$0: Please apply for the download credentials by following "
      echo -n "https://github.com/SpeechColab/GigaSpeech#dataset-download"
      echo " and save it to $dl_dir/password."
      exit 1;
    fi
    PASSWORD=$(cat $dl_dir/password 2>/dev/null)
    if [ -z "$PASSWORD" ]; then
      echo "$0: Error, $dl_dir/password is empty."
      exit 1;
    fi
    PASSWORD_MD5=$(echo $PASSWORD | md5sum | cut -d ' ' -f 1)
    if [[ $PASSWORD_MD5 != "dfbf0cde1a3ce23749d8d81e492741b8" ]]; then
      echo "$0: Error, invalid $dl_dir/password."
      exit 1;
    fi
    # Download all subsets (XL, L, M, S, XS) plus DEV and TEST.
    lhotse download gigaspeech \
      --subset XL \
      --subset L \
      --subset M \
      --subset S \
      --subset XS \
      --subset DEV \
      --subset TEST \
      --host tsinghua \
      $dl_dir/password $dl_dir/GigaSpeech
  fi
fi

if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
  log "Stage 1: Prepare GigaSpeech manifest (may take 30 minutes)"
  # We assume that you have downloaded the GigaSpeech corpus
  # to $dl_dir/GigaSpeech
  mkdir -p data/manifests
  lhotse prepare gigaspeech \
    --subset XL \
    --subset L \
    --subset M \
    --subset S \
    --subset XS \
    --subset DEV \
    --subset TEST \
    -j $nj \
    $dl_dir/GigaSpeech data/manifests
fi

if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
  log "Stage 2: Preprocess GigaSpeech manifest"
  if [ ! -f data/fbank/.preprocess_complete ]; then
    log "This stage may take about 2 hours"
    python3 ./local/preprocess_gigaspeech.py
    touch data/fbank/.preprocess_complete
  fi
fi
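
For reference: shared/parse_options.sh is the usual Kaldi-style option parser, so the variables defined at the top of the script (nj, stage, stop_stage, dl_dir) can all be overridden from the command line. For example, ./prepare_giga_speech.sh --stage 1 --stop-stage 2 skips the download and runs only the manifest preparation and preprocessing stages.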

@@ -16,12 +16,28 @@
 # limitations under the License.

 import argparse
+import logging
+from pathlib import Path
+from typing import Optional
+
+from lhotse import CutSet, Fbank, FbankConfig
+from lhotse.dataset import (
+    BucketingSampler,
+    CutMix,
+    DynamicBucketingSampler,
+    K2SpeechRecognitionDataset,
+    SpecAugment,
+)
+from lhotse.dataset.input_strategies import (
+    OnTheFlyFeatures,
+    PrecomputedFeatures,
+)
+from torch.utils.data import DataLoader

-from lhotse import CutSet
 from icefall.utils import str2bool


-class AsrDataset:
+class AsrDataModule:
     def __init__(self, args: argparse.Namespace):
         self.args = args

@@ -55,19 +71,11 @@ class AsrDataset:
             "--num-buckets",
             type=int,
             default=30,
-            help="The number of buckets for the BucketingSampler"
+            help="The number of buckets for the BucketingSampler "
+            "and DynamicBucketingSampler. "
             "(you might want to increase it for larger datasets).",
         )

-        group.add_argument(
-            "--on-the-fly-feats",
-            type=str2bool,
-            default=False,
-            help="When enabled, use on-the-fly cut mixing and feature "
-            "extraction. Will drop existing precomputed feature manifests "
-            "if available.",
-        )
-
         group.add_argument(
             "--shuffle",
             type=str2bool,
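
Note that --on-the-fly-feats is removed as a command-line flag rather than renamed: in the next hunk it reappears as an explicit on_the_fly_feats argument to train_dataloaders(), so each corpus in a multi-dataset setup can decide independently (the test file below passes False for LibriSpeech and True for GigaSpeech).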

@@ -126,8 +134,25 @@ class AsrDataset:
         )

     def train_dataloaders(
-        self, cuts_train: CutSet, cuts_musan: Optional[CutSet] = None
+        self,
+        cuts_train: CutSet,
+        dynamic_bucketing: bool,
+        on_the_fly_feats: bool,
+        cuts_musan: Optional[CutSet] = None,
     ) -> DataLoader:
+        """
+        Args:
+          cuts_train:
+            Cuts for training.
+          cuts_musan:
+            If not None, it is the cuts for mixing.
+          dynamic_bucketing:
+            True to use DynamicBucketingSampler;
+            False to use BucketingSampler.
+          on_the_fly_feats:
+            True to use OnTheFlyFeatures;
+            False to use PrecomputedFeatures.
+        """
         transforms = []
         if cuts_musan is not None:
             logging.info("Enable MUSAN")

@@ -177,21 +202,34 @@ class AsrDataset:
             # Drop feats to be on the safe side.
             train = K2SpeechRecognitionDataset(
                 cut_transforms=transforms,
-                input_strategy=OnTheFlyFeatures(
-                    Fbank(FbankConfig(num_mel_bins=80))
+                input_strategy=(
+                    OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80)))
+                    if on_the_fly_feats
+                    else PrecomputedFeatures()
                 ),
                 input_transforms=input_transforms,
                 return_cuts=self.args.return_cuts,
             )

-        logging.info("Using DynamicBucketingSampler.")
-        train_sampler = DynamicBucketingSampler(
-            cuts_train,
-            max_duration=self.args.max_duration,
-            shuffle=self.args.shuffle,
-            num_buckets=self.args.num_buckets,
-            drop_last=True,
-        )
+        if dynamic_bucketing:
+            logging.info("Using DynamicBucketingSampler.")
+            train_sampler = DynamicBucketingSampler(
+                cuts_train,
+                max_duration=self.args.max_duration,
+                shuffle=self.args.shuffle,
+                num_buckets=self.args.num_buckets,
+                drop_last=True,
+            )
+        else:
+            logging.info("Using BucketingSampler.")
+            train_sampler = BucketingSampler(
+                cuts_train,
+                max_duration=self.args.max_duration,
+                shuffle=self.args.shuffle,
+                num_buckets=self.args.num_buckets,
+                bucket_method="equal_duration",
+                drop_last=True,
+            )

         logging.info("About to create train dataloader")
         train_dl = DataLoader(
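
A minimal sketch of the sampler choice above, assuming the lhotse APIs imported earlier and using illustrative stand-ins for the args values: DynamicBucketingSampler infers bucket boundaries from a streamed sample of cuts, so it can consume a lazy (on-disk) CutSet such as the raw GigaSpeech manifests, while BucketingSampler partitions the full cut list up front.

from lhotse import CutSet
from lhotse.dataset import BucketingSampler, DynamicBucketingSampler


def make_train_sampler(cuts: CutSet, dynamic: bool):
    if dynamic:
        # Streams the CutSet and estimates bucket duration boundaries
        # from an initial sample, so lazy manifests work too.
        return DynamicBucketingSampler(
            cuts,
            max_duration=200.0,  # illustrative; the recipe uses args.max_duration
            shuffle=True,
            num_buckets=30,
            drop_last=True,
        )
    # Assigns every cut to a bucket up front, which requires the whole
    # (eager) cut list with known durations.
    return BucketingSampler(
        cuts,
        max_duration=200.0,
        shuffle=True,
        num_buckets=30,
        bucket_method="equal_duration",
        drop_last=True,
    )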

@@ -17,7 +17,7 @@


 import logging
-from typing import Path
+from pathlib import Path

 from lhotse import CutSet, load_manifest

@@ -29,29 +29,47 @@ class GigaSpeech:
           manifest_dir:
             It is expected to contain the following files::

-                - cuts_L.jsonl.gz
-                - cuts_XL.jsonl.gz
-                - cuts_TEST.jsonl.gz
-                - cuts_DEV.jsonl.gz
+                - cuts_XL_raw.jsonl.gz
+                - cuts_L_raw.jsonl.gz
+                - cuts_M_raw.jsonl.gz
+                - cuts_S_raw.jsonl.gz
+                - cuts_XS_raw.jsonl.gz
+                - cuts_DEV_raw.jsonl.gz
+                - cuts_TEST_raw.jsonl.gz
         """
         self.manifest_dir = Path(manifest_dir)

-    def train_L_cuts(self) -> CutSet:
-        f = self.manifest_dir / "cuts_L.json.gz"
-        logging.info(f"About to get train-L cuts from {f}")
-        return CutSet.from_jsonl_lazy(f)
-
     def train_XL_cuts(self) -> CutSet:
-        f = self.manifest_dir / "cuts_XL.json.gz"
+        f = self.manifest_dir / "cuts_XL_raw.jsonl.gz"
         logging.info(f"About to get train-XL cuts from {f}")
         return CutSet.from_jsonl_lazy(f)

+    def train_L_cuts(self) -> CutSet:
+        f = self.manifest_dir / "cuts_L_raw.jsonl.gz"
+        logging.info(f"About to get train-L cuts from {f}")
+        return CutSet.from_jsonl_lazy(f)
+
+    def train_M_cuts(self) -> CutSet:
+        f = self.manifest_dir / "cuts_M_raw.jsonl.gz"
+        logging.info(f"About to get train-M cuts from {f}")
+        return CutSet.from_jsonl_lazy(f)
+
+    def train_S_cuts(self) -> CutSet:
+        f = self.manifest_dir / "cuts_S_raw.jsonl.gz"
+        logging.info(f"About to get train-S cuts from {f}")
+        return CutSet.from_jsonl_lazy(f)
+
+    def train_XS_cuts(self) -> CutSet:
+        f = self.manifest_dir / "cuts_XS_raw.jsonl.gz"
+        logging.info(f"About to get train-XS cuts from {f}")
+        return CutSet.from_jsonl_lazy(f)
+
     def test_cuts(self) -> CutSet:
-        f = self.manifest_dir / "cuts_TEST.json.gz"
+        f = self.manifest_dir / "cuts_TEST.jsonl.gz"
         logging.info(f"About to get TEST cuts from {f}")
         return load_manifest(f)

     def dev_cuts(self) -> CutSet:
-        f = self.manifest_dir / "cuts_DEV.json.gz"
+        f = self.manifest_dir / "cuts_DEV.jsonl.gz"
         logging.info(f"About to get DEV cuts from {f}")
         return load_manifest(f)
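
The large training subsets above are opened with CutSet.from_jsonl_lazy, which streams the gzipped JSONL manifest instead of materializing it in memory; only the small DEV and TEST sets go through load_manifest. A minimal sketch of consuming such a lazy set, with a hypothetical manifest_dir (the filename follows the class docstring):

from pathlib import Path

from lhotse import CutSet

manifest_dir = Path("data/fbank")  # hypothetical location of the manifests
cuts = CutSet.from_jsonl_lazy(manifest_dir / "cuts_XL_raw.jsonl.gz")
for i, cut in enumerate(cuts):
    # Cuts are deserialized one at a time while iterating.
    print(cut.id, cut.duration)
    if i == 2:  # peek at the first few cuts only
        break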

@@ -16,7 +16,7 @@
 # limitations under the License.

 import logging
-from typing import Path
+from pathlib import Path

 from lhotse import CutSet, load_manifest

egs/librispeech/ASR/transducer_stateless_multi_datasets/test_asr_datamodule.py (new executable file, 103 lines)

@@ -0,0 +1,103 @@
#!/usr/bin/env python3
# Copyright 2022 Xiaomi Corp. (authors: Fangjun Kuang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
To run this file, do:

    cd icefall/egs/librispeech/ASR
    python ./transducer_stateless_multi_datasets/test_asr_datamodule.py
"""

import argparse
import random
from pathlib import Path

from asr_datamodule import AsrDataModule
from gigaspeech import GigaSpeech
from lhotse import load_manifest
from librispeech import LibriSpeech


def test_dataset():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    AsrDataModule.add_arguments(parser)
    args = parser.parse_args()
    print(args)

    if args.enable_musan:
        cuts_musan = load_manifest(
            Path(args.manifest_dir) / "cuts_musan.json.gz"
        )
    else:
        cuts_musan = None

    librispeech = LibriSpeech(manifest_dir=args.manifest_dir)
    gigaspeech = GigaSpeech(manifest_dir=args.manifest_dir)

    train_clean_100 = librispeech.train_clean_100_cuts()
    train_S = gigaspeech.train_S_cuts()

    asr_datamodule = AsrDataModule(args)

    libri_train_dl = asr_datamodule.train_dataloaders(
        train_clean_100,
        dynamic_bucketing=False,
        on_the_fly_feats=False,
        cuts_musan=cuts_musan,
    )

    giga_train_dl = asr_datamodule.train_dataloaders(
        train_S,
        dynamic_bucketing=True,
        on_the_fly_feats=True,
        cuts_musan=cuts_musan,
    )

    seed = 20220216
    rng = random.Random(seed)

    for epoch in range(2):
        print("epoch", epoch)
        batch_idx = 0
        libri_train_dl.sampler.set_epoch(epoch)
        giga_train_dl.sampler.set_epoch(epoch)

        iter_libri = iter(libri_train_dl)
        iter_giga = iter(giga_train_dl)
        while True:
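            # Interleave corpora: with probability 0.8 take the next
            # LibriSpeech batch, with probability 0.2 the next GigaSpeech batch.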
            idx = rng.choices((0, 1), weights=[0.8, 0.2], k=1)[0]
            dl = iter_libri if idx == 0 else iter_giga
            batch_idx += 1

            print("dl idx", idx, "batch_idx", batch_idx)
            batch = next(dl)
            cuts = batch["supervisions"]["cut"]
            for c in cuts:
                print(c.id)

            if batch_idx > 10:
                break


def main():
    test_dataset()


if __name__ == "__main__":
    main()