k2-fsa/icefall (https://github.com/k2-fsa/icefall.git)

Compute features of librispeech and musan.

commit 0b19aa09c1 (parent 40eed74460)
.pre-commit-config.yaml
@@ -3,16 +3,19 @@ repos:
     rev: 21.6b0
     hooks:
       - id: black
+        args: [--line-length=80]

   - repo: https://github.com/PyCQA/flake8
     rev: 3.9.2
     hooks:
       - id: flake8
+        args: [--max-line-length=80]

   - repo: https://github.com/pycqa/isort
     rev: 5.9.2
     hooks:
       - id: isort
+        args: [--profile=black]

   - repo: https://github.com/pre-commit/pre-commit-hooks
     rev: v4.0.1
egs/librispeech/ASR/local/compute_fbank_librispeech.py (new executable file, 98 lines)
@@ -0,0 +1,98 @@
#!/usr/bin/env python3

"""
This file computes fbank features of the librispeech dataset.
It looks for manifests in the directory data/manifests
and the generated fbank features are saved in data/fbank.
"""

import os
import subprocess
from contextlib import contextmanager
from pathlib import Path

from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer, combine
from lhotse.recipes.utils import read_manifests_if_cached


@contextmanager
def get_executor():
    # We'll either return a process pool or a distributed worker pool.
    # Note that this has to be a context manager because we might use multiple
    # context managers ("with" clauses) inside, and this way everything will
    # free up the resources at the right time.
    try:
        # If this is executed on the CLSP grid, we will try to use the
        # Grid Engine to distribute the tasks.
        # Other clusters can also benefit from that, provided a
        # cluster-specific wrapper.
        # (see https://github.com/pzelasko/plz for reference)
        #
        # The following must be installed:
        # $ pip install dask distributed
        # $ pip install git+https://github.com/pzelasko/plz
        name = subprocess.check_output("hostname -f", shell=True, text=True)
        if name.strip().endswith(".clsp.jhu.edu"):
            import plz
            from distributed import Client

            with plz.setup_cluster() as cluster:
                cluster.scale(80)
                yield Client(cluster)
            return
    except Exception:
        pass
    # No need to return anything - compute_and_store_features
    # will just instantiate the pool itself.
    yield None


def compute_fbank_librispeech():
    src_dir = Path("data/manifests")
    output_dir = Path("data/fbank")
    num_jobs = min(15, os.cpu_count())
    num_mel_bins = 80

    dataset_parts = (
        "dev-clean",
        "dev-other",
        "test-clean",
        "test-other",
        "train-clean-100",
        "train-clean-360",
        "train-other-500",
    )
    manifests = read_manifests_if_cached(
        dataset_parts=dataset_parts, output_dir=src_dir
    )
    assert manifests is not None

    extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins))

    with get_executor() as ex:  # Initialize the executor only once.
        for partition, m in manifests.items():
            if (output_dir / f"cuts_{partition}.json.gz").is_file():
                print(f"{partition} already exists - skipping.")
                continue
            print("Processing", partition)
            cut_set = CutSet.from_manifests(
                recordings=m["recordings"], supervisions=m["supervisions"],
            )
            if "train" in partition:
                cut_set = (
                    cut_set
                    + cut_set.perturb_speed(0.9)
                    + cut_set.perturb_speed(1.1)
                )
            cut_set = cut_set.compute_and_store_features(
                extractor=extractor,
                storage_path=f"{output_dir}/feats_{partition}",
                # when an executor is specified, make more partitions
                num_jobs=num_jobs if ex is None else 80,
                executor=ex,
                storage_type=LilcomHdf5Writer,
            )
            cut_set.to_json(output_dir / f"cuts_{partition}.json.gz")


if __name__ == "__main__":
    compute_fbank_librispeech()
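The cut manifests and feature archives written by this script can be inspected with a few lines of lhotse code. A minimal sketch, not part of this commit, assuming the dev-clean partition has already been processed:

    from lhotse import CutSet

    # Path written by compute_fbank_librispeech.py above.
    cuts = CutSet.from_json("data/fbank/cuts_dev-clean.json.gz")
    cut = next(iter(cuts))
    feats = cut.load_features()  # numpy array, shape (num_frames, num_mel_bins)
    print(cut.id, cut.duration, feats.shape)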
egs/librispeech/ASR/local/compute_fbank_musan.py (new executable file, 97 lines)
@@ -0,0 +1,97 @@
#!/usr/bin/env python3

"""
This file computes fbank features of the musan dataset.
It looks for manifests in the directory data/manifests
and the generated fbank features are saved in data/fbank.
"""

import os
import subprocess
from contextlib import contextmanager
from pathlib import Path

from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer, combine
from lhotse.recipes.utils import read_manifests_if_cached


@contextmanager
def get_executor():
    # We'll either return a process pool or a distributed worker pool.
    # Note that this has to be a context manager because we might use multiple
    # context managers ("with" clauses) inside, and this way everything will
    # free up the resources at the right time.
    try:
        # If this is executed on the CLSP grid, we will try to use the
        # Grid Engine to distribute the tasks.
        # Other clusters can also benefit from that, provided a
        # cluster-specific wrapper.
        # (see https://github.com/pzelasko/plz for reference)
        #
        # The following must be installed:
        # $ pip install dask distributed
        # $ pip install git+https://github.com/pzelasko/plz
        name = subprocess.check_output("hostname -f", shell=True, text=True)
        if name.strip().endswith(".clsp.jhu.edu"):
            import plz
            from distributed import Client

            with plz.setup_cluster() as cluster:
                cluster.scale(80)
                yield Client(cluster)
            return
    except Exception:
        pass
    # No need to return anything - compute_and_store_features
    # will just instantiate the pool itself.
    yield None


def compute_fbank_musan():
    src_dir = Path("data/manifests")
    output_dir = Path("data/fbank")
    num_jobs = min(15, os.cpu_count())
    num_mel_bins = 80

    dataset_parts = (
        "music",
        "speech",
        "noise",
    )
    manifests = read_manifests_if_cached(
        dataset_parts=dataset_parts, output_dir=src_dir
    )
    assert manifests is not None

    musan_cuts_path = output_dir / "cuts_musan.json.gz"

    if musan_cuts_path.is_file():
        print(f"{musan_cuts_path} already exists - skipping")
        return

    print("Extracting features for Musan")

    extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins))

    with get_executor() as ex:  # Initialize the executor only once.
        # create chunks of Musan with duration 5 - 10 seconds
        musan_cuts = (
            CutSet.from_manifests(
                recordings=combine(
                    part["recordings"] for part in manifests.values()
                )
            )
            .cut_into_windows(10.0)
            .filter(lambda c: c.duration > 5)
            .compute_and_store_features(
                extractor=extractor,
                storage_path=f"{output_dir}/feats_musan",
                num_jobs=num_jobs if ex is None else 80,
                executor=ex,
                storage_type=LilcomHdf5Writer,
            )
        )
        musan_cuts.to_json(musan_cuts_path)


if __name__ == "__main__":
    compute_fbank_musan()
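As a quick sanity check, a minimal sketch, not part of this commit, that loads the MUSAN cuts written above and verifies the 5-10 second windowing; these cuts are typically mixed into the training data later as additive noise:

    from lhotse import CutSet

    musan_cuts = CutSet.from_json("data/fbank/cuts_musan.json.gz")
    durations = [c.duration for c in musan_cuts]
    print(len(durations), "cuts,", min(durations), "-", max(durations), "seconds")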
egs/librispeech/ASR/local/download_data.py (new executable file, 21 lines)
@@ -0,0 +1,21 @@
#!/usr/bin/env python3

"""
This file downloads the librispeech dataset
to the directory data/LibriSpeech.

It's compatible with kaldi's egs/librispeech/s5/local/download_and_untar.sh .
"""


from lhotse.recipes import download_librispeech


def download_data():
    target_dir = "data"

    download_librispeech(target_dir=target_dir, dataset_parts="librispeech")


if __name__ == "__main__":
    download_data()
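For quick experiments, lhotse can also fetch a smaller subset instead of the full corpus. A hedged variant, not part of this commit; whether the "mini_librispeech" value is accepted depends on the installed lhotse version:

    from lhotse.recipes import download_librispeech

    # Downloads only the small mini_librispeech subset (dev-clean-2, train-clean-5).
    download_librispeech(target_dir="data", dataset_parts="mini_librispeech")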
egs/librispeech/ASR/local/download_lm.py
@@ -1,6 +1,9 @@
 #!/usr/bin/env python3

 # Copyright (c) 2021 Xiaomi Corporation (authors: Fangjun Kuang)
+"""
+This file downloads librispeech LM files to data/lm
+"""

 import gzip
 import os
@@ -26,9 +29,7 @@ def download_lm():
         filename = target_dir / f
         if filename.is_file() is False:
             urlretrieve_progress(
-                f"{url}/{f}",
-                filename=filename,
-                desc=f"Downloading {filename}",
+                f"{url}/{f}", filename=filename, desc=f"Downloading {filename}",
             )

         if ".gz" in str(filename):
egs/librispeech/ASR/local/prepare_librispeech_manifest.py (new executable file, 29 lines)
@@ -0,0 +1,29 @@
#!/usr/bin/env python3

"""
This file generates manifests for the librispeech dataset.
It expects the dataset to be saved in data/LibriSpeech
and saves the generated manifests in data/manifests.
"""

import os
from pathlib import Path

from lhotse.recipes import prepare_librispeech


def prepare_librispeech_mainfest():
    corpus_dir = Path("data/LibriSpeech")
    output_dir = Path("data/manifests")
    num_jobs = min(15, os.cpu_count())

    librispeech_manifests = prepare_librispeech(
        corpus_dir=corpus_dir,
        dataset_parts="auto",
        output_dir=output_dir,
        num_jobs=num_jobs,
    )


if __name__ == "__main__":
    prepare_librispeech_mainfest()
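The generated manifests are ordinary lhotse RecordingSet/SupervisionSet files and can be loaded directly. A minimal sketch, not part of this commit; the exact manifest file names depend on the lhotse version, so "recordings_dev-clean.json" is an assumption here:

    from lhotse import load_manifest

    recordings = load_manifest("data/manifests/recordings_dev-clean.json")
    supervisions = load_manifest("data/manifests/supervisions_dev-clean.json")
    print(len(recordings), "recordings,", len(supervisions), "supervisions")
    print(next(iter(supervisions)).text)  # each supervision carries a transcript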
egs/librispeech/ASR/local/prepare_musan_manifest.py (new executable file, 22 lines)
@@ -0,0 +1,22 @@
#!/usr/bin/env python3

"""
This file generates manifests for the musan dataset.
It expects the dataset to be saved in data/musan
and saves the generated manifests in data/manifests.
"""

from pathlib import Path

from lhotse.recipes import prepare_musan


def prepare_musan_mainfest():
    corpus_dir = Path("data/musan")
    output_dir = Path("data/manifests")

    prepare_musan(corpus_dir=corpus_dir, output_dir=output_dir)


if __name__ == "__main__":
    prepare_musan_mainfest()
egs/librispeech/ASR/prepare.sh
@@ -1,6 +1,5 @@
 #!/usr/bin/env bash
-

 set -eou pipefail

 stage=-1
@@ -19,8 +18,53 @@ fi
 if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
   echo "stage 0: Download data"

-  # If you have pre-downloaded it in /path/to/LibriSpeech
-  # Just run: ln -sfv /path/to/LibriSpeech data/
+  # If you have pre-downloaded it to /path/to/LibriSpeech,
+  # you can create a symlink to avoid downloading it again:
+  #
+  # ln -sfv /path/to/LibriSpeech data/
+  #
+
   mkdir -p data/LibriSpeech
-  # TODO
+
+  if [ ! -f data/LibriSpeech/train-other-500/.completed ]; then
+    # It's compatible with kaldi's egs/librispeech/s5/local/download_and_untar.sh
+    ./local/download_data.py
+  fi
+
+  # If you have pre-downloaded it to /path/to/musan,
+  # you can create a symlink to avoid downloading it again:
+  #
+  # ln -s /path/to/musan data/
+  #
+  if [ ! -e data/musan ]; then
+    wget https://www.openslr.org/resources/17/musan.tar.gz
+  fi
+fi
+
+if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
+  echo "Stage 1: Prepare librispeech manifest"
+  # We assume that you have downloaded the librispeech corpus
+  # to data/LibriSpeech
+  mkdir -p data/manifests
+  ./local/prepare_librispeech_manifest.py
+fi
+
+if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
+  echo "Stage 2: Prepare musan manifest"
+  # We assume that you have downloaded the musan corpus
+  # to data/musan
+  mkdir -p data/manifests
+  ./local/prepare_musan_manifest.py
+fi
+
+if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
+  echo "Stage 3: Compute fbank for librispeech"
+  mkdir -p data/fbank
+  ./local/compute_fbank_librispeech.py
+fi
+
+if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
+  echo "Stage 4: Compute fbank for musan"
+  mkdir -p data/fbank
+  ./local/compute_fbank_musan.py
 fi
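Taken together, the stages above form a linear pipeline. A minimal sketch, not part of this commit, of the same sequence driven from Python, assuming it is run from egs/librispeech/ASR with the corpora already present under data/:

    import os
    import subprocess

    # Output directories created by the corresponding prepare.sh stages.
    for d in ["data/manifests", "data/fbank"]:
        os.makedirs(d, exist_ok=True)

    # Manifest preparation first, then feature extraction.
    for script in [
        "./local/prepare_librispeech_manifest.py",
        "./local/prepare_musan_manifest.py",
        "./local/compute_fbank_librispeech.py",
        "./local/compute_fbank_musan.py",
    ]:
        subprocess.run([script], check=True)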