mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-08 17:42:21 +00:00
Minor fixes after review.
This commit is contained in:
parent
f25eedf2d4
commit
e005ea062c
6
.flake8
6
.flake8
@ -1,2 +1,8 @@
|
||||
[flake8]
|
||||
show-source=true
|
||||
statistics=true
|
||||
max-line-length = 80
|
||||
|
||||
exclude =
|
||||
.git,
|
||||
**/data/**
|
||||
|
2
.gitignore
vendored
2
.gitignore
vendored
@ -1 +1,3 @@
|
||||
data
|
||||
__pycache__
|
||||
path.sh
|
||||
|
@ -7,43 +7,12 @@ and generated fbank features are saved in data/fbank.
|
||||
"""
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
from contextlib import contextmanager
|
||||
from pathlib import Path
|
||||
|
||||
from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer, combine
|
||||
from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer
|
||||
from lhotse.recipes.utils import read_manifests_if_cached
|
||||
|
||||
|
||||
@contextmanager
|
||||
def get_executor():
|
||||
# We'll either return a process pool or a distributed worker pool.
|
||||
# Note that this has to be a context manager because we might use multiple
|
||||
# context manager ("with" clauses) inside, and this way everything will
|
||||
# free up the resources at the right time.
|
||||
try:
|
||||
# If this is executed on the CLSP grid, we will try to use the
|
||||
# Grid Engine to distribute the tasks.
|
||||
# Other clusters can also benefit from that, provided a cluster-specific wrapper.
|
||||
# (see https://github.com/pzelasko/plz for reference)
|
||||
#
|
||||
# The following must be installed:
|
||||
# $ pip install dask distributed
|
||||
# $ pip install git+https://github.com/pzelasko/plz
|
||||
name = subprocess.check_output("hostname -f", shell=True, text=True)
|
||||
if name.strip().endswith(".clsp.jhu.edu"):
|
||||
import plz
|
||||
from distributed import Client
|
||||
|
||||
with plz.setup_cluster() as cluster:
|
||||
cluster.scale(80)
|
||||
yield Client(cluster)
|
||||
return
|
||||
except:
|
||||
pass
|
||||
# No need to return anything - compute_and_store_features
|
||||
# will just instantiate the pool itself.
|
||||
yield None
|
||||
from icefall.utils import get_executor
|
||||
|
||||
|
||||
def compute_fbank_librispeech():
|
||||
@ -75,7 +44,8 @@ def compute_fbank_librispeech():
|
||||
continue
|
||||
print("Processing", partition)
|
||||
cut_set = CutSet.from_manifests(
|
||||
recordings=m["recordings"], supervisions=m["supervisions"],
|
||||
recordings=m["recordings"],
|
||||
supervisions=m["supervisions"],
|
||||
)
|
||||
if "train" in partition:
|
||||
cut_set = (
|
||||
|
@ -7,43 +7,12 @@ and generated fbank features are saved in data/fbank.
|
||||
"""
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
from contextlib import contextmanager
|
||||
from pathlib import Path
|
||||
|
||||
from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer, combine
|
||||
from lhotse.recipes.utils import read_manifests_if_cached
|
||||
|
||||
|
||||
@contextmanager
|
||||
def get_executor():
|
||||
# We'll either return a process pool or a distributed worker pool.
|
||||
# Note that this has to be a context manager because we might use multiple
|
||||
# context manager ("with" clauses) inside, and this way everything will
|
||||
# free up the resources at the right time.
|
||||
try:
|
||||
# If this is executed on the CLSP grid, we will try to use the
|
||||
# Grid Engine to distribute the tasks.
|
||||
# Other clusters can also benefit from that, provided a cluster-specific wrapper.
|
||||
# (see https://github.com/pzelasko/plz for reference)
|
||||
#
|
||||
# The following must be installed:
|
||||
# $ pip install dask distributed
|
||||
# $ pip install git+https://github.com/pzelasko/plz
|
||||
name = subprocess.check_output("hostname -f", shell=True, text=True)
|
||||
if name.strip().endswith(".clsp.jhu.edu"):
|
||||
import plz
|
||||
from distributed import Client
|
||||
|
||||
with plz.setup_cluster() as cluster:
|
||||
cluster.scale(80)
|
||||
yield Client(cluster)
|
||||
return
|
||||
except:
|
||||
pass
|
||||
# No need to return anything - compute_and_store_features
|
||||
# will just instantiate the pool itself.
|
||||
yield None
|
||||
from icefall.utils import get_executor
|
||||
|
||||
|
||||
def compute_fbank_musan():
|
||||
|
@ -31,6 +31,8 @@ def download_lm():
|
||||
urlretrieve_progress(
|
||||
f"{url}/{f}", filename=filename, desc=f"Downloading {filename}",
|
||||
)
|
||||
else:
|
||||
print(f'{filename} already exists - skipping')
|
||||
|
||||
if ".gz" in str(filename):
|
||||
unzip_file = Path(os.path.splitext(filename)[0])
|
||||
@ -38,6 +40,8 @@ def download_lm():
|
||||
with gzip.open(filename, "rb") as f_in:
|
||||
with open(unzip_file, "wb") as f_out:
|
||||
shutil.copyfileobj(f_in, f_out)
|
||||
else:
|
||||
print(f'{unzip_file} already exist - skipping')
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
@ -20,24 +20,30 @@ if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
|
||||
echo "stage 0: Download data"
|
||||
|
||||
# If you have pre-downloaded it to /path/to/LibriSpeech,
|
||||
# you can create a symlink to avoid downloading it again:
|
||||
# you can create a symlink
|
||||
#
|
||||
# ln -sfv /path/to/LibriSpeech data/
|
||||
#
|
||||
# Note that if
|
||||
#
|
||||
# data/LibriSpeech/test-clean/.completed exists,
|
||||
#
|
||||
# it will not re-download it.
|
||||
#
|
||||
# The same goes for dev-clean, dev-other, test-other, train-clean-100,
|
||||
# train-clean-360, and train-other-500
|
||||
|
||||
mkdir -p data/LibriSpeech
|
||||
|
||||
if [ ! -f data/LibriSpeech/train-other-500/.completed ]; then
|
||||
# It's compatible with kaldi's egs/librispeech/s5/local/download_and_untar.sh
|
||||
lhotse download librispeech --full data
|
||||
fi
|
||||
lhotse download librispeech --full data
|
||||
|
||||
# If you have pre-downloaded it to /path/to/musan,
|
||||
# you can create a symlink to avoid downloading it again:
|
||||
# you can create a symlink
|
||||
#
|
||||
# ln -s /path/to/musan data/
|
||||
# ln -sfv /path/to/musan data/
|
||||
#
|
||||
if [ ! -f data/musan/.musan_completed ]; then
|
||||
# and create a file data/.musan_completed
|
||||
# to avoid downloading it again
|
||||
if [ ! -f data/.musan_completed ]; then
|
||||
lhotse download musan data
|
||||
fi
|
||||
fi
|
||||
@ -65,7 +71,7 @@ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
|
||||
fi
|
||||
|
||||
if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
|
||||
echo "Stage 4: Compute fbank for librispeech"
|
||||
echo "Stage 4: Compute fbank for musan"
|
||||
mkdir -p data/fbank
|
||||
./local/compute_fbank_musan.py
|
||||
fi
|
||||
|
0
icefall/__init__.py
Normal file
0
icefall/__init__.py
Normal file
34
icefall/utils.py
Normal file
34
icefall/utils.py
Normal file
@ -0,0 +1,34 @@
|
||||
import subprocess
|
||||
from contextlib import contextmanager
|
||||
|
||||
|
||||
@contextmanager
def get_executor():
    """Yield a distributed client on the CLSP grid, otherwise ``None``.

    If the fully-qualified hostname ends with ``.clsp.jhu.edu`` and the
    optional ``plz`` and ``distributed`` packages are importable, a cluster
    is created with ``plz.setup_cluster()``, scaled to 80 workers, and a
    ``distributed.Client`` attached to it is yielded. The cluster context is
    exited when the caller's ``with`` block finishes.

    In every other case (another host, missing packages, or any failure
    while setting up the cluster) ``None`` is yielded, and the caller is
    expected to create its own pool.

    Exceptions raised inside the caller's ``with`` block are propagated
    unchanged; only failures that happen *before* the client is handed over
    trigger the ``None`` fallback.
    """
    # We'll either return a process pool or a distributed worker pool.
    # Note that this has to be a context manager because we might use multiple
    # context managers ("with" clauses) inside, and this way everything will
    # free up the resources at the right time.

    # Becomes True once the client has been handed to the caller. From that
    # point on an exception reaching the handler below comes from the caller's
    # "with" body (or from cluster teardown) and must not be swallowed:
    # falling through to the second ``yield`` would make contextlib raise
    # "generator didn't stop after throw()" and hide the real error.
    client_yielded = False
    try:
        # If this is executed on the CLSP grid, we will try to use the
        # Grid Engine to distribute the tasks.
        # Other clusters can also benefit from that, provided a
        # cluster-specific wrapper.
        # (see https://github.com/pzelasko/plz for reference)
        #
        # The following must be installed:
        # $ pip install dask distributed
        # $ pip install git+https://github.com/pzelasko/plz
        name = subprocess.check_output("hostname -f", shell=True, text=True)
        if name.strip().endswith(".clsp.jhu.edu"):
            import plz
            from distributed import Client

            with plz.setup_cluster() as cluster:
                cluster.scale(80)
                client = Client(cluster)
                client_yielded = True
                yield client
            return
    except Exception:
        if client_yielded:
            raise
        # Best-effort: cluster detection/setup failed, so fall back to None.
    # No need to return anything - compute_and_store_features
    # will just instantiate the pool itself.
    yield None
|
Loading…
x
Reference in New Issue
Block a user