Minor fixes after review.

Fangjun Kuang 2021-07-20 10:02:20 +08:00
parent f25eedf2d4
commit e005ea062c
8 changed files with 67 additions and 76 deletions

.flake8

@@ -1,2 +1,8 @@
[flake8]
show-source=true
statistics=true
max-line-length = 80
exclude =
.git,
**/data/**

.gitignore

@@ -1 +1,3 @@
data
__pycache__
path.sh

local/compute_fbank_librispeech.py

@@ -7,43 +7,12 @@ and generated fbank features are saved in data/fbank.
"""
import os
import subprocess
from contextlib import contextmanager
from pathlib import Path
from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer, combine
from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer
from lhotse.recipes.utils import read_manifests_if_cached
@contextmanager
def get_executor():
# We'll either return a process pool or a distributed worker pool.
# Note that this has to be a context manager because we might use multiple
# context manager ("with" clauses) inside, and this way everything will
# free up the resources at the right time.
try:
# If this is executed on the CLSP grid, we will try to use the
# Grid Engine to distribute the tasks.
# Other clusters can also benefit from that, provided a cluster-specific wrapper.
# (see https://github.com/pzelasko/plz for reference)
#
# The following must be installed:
# $ pip install dask distributed
# $ pip install git+https://github.com/pzelasko/plz
name = subprocess.check_output("hostname -f", shell=True, text=True)
if name.strip().endswith(".clsp.jhu.edu"):
import plz
from distributed import Client
with plz.setup_cluster() as cluster:
cluster.scale(80)
yield Client(cluster)
return
except:
pass
# No need to return anything - compute_and_store_features
# will just instantiate the pool itself.
yield None
from icefall.utils import get_executor
def compute_fbank_librispeech():
@@ -75,7 +44,8 @@ def compute_fbank_librispeech():
continue
print("Processing", partition)
cut_set = CutSet.from_manifests(
recordings=m["recordings"], supervisions=m["supervisions"],
recordings=m["recordings"],
supervisions=m["supervisions"],
)
if "train" in partition:
cut_set = (
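The hunk is truncated at the assignment above. As a hedged sketch only, the training branch in lhotse-based recipes typically augments the cuts with speed perturbation before feature extraction; the 0.9/1.1 factors below are an assumption, not shown in this diff.

# Hypothetical continuation of the truncated "cut_set = (" assignment;
# the speed-perturbation factors are assumed, following common lhotse recipes.
if "train" in partition:
    cut_set = (
        cut_set
        + cut_set.perturb_speed(0.9)
        + cut_set.perturb_speed(1.1)
    )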

local/compute_fbank_musan.py

@@ -7,43 +7,12 @@ and generated fbank features are saved in data/fbank.
"""
import os
import subprocess
from contextlib import contextmanager
from pathlib import Path
from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer, combine
from lhotse.recipes.utils import read_manifests_if_cached
@contextmanager
def get_executor():
# We'll either return a process pool or a distributed worker pool.
# Note that this has to be a context manager because we might use multiple
# context manager ("with" clauses) inside, and this way everything will
# free up the resources at the right time.
try:
# If this is executed on the CLSP grid, we will try to use the
# Grid Engine to distribute the tasks.
# Other clusters can also benefit from that, provided a cluster-specific wrapper.
# (see https://github.com/pzelasko/plz for reference)
#
# The following must be installed:
# $ pip install dask distributed
# $ pip install git+https://github.com/pzelasko/plz
name = subprocess.check_output("hostname -f", shell=True, text=True)
if name.strip().endswith(".clsp.jhu.edu"):
import plz
from distributed import Client
with plz.setup_cluster() as cluster:
cluster.scale(80)
yield Client(cluster)
return
except:
pass
# No need to return anything - compute_and_store_features
# will just instantiate the pool itself.
yield None
from icefall.utils import get_executor
def compute_fbank_musan():

local/download_lm.py

@@ -31,6 +31,8 @@ def download_lm():
urlretrieve_progress(
f"{url}/{f}", filename=filename, desc=f"Downloading {filename}",
)
else:
print(f'{filename} already exists - skipping')
if ".gz" in str(filename):
unzip_file = Path(os.path.splitext(filename)[0])
@@ -38,6 +40,8 @@ def download_lm():
with gzip.open(filename, "rb") as f_in:
with open(unzip_file, "wb") as f_out:
shutil.copyfileobj(f_in, f_out)
else:
print(f'{unzip_file} already exists - skipping')
if __name__ == "__main__":
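Taken together, the two hunks above add "skip" messages to a download-and-unzip loop. A rough, self-contained sketch of that pattern follows; the URL, file list, and output directory are placeholders, and urlretrieve_progress is assumed to come from lhotse.utils, as in typical lhotse recipes.

# Hedged sketch of the skip-if-present download/unzip pattern; the URL and
# file names are placeholders, not the actual values in download_lm.py.
import gzip
import os
import shutil
from pathlib import Path

from lhotse.utils import urlretrieve_progress


def download_lm(url="http://www.openslr.org/resources/11", out_dir="data/lm"):
    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)
    for f in ["3-gram.pruned.1e-7.arpa.gz", "librispeech-vocab.txt"]:
        filename = out / f
        if not filename.is_file():
            urlretrieve_progress(
                f"{url}/{f}",
                filename=filename,
                desc=f"Downloading {filename}",
            )
        else:
            print(f"{filename} already exists - skipping")

        if ".gz" in str(filename):
            unzip_file = Path(os.path.splitext(filename)[0])
            if not unzip_file.is_file():
                # Decompress the .gz file next to the original download.
                with gzip.open(filename, "rb") as f_in:
                    with open(unzip_file, "wb") as f_out:
                        shutil.copyfileobj(f_in, f_out)
            else:
                print(f"{unzip_file} already exists - skipping")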

prepare.sh

@@ -20,24 +20,30 @@ if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
echo "stage 0: Download data"
# If you have pre-downloaded it to /path/to/LibriSpeech,
# you can create a symlink to avoid downloading it again:
# you can create a symlink
#
# ln -sfv /path/to/LibriSpeech data/
#
# Note that if
#
# data/LibriSpeech/test-clean/.completed exists,
#
# it will not re-download it.
#
# The same goes for dev-clean, dev-other, test-other, train-clean-100,
# train-clean-360, and train-other-500.
mkdir -p data/LibriSpeech
if [ ! -f data/LibriSpeech/train-other-500/.completed ]; then
# It's compatible with kaldi's egs/librispeech/s5/local/download_and_untar.sh
lhotse download librispeech --full data
fi
lhotse download librispeech --full data
# If you have pre-downloaded it to /path/to/musan,
# you can create a symlink to avoid downloading it again:
# you can create a symlink
#
# ln -s /path/to/musan data/
# ln -sfv /path/to/musan data/
#
if [ ! -f data/musan/.musan_completed ]; then
# and create a file data/.musan_completed
# to avoid downloading it again
if [ ! -f data/.musan_completed ]; then
lhotse download musan data
fi
fi
@@ -65,7 +71,7 @@ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
fi
if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
echo "Stage 4: Compute fbank for librispeech"
echo "Stage 4: Compute fbank for musan"
mkdir -p data/fbank
./local/compute_fbank_musan.py
fi

icefall/__init__.py (new empty file)

icefall/utils.py (new file)

@@ -0,0 +1,34 @@
import subprocess
from contextlib import contextmanager
@contextmanager
def get_executor():
# We'll either return a process pool or a distributed worker pool.
# Note that this has to be a context manager because we might use multiple
# context manager ("with" clauses) inside, and this way everything will
# free up the resources at the right time.
try:
# If this is executed on the CLSP grid, we will try to use the
# Grid Engine to distribute the tasks.
# Other clusters can also benefit from that, provided a
# cluster-specific wrapper.
# (see https://github.com/pzelasko/plz for reference)
#
# The following must be installed:
# $ pip install dask distributed
# $ pip install git+https://github.com/pzelasko/plz
name = subprocess.check_output("hostname -f", shell=True, text=True)
if name.strip().endswith(".clsp.jhu.edu"):
import plz
from distributed import Client
with plz.setup_cluster() as cluster:
cluster.scale(80)
yield Client(cluster)
return
except Exception:
pass
# No need to return anything - compute_and_store_features
# will just instantiate the pool itself.
yield None
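For reference, a minimal usage sketch of this helper in the spirit of the compute_fbank_*.py scripts above; the extractor configuration, job counts, and storage path are illustrative assumptions rather than the exact values used in those scripts.

# Illustrative only: how a feature-extraction script can use get_executor();
# num_mel_bins, num_jobs, and the storage path are assumptions.
from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer

from icefall.utils import get_executor


def extract_fbank(cut_set: CutSet, storage_path: str) -> CutSet:
    extractor = Fbank(FbankConfig(num_mel_bins=80))
    with get_executor() as ex:  # ex is None, or a distributed Client on the CLSP grid
        return cut_set.compute_and_store_features(
            extractor=extractor,
            storage_path=storage_path,
            num_jobs=15 if ex is None else 80,
            executor=ex,
            storage_type=LilcomHdf5Writer,
        )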