Minor fixes after review.

Fangjun Kuang 2021-07-20 10:02:20 +08:00
parent f25eedf2d4
commit e005ea062c
8 changed files with 67 additions and 76 deletions

.flake8

@@ -1,2 +1,8 @@
 [flake8]
+show-source=true
+statistics=true
 max-line-length = 80
+exclude =
+  .git,
+  **/data/**

.gitignore

@@ -1 +1,3 @@
 data
+__pycache__
+path.sh

local/compute_fbank_librispeech.py

@@ -7,43 +7,12 @@ and generated fbank features are saved in data/fbank.
 """
 import os
-import subprocess
-from contextlib import contextmanager
 from pathlib import Path
-from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer, combine
+from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer
 from lhotse.recipes.utils import read_manifests_if_cached
+from icefall.utils import get_executor
-@contextmanager
-def get_executor():
-    # We'll either return a process pool or a distributed worker pool.
-    # Note that this has to be a context manager because we might use multiple
-    # context manager ("with" clauses) inside, and this way everything will
-    # free up the resources at the right time.
-    try:
-        # If this is executed on the CLSP grid, we will try to use the
-        # Grid Engine to distribute the tasks.
-        # Other clusters can also benefit from that, provided a cluster-specific wrapper.
-        # (see https://github.com/pzelasko/plz for reference)
-        #
-        # The following must be installed:
-        # $ pip install dask distributed
-        # $ pip install git+https://github.com/pzelasko/plz
-        name = subprocess.check_output("hostname -f", shell=True, text=True)
-        if name.strip().endswith(".clsp.jhu.edu"):
-            import plz
-            from distributed import Client
-            with plz.setup_cluster() as cluster:
-                cluster.scale(80)
-                yield Client(cluster)
-            return
-    except:
-        pass
-    # No need to return anything - compute_and_store_features
-    # will just instantiate the pool itself.
-    yield None
 def compute_fbank_librispeech():
@@ -75,7 +44,8 @@ def compute_fbank_librispeech():
                 continue
             print("Processing", partition)
             cut_set = CutSet.from_manifests(
-                recordings=m["recordings"], supervisions=m["supervisions"],
+                recordings=m["recordings"],
+                supervisions=m["supervisions"],
             )
             if "train" in partition:
                 cut_set = (

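The change above keeps the script's flow intact: read the cached manifests, build a CutSet per partition, then extract fbank features via the shared get_executor helper. A minimal sketch of the manifest-to-CutSet step, assuming the manifests have already been prepared (the partition name and manifest directory below are illustrative):

from lhotse import CutSet
from lhotse.recipes.utils import read_manifests_if_cached

# Illustrative values; the real script loops over every LibriSpeech partition.
manifests = read_manifests_if_cached(
    dataset_parts=["train-clean-100"], output_dir="data/manifests"
)
m = manifests["train-clean-100"]
cut_set = CutSet.from_manifests(
    recordings=m["recordings"],
    supervisions=m["supervisions"],
)
print(f"Created {len(cut_set)} cuts for train-clean-100")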
local/compute_fbank_musan.py

@@ -7,43 +7,12 @@ and generated fbank features are saved in data/fbank.
 """
 import os
-import subprocess
-from contextlib import contextmanager
 from pathlib import Path
 from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer, combine
 from lhotse.recipes.utils import read_manifests_if_cached
+from icefall.utils import get_executor
-@contextmanager
-def get_executor():
-    # We'll either return a process pool or a distributed worker pool.
-    # Note that this has to be a context manager because we might use multiple
-    # context manager ("with" clauses) inside, and this way everything will
-    # free up the resources at the right time.
-    try:
-        # If this is executed on the CLSP grid, we will try to use the
-        # Grid Engine to distribute the tasks.
-        # Other clusters can also benefit from that, provided a cluster-specific wrapper.
-        # (see https://github.com/pzelasko/plz for reference)
-        #
-        # The following must be installed:
-        # $ pip install dask distributed
-        # $ pip install git+https://github.com/pzelasko/plz
-        name = subprocess.check_output("hostname -f", shell=True, text=True)
-        if name.strip().endswith(".clsp.jhu.edu"):
-            import plz
-            from distributed import Client
-            with plz.setup_cluster() as cluster:
-                cluster.scale(80)
-                yield Client(cluster)
-            return
-    except:
-        pass
-    # No need to return anything - compute_and_store_features
-    # will just instantiate the pool itself.
-    yield None
 def compute_fbank_musan():

local/download_lm.py

@@ -31,6 +31,8 @@ def download_lm():
             urlretrieve_progress(
                 f"{url}/{f}", filename=filename, desc=f"Downloading {filename}",
             )
+        else:
+            print(f'{filename} already exists - skipping')

         if ".gz" in str(filename):
             unzip_file = Path(os.path.splitext(filename)[0])
@@ -38,6 +40,8 @@ def download_lm():
                 with gzip.open(filename, "rb") as f_in:
                     with open(unzip_file, "wb") as f_out:
                         shutil.copyfileobj(f_in, f_out)
+            else:
+                print(f'{unzip_file} already exist - skipping')


 if __name__ == "__main__":

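The two added else branches make the script skip work that is already done: the download is skipped when the target file exists, and the gunzip step is skipped when the unpacked file exists. A minimal sketch of the resulting pattern, assuming urlretrieve_progress comes from lhotse.utils (the URL and file name are illustrative placeholders):

import gzip
import os
import shutil
from pathlib import Path

from lhotse.utils import urlretrieve_progress

# Illustrative URL and file name; the real script loops over several LM files.
url = "http://www.openslr.org/resources/11"
f = "3-gram.pruned.1e-7.arpa.gz"
filename = Path("data/lm") / f
filename.parent.mkdir(parents=True, exist_ok=True)

if not filename.is_file():
    urlretrieve_progress(
        f"{url}/{f}", filename=filename, desc=f"Downloading {filename}"
    )
else:
    print(f"{filename} already exists - skipping")

if ".gz" in str(filename):
    unzip_file = Path(os.path.splitext(filename)[0])
    if not unzip_file.is_file():
        with gzip.open(filename, "rb") as f_in, open(unzip_file, "wb") as f_out:
            shutil.copyfileobj(f_in, f_out)
    else:
        print(f"{unzip_file} already exists - skipping")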
prepare.sh

@@ -20,24 +20,30 @@ if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
   echo "stage 0: Download data"

   # If you have pre-downloaded it to /path/to/LibriSpeech,
-  # you can create a symlink to avoid downloading it again:
+  # you can create a symlink
   #
   # ln -sfv /path/to/LibriSpeech data/
   #
+  # The script checks that if
+  #
+  # data/LibriSpeech/test-clean/.completed exists,
+  #
+  # it will not re-download it.
+  #
+  # The same goes for dev-clean, dev-other, test-other, train-clean-100
+  # train-clean-360, and train-other-500
   mkdir -p data/LibriSpeech
+  if [ ! -f data/LibriSpeech/train-other-500/.completed ]; then
+    # It's compatible with kaldi's egs/librispeech/s5/local/download_and_untar.sh
     lhotse download librispeech --full data
+  fi

   # If you have pre-downloaded it to /path/to/musan,
-  # you can create a symlink to avoid downloading it again:
+  # you can create a symlink
   #
-  # ln -s /path/to/musan data/
+  # ln -sfv /path/to/musan data/
   #
-  if [ ! -f data/musan/.musan_completed ]; then
+  # and create a file data/.musan_completed
+  # to avoid downloading it again
+  if [ ! -f data/.musan_completed ]; then
     lhotse download musan data
   fi
 fi
@@ -65,7 +71,7 @@ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
 fi

 if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
-  echo "Stage 4: Compute fbank for librispeech"
+  echo "Stage 4: Compute fbank for musan"
   mkdir -p data/fbank
   ./local/compute_fbank_musan.py
 fi

icefall/__init__.py (new, empty file)

icefall/utils.py (new file)
@@ -0,0 +1,34 @@
+import subprocess
+from contextlib import contextmanager
+
+
+@contextmanager
+def get_executor():
+    # We'll either return a process pool or a distributed worker pool.
+    # Note that this has to be a context manager because we might use multiple
+    # context manager ("with" clauses) inside, and this way everything will
+    # free up the resources at the right time.
+    try:
+        # If this is executed on the CLSP grid, we will try to use the
+        # Grid Engine to distribute the tasks.
+        # Other clusters can also benefit from that, provided a
+        # cluster-specific wrapper.
+        # (see https://github.com/pzelasko/plz for reference)
+        #
+        # The following must be installed:
+        # $ pip install dask distributed
+        # $ pip install git+https://github.com/pzelasko/plz
+        name = subprocess.check_output("hostname -f", shell=True, text=True)
+        if name.strip().endswith(".clsp.jhu.edu"):
+            import plz
+            from distributed import Client
+
+            with plz.setup_cluster() as cluster:
+                cluster.scale(80)
+                yield Client(cluster)
+            return
+    except Exception:
+        pass
+    # No need to return anything - compute_and_store_features
+    # will just instantiate the pool itself.
+    yield None
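
get_executor is meant to be used as a context manager around feature extraction, which is how the compute_fbank_* scripts above use it. A minimal usage sketch, assuming lhotse's CutSet.compute_and_store_features API (the mel-bin count, job counts, and storage path are illustrative):

from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer

from icefall.utils import get_executor


def extract_fbank(
    cut_set: CutSet, storage_path: str, num_jobs: int = 15
) -> CutSet:
    # On the CLSP grid `ex` is a dask Client; elsewhere it is None and
    # lhotse falls back to a local process pool.
    with get_executor() as ex:
        return cut_set.compute_and_store_features(
            extractor=Fbank(FbankConfig(num_mel_bins=80)),
            storage_path=storage_path,
            num_jobs=num_jobs if ex is None else 80,
            executor=ex,
            storage_type=LilcomHdf5Writer,
        )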