https://github.com/k2-fsa/icefall.git

commit e005ea062c
parent f25eedf2d4

    Minor fixes after review.
.flake8 (6 lines changed)

@@ -1,2 +1,8 @@
 [flake8]
+show-source=true
+statistics=true
 max-line-length = 80
+
+exclude =
+  .git,
+  **/data/**
.gitignore (vendored, 2 lines changed)

@@ -1 +1,3 @@
 data
+__pycache__
+path.sh
local/compute_fbank_librispeech.py (2 hunks)

@@ -7,43 +7,12 @@ and generated fbank features are saved in data/fbank.
 """
 import os
-import subprocess
-from contextlib import contextmanager
 from pathlib import Path
 
-from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer, combine
+from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer
 from lhotse.recipes.utils import read_manifests_if_cached
 
-
-@contextmanager
-def get_executor():
-    # We'll either return a process pool or a distributed worker pool.
-    # Note that this has to be a context manager because we might use multiple
-    # context manager ("with" clauses) inside, and this way everything will
-    # free up the resources at the right time.
-    try:
-        # If this is executed on the CLSP grid, we will try to use the
-        # Grid Engine to distribute the tasks.
-        # Other clusters can also benefit from that, provided a cluster-specific wrapper.
-        # (see https://github.com/pzelasko/plz for reference)
-        #
-        # The following must be installed:
-        # $ pip install dask distributed
-        # $ pip install git+https://github.com/pzelasko/plz
-        name = subprocess.check_output("hostname -f", shell=True, text=True)
-        if name.strip().endswith(".clsp.jhu.edu"):
-            import plz
-            from distributed import Client
-
-            with plz.setup_cluster() as cluster:
-                cluster.scale(80)
-                yield Client(cluster)
-            return
-    except:
-        pass
-    # No need to return anything - compute_and_store_features
-    # will just instantiate the pool itself.
-    yield None
+from icefall.utils import get_executor
 
 
 def compute_fbank_librispeech():

@@ -75,7 +44,8 @@ def compute_fbank_librispeech():
             continue
         print("Processing", partition)
         cut_set = CutSet.from_manifests(
-            recordings=m["recordings"], supervisions=m["supervisions"],
+            recordings=m["recordings"],
+            supervisions=m["supervisions"],
         )
         if "train" in partition:
            cut_set = (
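Below is a minimal usage sketch (not part of this commit) of how the trimmed fbank scripts can consume the shared get_executor() helper together with lhotse; the function name extract_one_partition, the storage path, and the job counts are illustrative assumptions, not taken from the diff.

# Hypothetical sketch only: feeding get_executor() into lhotse's
# CutSet.compute_and_store_features; names and numbers below are assumptions.
from lhotse import Fbank, FbankConfig, LilcomHdf5Writer

from icefall.utils import get_executor


def extract_one_partition(cut_set, partition, output_dir="data/fbank"):
    extractor = Fbank(FbankConfig(num_mel_bins=80))
    # get_executor() yields a distributed Client on the CLSP grid, else None;
    # given None, lhotse falls back to its own local process pool.
    with get_executor() as ex:
        return cut_set.compute_and_store_features(
            extractor=extractor,
            storage_path=f"{output_dir}/feats_{partition}",
            num_jobs=15 if ex is None else 80,
            executor=ex,
            storage_type=LilcomHdf5Writer,
        )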
local/compute_fbank_musan.py (1 hunk)

@@ -7,43 +7,12 @@ and generated fbank features are saved in data/fbank.
 """
 import os
-import subprocess
-from contextlib import contextmanager
 from pathlib import Path
 
 from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer, combine
 from lhotse.recipes.utils import read_manifests_if_cached
 
-
-@contextmanager
-def get_executor():
-    # We'll either return a process pool or a distributed worker pool.
-    # Note that this has to be a context manager because we might use multiple
-    # context manager ("with" clauses) inside, and this way everything will
-    # free up the resources at the right time.
-    try:
-        # If this is executed on the CLSP grid, we will try to use the
-        # Grid Engine to distribute the tasks.
-        # Other clusters can also benefit from that, provided a cluster-specific wrapper.
-        # (see https://github.com/pzelasko/plz for reference)
-        #
-        # The following must be installed:
-        # $ pip install dask distributed
-        # $ pip install git+https://github.com/pzelasko/plz
-        name = subprocess.check_output("hostname -f", shell=True, text=True)
-        if name.strip().endswith(".clsp.jhu.edu"):
-            import plz
-            from distributed import Client
-
-            with plz.setup_cluster() as cluster:
-                cluster.scale(80)
-                yield Client(cluster)
-            return
-    except:
-        pass
-    # No need to return anything - compute_and_store_features
-    # will just instantiate the pool itself.
-    yield None
+from icefall.utils import get_executor
 
 
 def compute_fbank_musan():
local/download_lm.py (2 hunks)

@@ -31,6 +31,8 @@ def download_lm():
             urlretrieve_progress(
                 f"{url}/{f}", filename=filename, desc=f"Downloading {filename}",
             )
+        else:
+            print(f'{filename} already exists - skipping')
 
         if ".gz" in str(filename):
             unzip_file = Path(os.path.splitext(filename)[0])

@@ -38,6 +40,8 @@ def download_lm():
                 with gzip.open(filename, "rb") as f_in:
                     with open(unzip_file, "wb") as f_out:
                         shutil.copyfileobj(f_in, f_out)
+            else:
+                print(f'{unzip_file} already exist - skipping')
 
 
 if __name__ == "__main__":
prepare.sh (2 hunks)

@@ -20,24 +20,30 @@ if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
   echo "stage 0: Download data"
 
   # If you have pre-downloaded it to /path/to/LibriSpeech,
-  # you can create a symlink to avoid downloading it again:
+  # you can create a symlink
   #
   # ln -sfv /path/to/LibriSpeech data/
   #
+  # The script checks that if
+  #
+  # data/LibriSpeech/test-clean/.completed exists,
+  #
+  # it will not re-download it.
+  #
+  # The same goes for dev-clean, dev-other, test-other, train-clean-100
+  # train-clean-360, and train-other-500
+
   mkdir -p data/LibriSpeech
-  if [ ! -f data/LibriSpeech/train-other-500/.completed ]; then
-    # It's compatible with kaldi's egs/librispeech/s5/local/download_and_untar.sh
-    lhotse download librispeech --full data
-  fi
+  lhotse download librispeech --full data
 
   # If you have pre-downloaded it to /path/to/musan,
-  # you can create a symlink to avoid downloading it again:
+  # you can create a symlink
   #
-  # ln -s /path/to/musan data/
+  # ln -sfv /path/to/musan data/
   #
-  if [ ! -f data/musan/.musan_completed ]; then
+  # and create a file data/.musan_completed
+  # to avoid downloading it again
+  if [ ! -f data/.musan_completed ]; then
     lhotse download musan data
   fi
 fi

@@ -65,7 +71,7 @@ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
 fi
 
 if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
-  echo "Stage 4: Compute fbank for librispeech"
+  echo "Stage 4: Compute fbank for musan"
   mkdir -p data/fbank
   ./local/compute_fbank_musan.py
 fi
icefall/__init__.py (new empty file)

icefall/utils.py (new file, 34 lines)
@@ -0,0 +1,34 @@
+import subprocess
+from contextlib import contextmanager
+
+
+@contextmanager
+def get_executor():
+    # We'll either return a process pool or a distributed worker pool.
+    # Note that this has to be a context manager because we might use multiple
+    # context manager ("with" clauses) inside, and this way everything will
+    # free up the resources at the right time.
+    try:
+        # If this is executed on the CLSP grid, we will try to use the
+        # Grid Engine to distribute the tasks.
+        # Other clusters can also benefit from that, provided a
+        # cluster-specific wrapper.
+        # (see https://github.com/pzelasko/plz for reference)
+        #
+        # The following must be installed:
+        # $ pip install dask distributed
+        # $ pip install git+https://github.com/pzelasko/plz
+        name = subprocess.check_output("hostname -f", shell=True, text=True)
+        if name.strip().endswith(".clsp.jhu.edu"):
+            import plz
+            from distributed import Client
+
+            with plz.setup_cluster() as cluster:
+                cluster.scale(80)
+                yield Client(cluster)
+            return
+    except Exception:
+        pass
+    # No need to return anything - compute_and_store_features
+    # will just instantiate the pool itself.
+    yield None
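For orientation only (not from this commit), a hedged sketch of what the new context manager yields to a caller:

# Illustrative only: get_executor() yields a dask distributed.Client when the
# plz / Grid Engine path succeeds on the CLSP grid, otherwise None, in which
# case lhotse will create its own process pool for feature extraction.
from icefall.utils import get_executor

with get_executor() as ex:
    backend = "dask distributed Client" if ex is not None else "local process pool"
    print(f"feature extraction will use a {backend}")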