From e005ea062c2c9f79b51ccec269a1a62c2b07f5c9 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Tue, 20 Jul 2021 10:02:20 +0800 Subject: [PATCH] Minor fixes after review. --- .flake8 | 6 +++ .gitignore | 2 + .../ASR/local/compute_fbank_librispeech.py | 38 ++----------------- .../ASR/local/compute_fbank_musan.py | 33 +--------------- egs/librispeech/ASR/local/download_lm.py | 4 ++ egs/librispeech/ASR/prepare.sh | 26 ++++++++----- icefall/__init__.py | 0 icefall/utils.py | 34 +++++++++++++++++ 8 files changed, 67 insertions(+), 76 deletions(-) create mode 100644 icefall/__init__.py create mode 100644 icefall/utils.py diff --git a/.flake8 b/.flake8 index 15fc7e33e..090e97971 100644 --- a/.flake8 +++ b/.flake8 @@ -1,2 +1,8 @@ [flake8] +show-source=true +statistics=true max-line-length = 80 + +exclude = + .git, + **/data/** diff --git a/.gitignore b/.gitignore index 1269488f7..6c8274c5c 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,3 @@ data +__pycache__ +path.sh diff --git a/egs/librispeech/ASR/local/compute_fbank_librispeech.py b/egs/librispeech/ASR/local/compute_fbank_librispeech.py index 0c55f7241..947d9f8d9 100755 --- a/egs/librispeech/ASR/local/compute_fbank_librispeech.py +++ b/egs/librispeech/ASR/local/compute_fbank_librispeech.py @@ -7,43 +7,12 @@ and generated fbank features are saved in data/fbank. """ import os -import subprocess -from contextlib import contextmanager from pathlib import Path -from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer, combine +from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer from lhotse.recipes.utils import read_manifests_if_cached - -@contextmanager -def get_executor(): - # We'll either return a process pool or a distributed worker pool. - # Note that this has to be a context manager because we might use multiple - # context manager ("with" clauses) inside, and this way everything will - # free up the resources at the right time. - try: - # If this is executed on the CLSP grid, we will try to use the - # Grid Engine to distribute the tasks. - # Other clusters can also benefit from that, provided a cluster-specific wrapper. - # (see https://github.com/pzelasko/plz for reference) - # - # The following must be installed: - # $ pip install dask distributed - # $ pip install git+https://github.com/pzelasko/plz - name = subprocess.check_output("hostname -f", shell=True, text=True) - if name.strip().endswith(".clsp.jhu.edu"): - import plz - from distributed import Client - - with plz.setup_cluster() as cluster: - cluster.scale(80) - yield Client(cluster) - return - except: - pass - # No need to return anything - compute_and_store_features - # will just instantiate the pool itself. - yield None +from icefall.utils import get_executor def compute_fbank_librispeech(): @@ -75,7 +44,8 @@ def compute_fbank_librispeech(): continue print("Processing", partition) cut_set = CutSet.from_manifests( - recordings=m["recordings"], supervisions=m["supervisions"], + recordings=m["recordings"], + supervisions=m["supervisions"], ) if "train" in partition: cut_set = ( diff --git a/egs/librispeech/ASR/local/compute_fbank_musan.py b/egs/librispeech/ASR/local/compute_fbank_musan.py index 41b19c656..d63131da8 100755 --- a/egs/librispeech/ASR/local/compute_fbank_musan.py +++ b/egs/librispeech/ASR/local/compute_fbank_musan.py @@ -7,43 +7,12 @@ and generated fbank features are saved in data/fbank. """ import os -import subprocess -from contextlib import contextmanager from pathlib import Path from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer, combine from lhotse.recipes.utils import read_manifests_if_cached - -@contextmanager -def get_executor(): - # We'll either return a process pool or a distributed worker pool. - # Note that this has to be a context manager because we might use multiple - # context manager ("with" clauses) inside, and this way everything will - # free up the resources at the right time. - try: - # If this is executed on the CLSP grid, we will try to use the - # Grid Engine to distribute the tasks. - # Other clusters can also benefit from that, provided a cluster-specific wrapper. - # (see https://github.com/pzelasko/plz for reference) - # - # The following must be installed: - # $ pip install dask distributed - # $ pip install git+https://github.com/pzelasko/plz - name = subprocess.check_output("hostname -f", shell=True, text=True) - if name.strip().endswith(".clsp.jhu.edu"): - import plz - from distributed import Client - - with plz.setup_cluster() as cluster: - cluster.scale(80) - yield Client(cluster) - return - except: - pass - # No need to return anything - compute_and_store_features - # will just instantiate the pool itself. - yield None +from icefall.utils import get_executor def compute_fbank_musan(): diff --git a/egs/librispeech/ASR/local/download_lm.py b/egs/librispeech/ASR/local/download_lm.py index 7df864680..47251a5a0 100755 --- a/egs/librispeech/ASR/local/download_lm.py +++ b/egs/librispeech/ASR/local/download_lm.py @@ -31,6 +31,8 @@ def download_lm(): urlretrieve_progress( f"{url}/{f}", filename=filename, desc=f"Downloading {filename}", ) + else: + print(f'{filename} already exists - skipping') if ".gz" in str(filename): unzip_file = Path(os.path.splitext(filename)[0]) @@ -38,6 +40,8 @@ def download_lm(): with gzip.open(filename, "rb") as f_in: with open(unzip_file, "wb") as f_out: shutil.copyfileobj(f_in, f_out) + else: + print(f'{unzip_file} already exist - skipping') if __name__ == "__main__": diff --git a/egs/librispeech/ASR/prepare.sh b/egs/librispeech/ASR/prepare.sh index 1602b9203..f0b10b226 100755 --- a/egs/librispeech/ASR/prepare.sh +++ b/egs/librispeech/ASR/prepare.sh @@ -20,24 +20,30 @@ if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then echo "stage 0: Download data" # If you have pre-downloaded it to /path/to/LibriSpeech, - # you can create a symlink to avoid downloading it again: + # you can create a symlink # # ln -sfv /path/to/LibriSpeech data/ # + # The script checks that if + # + # data/LibriSpeech/test-clean/.completed exists, + # + # it will not re-download it. + # + # The same goes for dev-clean, dev-other, test-other, train-clean-100 + # train-clean-360, and train-other-500 mkdir -p data/LibriSpeech - - if [ ! -f data/LibriSpeech/train-other-500/.completed ]; then - # It's compatible with kaldi's egs/librispeech/s5/local/download_and_untar.sh - lhotse download librispeech --full data - fi + lhotse download librispeech --full data # If you have pre-downloaded it to /path/to/musan, - # you can create a symlink to avoid downloading it again: + # you can create a symlink # - # ln -s /path/to/musan data/ + # ln -sfv /path/to/musan data/ # - if [ ! -f data/musan/.musan_completed ]; then + # and create a file data/.musan_completed + # to avoid downloading it again + if [ ! -f data/.musan_completed ]; then lhotse download musan data fi fi @@ -65,7 +71,7 @@ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then fi if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then - echo "Stage 4: Compute fbank for librispeech" + echo "Stage 4: Compute fbank for musan" mkdir -p data/fbank ./local/compute_fbank_musan.py fi diff --git a/icefall/__init__.py b/icefall/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/icefall/utils.py b/icefall/utils.py new file mode 100644 index 000000000..cc2513863 --- /dev/null +++ b/icefall/utils.py @@ -0,0 +1,34 @@ +import subprocess +from contextlib import contextmanager + + +@contextmanager +def get_executor(): + # We'll either return a process pool or a distributed worker pool. + # Note that this has to be a context manager because we might use multiple + # context manager ("with" clauses) inside, and this way everything will + # free up the resources at the right time. + try: + # If this is executed on the CLSP grid, we will try to use the + # Grid Engine to distribute the tasks. + # Other clusters can also benefit from that, provided a + # cluster-specific wrapper. + # (see https://github.com/pzelasko/plz for reference) + # + # The following must be installed: + # $ pip install dask distributed + # $ pip install git+https://github.com/pzelasko/plz + name = subprocess.check_output("hostname -f", shell=True, text=True) + if name.strip().endswith(".clsp.jhu.edu"): + import plz + from distributed import Client + + with plz.setup_cluster() as cluster: + cluster.scale(80) + yield Client(cluster) + return + except Exception: + pass + # No need to return anything - compute_and_store_features + # will just instantiate the pool itself. + yield None