https://github.com/k2-fsa/icefall.git

commit e005ea062c
parent f25eedf2d4

    Minor fixes after review.
.flake8 (6 lines changed)

@@ -1,2 +1,8 @@
 [flake8]
+show-source=true
+statistics=true
 max-line-length = 80
+
+exclude =
+  .git,
+  **/data/**
.gitignore (vendored, 2 lines changed)

@@ -1 +1,3 @@
 data
+__pycache__
+path.sh
local/compute_fbank_librispeech.py (2 hunks)

@@ -7,43 +7,12 @@ and generated fbank features are saved in data/fbank.
 """
 import os
-import subprocess
-from contextlib import contextmanager
 from pathlib import Path
 
-from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer, combine
+from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer
 from lhotse.recipes.utils import read_manifests_if_cached
 
-
-@contextmanager
-def get_executor():
-    # We'll either return a process pool or a distributed worker pool.
-    # Note that this has to be a context manager because we might use multiple
-    # context manager ("with" clauses) inside, and this way everything will
-    # free up the resources at the right time.
-    try:
-        # If this is executed on the CLSP grid, we will try to use the
-        # Grid Engine to distribute the tasks.
-        # Other clusters can also benefit from that, provided a cluster-specific wrapper.
-        # (see https://github.com/pzelasko/plz for reference)
-        #
-        # The following must be installed:
-        # $ pip install dask distributed
-        # $ pip install git+https://github.com/pzelasko/plz
-        name = subprocess.check_output("hostname -f", shell=True, text=True)
-        if name.strip().endswith(".clsp.jhu.edu"):
-            import plz
-            from distributed import Client
-
-            with plz.setup_cluster() as cluster:
-                cluster.scale(80)
-                yield Client(cluster)
-            return
-    except:
-        pass
-    # No need to return anything - compute_and_store_features
-    # will just instantiate the pool itself.
-    yield None
+from icefall.utils import get_executor
 
 
 def compute_fbank_librispeech():

@@ -75,7 +44,8 @@ def compute_fbank_librispeech():
             continue
         print("Processing", partition)
         cut_set = CutSet.from_manifests(
-            recordings=m["recordings"], supervisions=m["supervisions"],
+            recordings=m["recordings"],
+            supervisions=m["supervisions"],
         )
         if "train" in partition:
            cut_set = (
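Below is a minimal usage sketch (not part of this commit) of how the trimmed fbank scripts can consume the shared get_executor() helper together with lhotse; the function name extract_one_partition, the storage path, and the job counts are illustrative assumptions, not taken from the diff.

# Hypothetical sketch only: feeding get_executor() into lhotse's
# CutSet.compute_and_store_features; names and numbers below are assumptions.
from lhotse import Fbank, FbankConfig, LilcomHdf5Writer

from icefall.utils import get_executor


def extract_one_partition(cut_set, partition, output_dir="data/fbank"):
    extractor = Fbank(FbankConfig(num_mel_bins=80))
    # get_executor() yields a distributed Client on the CLSP grid, else None;
    # given None, lhotse falls back to its own local process pool.
    with get_executor() as ex:
        return cut_set.compute_and_store_features(
            extractor=extractor,
            storage_path=f"{output_dir}/feats_{partition}",
            num_jobs=15 if ex is None else 80,
            executor=ex,
            storage_type=LilcomHdf5Writer,
        )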
local/compute_fbank_musan.py (1 hunk)

@@ -7,43 +7,12 @@ and generated fbank features are saved in data/fbank.
 """
 import os
-import subprocess
-from contextlib import contextmanager
 from pathlib import Path
 
 from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer, combine
 from lhotse.recipes.utils import read_manifests_if_cached
 
-
-@contextmanager
-def get_executor():
-    # We'll either return a process pool or a distributed worker pool.
-    # Note that this has to be a context manager because we might use multiple
-    # context manager ("with" clauses) inside, and this way everything will
-    # free up the resources at the right time.
-    try:
-        # If this is executed on the CLSP grid, we will try to use the
-        # Grid Engine to distribute the tasks.
-        # Other clusters can also benefit from that, provided a cluster-specific wrapper.
-        # (see https://github.com/pzelasko/plz for reference)
-        #
-        # The following must be installed:
-        # $ pip install dask distributed
-        # $ pip install git+https://github.com/pzelasko/plz
-        name = subprocess.check_output("hostname -f", shell=True, text=True)
-        if name.strip().endswith(".clsp.jhu.edu"):
-            import plz
-            from distributed import Client
-
-            with plz.setup_cluster() as cluster:
-                cluster.scale(80)
-                yield Client(cluster)
-            return
-    except:
-        pass
-    # No need to return anything - compute_and_store_features
-    # will just instantiate the pool itself.
-    yield None
+from icefall.utils import get_executor
 
 
 def compute_fbank_musan():
local/download_lm.py (2 hunks)

@@ -31,6 +31,8 @@ def download_lm():
             urlretrieve_progress(
                 f"{url}/{f}", filename=filename, desc=f"Downloading {filename}",
             )
+        else:
+            print(f'{filename} already exists - skipping')
 
         if ".gz" in str(filename):
             unzip_file = Path(os.path.splitext(filename)[0])

@@ -38,6 +40,8 @@ def download_lm():
                 with gzip.open(filename, "rb") as f_in:
                     with open(unzip_file, "wb") as f_out:
                         shutil.copyfileobj(f_in, f_out)
+            else:
+                print(f'{unzip_file} already exist - skipping')
 
 
 if __name__ == "__main__":
prepare.sh (2 hunks)

@@ -20,24 +20,30 @@ if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
   echo "stage 0: Download data"
 
   # If you have pre-downloaded it to /path/to/LibriSpeech,
-  # you can create a symlink to avoid downloading it again:
+  # you can create a symlink
   #
   # ln -sfv /path/to/LibriSpeech data/
   #
+  # The script checks that if
+  #
+  # data/LibriSpeech/test-clean/.completed exists,
+  #
+  # it will not re-download it.
+  #
+  # The same goes for dev-clean, dev-other, test-other, train-clean-100
+  # train-clean-360, and train-other-500
+
   mkdir -p data/LibriSpeech
-  if [ ! -f data/LibriSpeech/train-other-500/.completed ]; then
-    # It's compatible with kaldi's egs/librispeech/s5/local/download_and_untar.sh
-    lhotse download librispeech --full data
-  fi
+  lhotse download librispeech --full data
 
   # If you have pre-downloaded it to /path/to/musan,
-  # you can create a symlink to avoid downloading it again:
+  # you can create a symlink
   #
-  # ln -s /path/to/musan data/
+  # ln -sfv /path/to/musan data/
   #
-  if [ ! -f data/musan/.musan_completed ]; then
+  # and create a file data/.musan_completed
+  # to avoid downloading it again
+  if [ ! -f data/.musan_completed ]; then
     lhotse download musan data
   fi
 fi

@@ -65,7 +71,7 @@ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
 fi
 
 if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
-  echo "Stage 4: Compute fbank for librispeech"
+  echo "Stage 4: Compute fbank for musan"
   mkdir -p data/fbank
   ./local/compute_fbank_musan.py
 fi
icefall/__init__.py (new empty file)

icefall/utils.py (new file, 34 lines)
@@ -0,0 +1,34 @@
+import subprocess
+from contextlib import contextmanager
+
+
+@contextmanager
+def get_executor():
+    # We'll either return a process pool or a distributed worker pool.
+    # Note that this has to be a context manager because we might use multiple
+    # context manager ("with" clauses) inside, and this way everything will
+    # free up the resources at the right time.
+    try:
+        # If this is executed on the CLSP grid, we will try to use the
+        # Grid Engine to distribute the tasks.
+        # Other clusters can also benefit from that, provided a
+        # cluster-specific wrapper.
+        # (see https://github.com/pzelasko/plz for reference)
+        #
+        # The following must be installed:
+        # $ pip install dask distributed
+        # $ pip install git+https://github.com/pzelasko/plz
+        name = subprocess.check_output("hostname -f", shell=True, text=True)
+        if name.strip().endswith(".clsp.jhu.edu"):
+            import plz
+            from distributed import Client
+
+            with plz.setup_cluster() as cluster:
+                cluster.scale(80)
+                yield Client(cluster)
+            return
+    except Exception:
+        pass
+    # No need to return anything - compute_and_store_features
+    # will just instantiate the pool itself.
+    yield None
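For orientation only (not from this commit), a hedged sketch of what the new context manager yields to a caller:

# Illustrative only: get_executor() yields a dask distributed.Client when the
# plz / Grid Engine path succeeds on the CLSP grid, otherwise None, in which
# case lhotse will create its own process pool for feature extraction.
from icefall.utils import get_executor

with get_executor() as ex:
    backend = "dask distributed Client" if ex is not None else "local process pool"
    print(f"feature extraction will use a {backend}")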