Minor fixes after review.

Fangjun Kuang 2021-07-20 10:02:20 +08:00
parent f25eedf2d4
commit e005ea062c
8 changed files with 67 additions and 76 deletions

.flake8

@@ -1,2 +1,8 @@
[flake8]
show-source=true
statistics=true
max-line-length = 80
exclude =
.git,
**/data/**

.gitignore

@@ -1 +1,3 @@
data
__pycache__
path.sh

local/compute_fbank_librispeech.py

@@ -7,43 +7,12 @@ and generated fbank features are saved in data/fbank.
"""
import os
import subprocess
from contextlib import contextmanager
from pathlib import Path
from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer, combine
from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer
from lhotse.recipes.utils import read_manifests_if_cached
@contextmanager
def get_executor():
# We'll either return a process pool or a distributed worker pool.
# Note that this has to be a context manager because we might use multiple
# context manager ("with" clauses) inside, and this way everything will
# free up the resources at the right time.
try:
# If this is executed on the CLSP grid, we will try to use the
# Grid Engine to distribute the tasks.
# Other clusters can also benefit from that, provided a cluster-specific wrapper.
# (see https://github.com/pzelasko/plz for reference)
#
# The following must be installed:
# $ pip install dask distributed
# $ pip install git+https://github.com/pzelasko/plz
name = subprocess.check_output("hostname -f", shell=True, text=True)
if name.strip().endswith(".clsp.jhu.edu"):
import plz
from distributed import Client
with plz.setup_cluster() as cluster:
cluster.scale(80)
yield Client(cluster)
return
except:
pass
# No need to return anything - compute_and_store_features
# will just instantiate the pool itself.
yield None
from icefall.utils import get_executor
def compute_fbank_librispeech():
@@ -75,7 +44,8 @@ def compute_fbank_librispeech():
continue
print("Processing", partition)
cut_set = CutSet.from_manifests(
recordings=m["recordings"], supervisions=m["supervisions"],
recordings=m["recordings"],
supervisions=m["supervisions"],
)
if "train" in partition:
cut_set = (
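The hunk is truncated at the assignment above. As a hedged sketch only, the training branch in lhotse-based recipes typically augments the cuts with speed perturbation before feature extraction; the 0.9/1.1 factors below are an assumption, not shown in this diff.

# Hypothetical continuation of the truncated "cut_set = (" assignment;
# the speed-perturbation factors are assumed, following common lhotse recipes.
if "train" in partition:
    cut_set = (
        cut_set
        + cut_set.perturb_speed(0.9)
        + cut_set.perturb_speed(1.1)
    )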

local/compute_fbank_musan.py

@@ -7,43 +7,12 @@ and generated fbank features are saved in data/fbank.
"""
import os
import subprocess
from contextlib import contextmanager
from pathlib import Path
from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer, combine
from lhotse.recipes.utils import read_manifests_if_cached
@contextmanager
def get_executor():
# We'll either return a process pool or a distributed worker pool.
# Note that this has to be a context manager because we might use multiple
# context manager ("with" clauses) inside, and this way everything will
# free up the resources at the right time.
try:
# If this is executed on the CLSP grid, we will try to use the
# Grid Engine to distribute the tasks.
# Other clusters can also benefit from that, provided a cluster-specific wrapper.
# (see https://github.com/pzelasko/plz for reference)
#
# The following must be installed:
# $ pip install dask distributed
# $ pip install git+https://github.com/pzelasko/plz
name = subprocess.check_output("hostname -f", shell=True, text=True)
if name.strip().endswith(".clsp.jhu.edu"):
import plz
from distributed import Client
with plz.setup_cluster() as cluster:
cluster.scale(80)
yield Client(cluster)
return
except:
pass
# No need to return anything - compute_and_store_features
# will just instantiate the pool itself.
yield None
from icefall.utils import get_executor
def compute_fbank_musan():

local/download_lm.py

@@ -31,6 +31,8 @@ def download_lm():
urlretrieve_progress(
f"{url}/{f}", filename=filename, desc=f"Downloading {filename}",
)
else:
print(f'{filename} already exists - skipping')
if ".gz" in str(filename):
unzip_file = Path(os.path.splitext(filename)[0])
@@ -38,6 +40,8 @@ def download_lm():
with gzip.open(filename, "rb") as f_in:
with open(unzip_file, "wb") as f_out:
shutil.copyfileobj(f_in, f_out)
else:
print(f'{unzip_file} already exists - skipping')
if __name__ == "__main__":
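Taken together, the two hunks above add "skip" messages to a download-and-unzip loop. A rough, self-contained sketch of that pattern follows; the URL, file list, and output directory are placeholders, and urlretrieve_progress is assumed to come from lhotse.utils, as in typical lhotse recipes.

# Hedged sketch of the skip-if-present download/unzip pattern; the URL and
# file names are placeholders, not the actual values in download_lm.py.
import gzip
import os
import shutil
from pathlib import Path

from lhotse.utils import urlretrieve_progress


def download_lm(url="http://www.openslr.org/resources/11", out_dir="data/lm"):
    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)
    for f in ["3-gram.pruned.1e-7.arpa.gz", "librispeech-vocab.txt"]:
        filename = out / f
        if not filename.is_file():
            urlretrieve_progress(
                f"{url}/{f}",
                filename=filename,
                desc=f"Downloading {filename}",
            )
        else:
            print(f"{filename} already exists - skipping")

        if ".gz" in str(filename):
            unzip_file = Path(os.path.splitext(filename)[0])
            if not unzip_file.is_file():
                # Decompress the .gz file next to the original download.
                with gzip.open(filename, "rb") as f_in:
                    with open(unzip_file, "wb") as f_out:
                        shutil.copyfileobj(f_in, f_out)
            else:
                print(f"{unzip_file} already exists - skipping")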

prepare.sh

@@ -20,24 +20,30 @@ if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
echo "stage 0: Download data"
# If you have pre-downloaded it to /path/to/LibriSpeech,
# you can create a symlink to avoid downloading it again:
# you can create a symlink
#
# ln -sfv /path/to/LibriSpeech data/
#
# Note that if
#
# data/LibriSpeech/test-clean/.completed exists,
#
# it will not re-download it.
#
# The same goes for dev-clean, dev-other, test-other, train-clean-100,
# train-clean-360, and train-other-500.
mkdir -p data/LibriSpeech
if [ ! -f data/LibriSpeech/train-other-500/.completed ]; then
# It's compatible with kaldi's egs/librispeech/s5/local/download_and_untar.sh
lhotse download librispeech --full data
fi
lhotse download librispeech --full data
# If you have pre-downloaded it to /path/to/musan,
# you can create a symlink to avoid downloading it again:
# you can create a symlink
#
# ln -s /path/to/musan data/
# ln -sfv /path/to/musan data/
#
if [ ! -f data/musan/.musan_completed ]; then
# and create a file data/.musan_completed
# to avoid downloading it again
if [ ! -f data/.musan_completed ]; then
lhotse download musan data
fi
fi
@@ -65,7 +71,7 @@ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
fi
if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
echo "Stage 4: Compute fbank for librispeech"
echo "Stage 4: Compute fbank for musan"
mkdir -p data/fbank
./local/compute_fbank_musan.py
fi

icefall/__init__.py (new empty file)

icefall/utils.py (new file)

@@ -0,0 +1,34 @@
import subprocess
from contextlib import contextmanager
@contextmanager
def get_executor():
# We'll either return a process pool or a distributed worker pool.
# Note that this has to be a context manager because we might use multiple
# context manager ("with" clauses) inside, and this way everything will
# free up the resources at the right time.
try:
# If this is executed on the CLSP grid, we will try to use the
# Grid Engine to distribute the tasks.
# Other clusters can also benefit from that, provided a
# cluster-specific wrapper.
# (see https://github.com/pzelasko/plz for reference)
#
# The following must be installed:
# $ pip install dask distributed
# $ pip install git+https://github.com/pzelasko/plz
name = subprocess.check_output("hostname -f", shell=True, text=True)
if name.strip().endswith(".clsp.jhu.edu"):
import plz
from distributed import Client
with plz.setup_cluster() as cluster:
cluster.scale(80)
yield Client(cluster)
return
except Exception:
pass
# No need to return anything - compute_and_store_features
# will just instantiate the pool itself.
yield None
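For reference, a minimal usage sketch of this helper in the spirit of the compute_fbank_*.py scripts above; the extractor configuration, job counts, and storage path are illustrative assumptions rather than the exact values used in those scripts.

# Illustrative only: how a feature-extraction script can use get_executor();
# num_mel_bins, num_jobs, and the storage path are assumptions.
from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer

from icefall.utils import get_executor


def extract_fbank(cut_set: CutSet, storage_path: str) -> CutSet:
    extractor = Fbank(FbankConfig(num_mel_bins=80))
    with get_executor() as ex:  # ex is None, or a distributed Client on the CLSP grid
        return cut_set.compute_and_store_features(
            extractor=extractor,
            storage_path=storage_path,
            num_jobs=15 if ex is None else 80,
            executor=ex,
            storage_type=LilcomHdf5Writer,
        )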