From e005ea062c2c9f79b51ccec269a1a62c2b07f5c9 Mon Sep 17 00:00:00 2001
From: Fangjun Kuang <csukuangfj@gmail.com>
Date: Tue, 20 Jul 2021 10:02:20 +0800
Subject: [PATCH] Minor fixes after review.

---
 .flake8                                       |  6 +++
 .gitignore                                    |  2 +
 .../ASR/local/compute_fbank_librispeech.py    | 38 ++-----------------
 .../ASR/local/compute_fbank_musan.py          | 33 +---------------
 egs/librispeech/ASR/local/download_lm.py      |  4 ++
 egs/librispeech/ASR/prepare.sh                | 26 ++++++++-----
 icefall/__init__.py                           |  0
 icefall/utils.py                              | 34 +++++++++++++++++
 8 files changed, 67 insertions(+), 76 deletions(-)
 create mode 100644 icefall/__init__.py
 create mode 100644 icefall/utils.py

diff --git a/.flake8 b/.flake8
index 15fc7e33e..090e97971 100644
--- a/.flake8
+++ b/.flake8
@@ -1,2 +1,8 @@
 [flake8]
+show-source=true
+statistics=true
 max-line-length = 80
+
+exclude =
+  .git,
+  **/data/**
diff --git a/.gitignore b/.gitignore
index 1269488f7..6c8274c5c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,3 @@
 data
+__pycache__
+path.sh
diff --git a/egs/librispeech/ASR/local/compute_fbank_librispeech.py b/egs/librispeech/ASR/local/compute_fbank_librispeech.py
index 0c55f7241..947d9f8d9 100755
--- a/egs/librispeech/ASR/local/compute_fbank_librispeech.py
+++ b/egs/librispeech/ASR/local/compute_fbank_librispeech.py
@@ -7,43 +7,12 @@ and generated fbank features are saved in data/fbank.
 """
 
 import os
-import subprocess
-from contextlib import contextmanager
 from pathlib import Path
 
-from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer, combine
+from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer
 from lhotse.recipes.utils import read_manifests_if_cached
 
-
-@contextmanager
-def get_executor():
-    # We'll either return a process pool or a distributed worker pool.
-    # Note that this has to be a context manager because we might use multiple
-    # context manager ("with" clauses) inside, and this way everything will
-    # free up the resources at the right time.
-    try:
-        # If this is executed on the CLSP grid, we will try to use the
-        # Grid Engine to distribute the tasks.
-        # Other clusters can also benefit from that, provided a cluster-specific wrapper.
-        # (see https://github.com/pzelasko/plz for reference)
-        #
-        # The following must be installed:
-        # $ pip install dask distributed
-        # $ pip install git+https://github.com/pzelasko/plz
-        name = subprocess.check_output("hostname -f", shell=True, text=True)
-        if name.strip().endswith(".clsp.jhu.edu"):
-            import plz
-            from distributed import Client
-
-            with plz.setup_cluster() as cluster:
-                cluster.scale(80)
-                yield Client(cluster)
-            return
-    except:
-        pass
-    # No need to return anything - compute_and_store_features
-    # will just instantiate the pool itself.
-    yield None
+from icefall.utils import get_executor
 
 
 def compute_fbank_librispeech():
@@ -75,7 +44,8 @@ def compute_fbank_librispeech():
                 continue
             print("Processing", partition)
             cut_set = CutSet.from_manifests(
-                recordings=m["recordings"], supervisions=m["supervisions"],
+                recordings=m["recordings"],
+                supervisions=m["supervisions"],
             )
             if "train" in partition:
                 cut_set = (
diff --git a/egs/librispeech/ASR/local/compute_fbank_musan.py b/egs/librispeech/ASR/local/compute_fbank_musan.py
index 41b19c656..d63131da8 100755
--- a/egs/librispeech/ASR/local/compute_fbank_musan.py
+++ b/egs/librispeech/ASR/local/compute_fbank_musan.py
@@ -7,43 +7,12 @@ and generated fbank features are saved in data/fbank.
 """
 
 import os
-import subprocess
-from contextlib import contextmanager
 from pathlib import Path
 
 from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer, combine
 from lhotse.recipes.utils import read_manifests_if_cached
 
-
-@contextmanager
-def get_executor():
-    # We'll either return a process pool or a distributed worker pool.
-    # Note that this has to be a context manager because we might use multiple
-    # context manager ("with" clauses) inside, and this way everything will
-    # free up the resources at the right time.
-    try:
-        # If this is executed on the CLSP grid, we will try to use the
-        # Grid Engine to distribute the tasks.
-        # Other clusters can also benefit from that, provided a cluster-specific wrapper.
-        # (see https://github.com/pzelasko/plz for reference)
-        #
-        # The following must be installed:
-        # $ pip install dask distributed
-        # $ pip install git+https://github.com/pzelasko/plz
-        name = subprocess.check_output("hostname -f", shell=True, text=True)
-        if name.strip().endswith(".clsp.jhu.edu"):
-            import plz
-            from distributed import Client
-
-            with plz.setup_cluster() as cluster:
-                cluster.scale(80)
-                yield Client(cluster)
-            return
-    except:
-        pass
-    # No need to return anything - compute_and_store_features
-    # will just instantiate the pool itself.
-    yield None
+from icefall.utils import get_executor
 
 
 def compute_fbank_musan():
diff --git a/egs/librispeech/ASR/local/download_lm.py b/egs/librispeech/ASR/local/download_lm.py
index 7df864680..47251a5a0 100755
--- a/egs/librispeech/ASR/local/download_lm.py
+++ b/egs/librispeech/ASR/local/download_lm.py
@@ -31,6 +31,8 @@ def download_lm():
             urlretrieve_progress(
                 f"{url}/{f}", filename=filename, desc=f"Downloading {filename}",
             )
+        else:
+            print(f'{filename} already exists - skipping')
 
         if ".gz" in str(filename):
             unzip_file = Path(os.path.splitext(filename)[0])
@@ -38,6 +40,8 @@ def download_lm():
                 with gzip.open(filename, "rb") as f_in:
                     with open(unzip_file, "wb") as f_out:
                         shutil.copyfileobj(f_in, f_out)
+            else:
+                print(f'{unzip_file} already exist - skipping')
 
 
 if __name__ == "__main__":
diff --git a/egs/librispeech/ASR/prepare.sh b/egs/librispeech/ASR/prepare.sh
index 1602b9203..f0b10b226 100755
--- a/egs/librispeech/ASR/prepare.sh
+++ b/egs/librispeech/ASR/prepare.sh
@@ -20,24 +20,30 @@ if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
   echo "stage 0: Download data"
 
   # If you have pre-downloaded it to /path/to/LibriSpeech,
-  # you can create a symlink to avoid downloading it again:
+  # you can create a symlink
   #
   #   ln -sfv /path/to/LibriSpeech data/
   #
+  # The script checks whether
+  #
+  #   data/LibriSpeech/test-clean/.completed
+  #
+  # exists; if it does, test-clean will not be downloaded again.
+  #
+  # The same goes for dev-clean, dev-other, test-other, train-clean-100,
+  # train-clean-360, and train-other-500
 
   mkdir -p data/LibriSpeech
-
-  if [ ! -f data/LibriSpeech/train-other-500/.completed ]; then
-    # It's compatible with kaldi's egs/librispeech/s5/local/download_and_untar.sh
-    lhotse download librispeech --full data
-  fi
+  lhotse download librispeech --full data
 
   # If you have pre-downloaded it to /path/to/musan,
-  # you can create a symlink to avoid downloading it again:
+  # you can create a symlink
   #
-  #   ln -s /path/to/musan data/
+  #   ln -sfv /path/to/musan data/
   #
-  if [ ! -f data/musan/.musan_completed ]; then
+  # and create a file data/.musan_completed
+  # to avoid downloading it again
+  if [ ! -f data/.musan_completed ]; then
     lhotse download musan data
   fi
 fi
@@ -65,7 +71,7 @@ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
 fi
 
 if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
-  echo "Stage 4: Compute fbank for librispeech"
+  echo "Stage 4: Compute fbank for musan"
   mkdir -p data/fbank
   ./local/compute_fbank_musan.py
 fi
diff --git a/icefall/__init__.py b/icefall/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/icefall/utils.py b/icefall/utils.py
new file mode 100644
index 000000000..cc2513863
--- /dev/null
+++ b/icefall/utils.py
@@ -0,0 +1,34 @@
+import subprocess
+from contextlib import contextmanager
+
+
+@contextmanager
+def get_executor():
+    # We'll either return a process pool or a distributed worker pool.
+    # Note that this has to be a context manager because we might use multiple
+    # context managers ("with" clauses) inside, and this way everything will
+    # free up the resources at the right time.
+    try:
+        # If this is executed on the CLSP grid, we will try to use the
+        # Grid Engine to distribute the tasks.
+        # Other clusters can also benefit from that, provided a
+        # cluster-specific wrapper.
+        # (see https://github.com/pzelasko/plz for reference)
+        #
+        # The following must be installed:
+        # $ pip install dask distributed
+        # $ pip install git+https://github.com/pzelasko/plz
+        name = subprocess.check_output("hostname -f", shell=True, text=True)
+        if name.strip().endswith(".clsp.jhu.edu"):
+            import plz
+            from distributed import Client
+
+            with plz.setup_cluster() as cluster:
+                cluster.scale(80)
+                yield Client(cluster)
+            return
+    except Exception:
+        pass
+    # No need to return anything - compute_and_store_features
+    # will just instantiate the pool itself.
+    yield None