Compute features of librispeech and musan.

2025-12-08 21:45:27 +00:00 · 2021-07-19 23:35:32 +08:00 · 2021-07-19 23:35:32 +08:00 · 0b19aa09c1
commit 0b19aa09c1
parent 40eed74460
8 changed files with 322 additions and 7 deletions
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -3,16 +3,19 @@ repos:
    rev: 21.6b0
    hooks:
      - id: black
+        args: [--line-length=80]

  - repo: https://github.com/PyCQA/flake8
    rev: 3.9.2
    hooks:
      - id: flake8
+        args: [--max-line-length=80]

  - repo: https://github.com/pycqa/isort
    rev: 5.9.2
    hooks:
      - id: isort
+        args: [--profile=black]

  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.0.1
--- a/egs/librispeech/ASR/local/compute_fbank_librispeech.py
+++ b/egs/librispeech/ASR/local/compute_fbank_librispeech.py
@ -0,0 +1,98 @@
+#!/usr/bin/env python3
+
+"""
+This file computes fbank features of the librispeech dataset.
+It looks for manifests in the directory data/manifests
+and saves the generated fbank features in data/fbank.
+"""
+
+import os
+import subprocess
+from contextlib import contextmanager
+from pathlib import Path
+
+from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer, combine
+from lhotse.recipes.utils import read_manifests_if_cached
+
+
+@contextmanager
+def get_executor():
+    """Yield the executor to use for feature extraction.
+
+    On the CLSP grid (hostname ending in ``.clsp.jhu.edu``) this tries to
+    start a cluster through ``plz`` and yields a ``distributed.Client``
+    connected to it. Everywhere else, or when the cluster cannot be set
+    up, it yields ``None`` so that ``compute_and_store_features`` creates
+    its own pool.
+
+    This has to be a context manager because several context managers are
+    entered inside, and this way all of them release their resources at
+    the right time.
+
+    Exceptions raised inside the caller's ``with`` body are propagated
+    unchanged; only failures while *setting up* the executor are treated
+    as best effort.
+    """
+    from contextlib import ExitStack
+
+    try:
+        name = subprocess.check_output("hostname -f", shell=True, text=True)
+    except (OSError, subprocess.SubprocessError):
+        # Cannot determine the host name: behave as on a regular machine.
+        name = ""
+
+    with ExitStack() as stack:
+        client = None
+        if name.strip().endswith(".clsp.jhu.edu"):
+            # If this is executed on the CLSP grid, we will try to use the
+            # Grid Engine to distribute the tasks.
+            # Other clusters can also benefit from that, provided a
+            # cluster-specific wrapper.
+            # (see https://github.com/pzelasko/plz for reference)
+            #
+            # The following must be installed:
+            # $ pip install dask distributed
+            # $ pip install git+https://github.com/pzelasko/plz
+            try:
+                import plz
+                from distributed import Client
+
+                cluster = stack.enter_context(plz.setup_cluster())
+                cluster.scale(80)
+                client = stack.enter_context(Client(cluster))
+            except Exception as exc:
+                # Best effort: any failure to reach the grid falls back to
+                # the local pool instead of aborting feature extraction.
+                print(
+                    f"Could not set up the distributed executor ({exc!r}); "
+                    "falling back to a local pool."
+                )
+                # Release whatever was already entered (e.g. the cluster).
+                stack.close()
+                client = None
+        # The yield is deliberately outside of any try/except: swallowing
+        # an exception thrown in here would make the generator yield twice.
+        yield client
+
+
+def compute_fbank_librispeech():
+    """Compute 80-bin fbank features for every librispeech partition.
+
+    Manifests are read from ``data/manifests``. For each partition the
+    features are stored under ``data/fbank/feats_<partition>`` and the
+    resulting cuts are written to ``data/fbank/cuts_<partition>.json.gz``.
+    A partition whose cuts file already exists is skipped. Training
+    partitions are extended with 0.9x and 1.1x speed-perturbed copies
+    before feature extraction.
+    """
+    src_dir = Path("data/manifests")
+    output_dir = Path("data/fbank")
+    # os.cpu_count() returns None when the number of CPUs is undetermined.
+    num_jobs = min(15, os.cpu_count() or 1)
+    num_mel_bins = 80
+
+    dataset_parts = (
+        "dev-clean",
+        "dev-other",
+        "test-clean",
+        "test-other",
+        "train-clean-100",
+        "train-clean-360",
+        "train-other-500",
+    )
+    manifests = read_manifests_if_cached(
+        dataset_parts=dataset_parts, output_dir=src_dir
+    )
+    assert manifests is not None, (
+        f"Could not read the librispeech manifests from {src_dir}; "
+        "prepare the manifests first."
+    )
+
+    # Make the script usable on its own, without prepare.sh's mkdir.
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins))
+
+    with get_executor() as ex:  # Initialize the executor only once.
+        for partition, m in manifests.items():
+            cuts_path = output_dir / f"cuts_{partition}.json.gz"
+            if cuts_path.is_file():
+                print(f"{partition} already exists - skipping.")
+                continue
+            print("Processing", partition)
+            cut_set = CutSet.from_manifests(
+                recordings=m["recordings"],
+                supervisions=m["supervisions"],
+            )
+            if "train" in partition:
+                # 3-fold speed perturbation of the training data.
+                cut_set = (
+                    cut_set
+                    + cut_set.perturb_speed(0.9)
+                    + cut_set.perturb_speed(1.1)
+                )
+            cut_set = cut_set.compute_and_store_features(
+                extractor=extractor,
+                storage_path=f"{output_dir}/feats_{partition}",
+                # when an executor is specified, make more partitions
+                num_jobs=num_jobs if ex is None else 80,
+                executor=ex,
+                storage_type=LilcomHdf5Writer,
+            )
+            cut_set.to_json(cuts_path)
+
+
+if __name__ == "__main__":
+    compute_fbank_librispeech()
--- a/egs/librispeech/ASR/local/compute_fbank_musan.py
+++ b/egs/librispeech/ASR/local/compute_fbank_musan.py
@ -0,0 +1,97 @@
+#!/usr/bin/env python3
+
+"""
+This file computes fbank features of the musan dataset.
+It looks for manifests in the directory data/manifests
+and saves the generated fbank features in data/fbank.
+"""
+
+import os
+import subprocess
+from contextlib import contextmanager
+from pathlib import Path
+
+from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer, combine
+from lhotse.recipes.utils import read_manifests_if_cached
+
+
+@contextmanager
+def get_executor():
+    """Yield the executor to use for feature extraction.
+
+    On the CLSP grid (hostname ending in ``.clsp.jhu.edu``) this tries to
+    start a cluster through ``plz`` and yields a ``distributed.Client``
+    connected to it. Everywhere else, or when the cluster cannot be set
+    up, it yields ``None`` so that ``compute_and_store_features`` creates
+    its own pool.
+
+    This has to be a context manager because several context managers are
+    entered inside, and this way all of them release their resources at
+    the right time.
+
+    Exceptions raised inside the caller's ``with`` body are propagated
+    unchanged; only failures while *setting up* the executor are treated
+    as best effort.
+    """
+    from contextlib import ExitStack
+
+    try:
+        name = subprocess.check_output("hostname -f", shell=True, text=True)
+    except (OSError, subprocess.SubprocessError):
+        # Cannot determine the host name: behave as on a regular machine.
+        name = ""
+
+    with ExitStack() as stack:
+        client = None
+        if name.strip().endswith(".clsp.jhu.edu"):
+            # If this is executed on the CLSP grid, we will try to use the
+            # Grid Engine to distribute the tasks.
+            # Other clusters can also benefit from that, provided a
+            # cluster-specific wrapper.
+            # (see https://github.com/pzelasko/plz for reference)
+            #
+            # The following must be installed:
+            # $ pip install dask distributed
+            # $ pip install git+https://github.com/pzelasko/plz
+            try:
+                import plz
+                from distributed import Client
+
+                cluster = stack.enter_context(plz.setup_cluster())
+                cluster.scale(80)
+                client = stack.enter_context(Client(cluster))
+            except Exception as exc:
+                # Best effort: any failure to reach the grid falls back to
+                # the local pool instead of aborting feature extraction.
+                print(
+                    f"Could not set up the distributed executor ({exc!r}); "
+                    "falling back to a local pool."
+                )
+                # Release whatever was already entered (e.g. the cluster).
+                stack.close()
+                client = None
+        # The yield is deliberately outside of any try/except: swallowing
+        # an exception thrown in here would make the generator yield twice.
+        yield client
+
+
+def compute_fbank_musan():
+    """Compute 80-bin fbank features for the musan dataset.
+
+    The recordings of the ``music``, ``speech`` and ``noise`` parts are read
+    from the manifests in ``data/manifests``, combined, cut into 10 second
+    windows, and windows of 5 seconds or less are dropped. Features are
+    stored under ``data/fbank/feats_musan`` and the cuts are written to
+    ``data/fbank/cuts_musan.json.gz``. Nothing is recomputed when that cuts
+    file already exists.
+    """
+    src_dir = Path("data/manifests")
+    output_dir = Path("data/fbank")
+    # os.cpu_count() returns None when the number of CPUs is undetermined.
+    num_jobs = min(15, os.cpu_count() or 1)
+    num_mel_bins = 80
+
+    dataset_parts = (
+        "music",
+        "speech",
+        "noise",
+    )
+    manifests = read_manifests_if_cached(
+        dataset_parts=dataset_parts, output_dir=src_dir
+    )
+    assert manifests is not None, (
+        f"Could not read the musan manifests from {src_dir}; "
+        "prepare the manifests first."
+    )
+
+    musan_cuts_path = output_dir / "cuts_musan.json.gz"
+
+    if musan_cuts_path.is_file():
+        print(f"{musan_cuts_path} already exists - skipping")
+        return
+
+    print("Extracting features for Musan")
+
+    # Make the script usable on its own, without prepare.sh's mkdir.
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins))
+
+    with get_executor() as ex:  # Initialize the executor only once.
+        # create chunks of Musan with duration 5 - 10 seconds
+        musan_cuts = (
+            CutSet.from_manifests(
+                recordings=combine(
+                    part["recordings"] for part in manifests.values()
+                )
+            )
+            .cut_into_windows(10.0)
+            .filter(lambda c: c.duration > 5)
+            .compute_and_store_features(
+                extractor=extractor,
+                storage_path=f"{output_dir}/feats_musan",
+                # when an executor is specified, make more partitions
+                num_jobs=num_jobs if ex is None else 80,
+                executor=ex,
+                storage_type=LilcomHdf5Writer,
+            )
+        )
+        musan_cuts.to_json(musan_cuts_path)
+
+
+if __name__ == "__main__":
+    compute_fbank_musan()
--- a/egs/librispeech/ASR/local/download_data.py
+++ b/egs/librispeech/ASR/local/download_data.py
@ -0,0 +1,21 @@
+#!/usr/bin/env python3
+
+"""
+This file downloads the librispeech dataset
+to the directory data/LibriSpeech.
+
+It's compatible with kaldi's egs/librispeech/s5/local/download_and_untar.sh .
+"""
+
+
+from lhotse.recipes import download_librispeech
+
+
+def download_data():
+    """Download the librispeech corpus into the ``data`` directory."""
+    download_librispeech(dataset_parts="librispeech", target_dir="data")
+
+
+if __name__ == "__main__":
+    download_data()
--- a/egs/librispeech/ASR/local/download_lm.py
+++ b/egs/librispeech/ASR/local/download_lm.py
@ -1,6 +1,9 @@
 #!/usr/bin/env python3

 # Copyright (c)  2021  Xiaomi Corporation (authors: Fangjun Kuang)
+"""
+This file downloads librispeech LM files to data/lm
+"""

 import gzip
 import os
@ -26,9 +29,7 @@ def download_lm():
        filename = target_dir / f
        if filename.is_file() is False:
            urlretrieve_progress(
-                f"{url}/{f}",
-                filename=filename,
-                desc=f"Downloading {filename}",
+                f"{url}/{f}", filename=filename, desc=f"Downloading {filename}",
            )

        if ".gz" in str(filename):
--- a/egs/librispeech/ASR/local/prepare_librispeech_manifest.py
+++ b/egs/librispeech/ASR/local/prepare_librispeech_manifest.py
@ -0,0 +1,29 @@
+#!/usr/bin/env python3
+
+"""
+This file generates manifests for the librispeech dataset.
+It expects the dataset is saved in data/LibriSpeech
+and the generated manifests are saved in data/manifests.
+"""
+
+import os
+from pathlib import Path
+
+from lhotse.recipes import prepare_librispeech
+
+
+def prepare_librispeech_mainfest():
+    """Generate the librispeech manifests.
+
+    The corpus is read from ``data/LibriSpeech`` and the manifests are
+    written to ``data/manifests``. The dataset parts are selected with
+    ``dataset_parts="auto"``.
+
+    NOTE: the function name keeps its original spelling ("mainfest") so
+    that existing callers continue to work.
+    """
+    corpus_dir = Path("data/LibriSpeech")
+    output_dir = Path("data/manifests")
+    # os.cpu_count() returns None when the number of CPUs is undetermined.
+    num_jobs = min(15, os.cpu_count() or 1)
+
+    # The returned manifests are not needed here; they are saved to
+    # output_dir by prepare_librispeech.
+    prepare_librispeech(
+        corpus_dir=corpus_dir,
+        dataset_parts="auto",
+        output_dir=output_dir,
+        num_jobs=num_jobs,
+    )
+
+
+if __name__ == "__main__":
+    prepare_librispeech_mainfest()
--- a/egs/librispeech/ASR/local/prepare_musan_manifest.py
+++ b/egs/librispeech/ASR/local/prepare_musan_manifest.py
@ -0,0 +1,22 @@
+#!/usr/bin/env python3
+
+"""
+This file generates manifests for the musan dataset.
+It expects the dataset is saved in data/musan
+and the generated manifests are saved in data/manifests.
+"""
+
+from pathlib import Path
+
+from lhotse.recipes import prepare_musan
+
+
+def prepare_musan_mainfest():
+    """Generate the musan manifests from data/musan into data/manifests."""
+    prepare_musan(
+        corpus_dir=Path("data/musan"),
+        output_dir=Path("data/manifests"),
+    )
+
+
+if __name__ == "__main__":
+    prepare_musan_mainfest()
--- a/egs/librispeech/ASR/prepare.sh
+++ b/egs/librispeech/ASR/prepare.sh
@ -1,6 +1,5 @@
 #!/usr/bin/env bash

-
 set -eou pipefail

 stage=-1
@ -19,8 +18,53 @@ fi
 if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
  echo "stage 0: Download data"

-  # If you have pre-downloaded it in /path/to/LibriSpeech
-  # Just run: ln -sfv /path/to/LibriSpeech data/
+  # If you have pre-downloaded it to /path/to/LibriSpeech,
+  # you can create a symlink to avoid downloading it again:
+  #
+  #   ln -sfv /path/to/LibriSpeech data/
+  #
+
  mkdir -p data/LibriSpeech
-  # TODO
+
+  if [ ! -f data/LibriSpeech/train-other-500/.completed ]; then
+    # It's compatible with kaldi's egs/librispeech/s5/local/download_and_untar.sh
+    ./local/download_data.py
+  fi
+
+  # If you have pre-downloaded it to /path/to/musan,
+  # you can create a symlink to avoid downloading it again:
+  #
+  #   ln -s /path/to/musan data/
+  #
+  if [ ! -e data/musan ]; then
+    wget https://www.openslr.org/resources/17/musan.tar.gz
+    # Unpack into data/ so that data/musan exists for the musan
+    # manifest stage; downloading alone leaves only the tarball behind.
+    tar -xzf musan.tar.gz -C data
+  fi
+fi
+
+if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
+  echo "Stage 1: Prepare librispeech manifest"
+  # We assume that you have downloaded the librispeech corpus
+  # to data/LibriSpeech
+  # The manifests are written to data/manifests.
+  mkdir -p data/manifests
+  ./local/prepare_librispeech_manifest.py
+fi
+
+if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
+  echo "Stage 2: Prepare musan manifest"
+  # We assume that you have downloaded the musan corpus
+  # to data/musan
+  # The manifests are written to data/manifests.
+  mkdir -p data/manifests
+  ./local/prepare_musan_manifest.py
+fi
+
+if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
+  echo "Stage 3: Compute fbank for librispeech"
+  # Reads the manifests from data/manifests (stage 1) and
+  # writes features and cuts to data/fbank.
+  mkdir -p data/fbank
+  ./local/compute_fbank_librispeech.py
+fi
+
+if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
+  echo "Stage 4: Compute fbank for musan"
+  # Reads the musan manifests from data/manifests (stage 2) and
+  # writes features and cuts to data/fbank.
+  mkdir -p data/fbank
+  ./local/compute_fbank_musan.py
 fi