data preparation

Author: glynpu
Date: 2023-03-16 12:28:59 +08:00
Parent: 25873de7b6
Commit: b55ae4fd53

4 changed files with 330 additions and 0 deletions

egs/himia/wuw/local/compute_fbank_aishell.py (new symlink)
@@ -0,0 +1 @@
../../../aishell/ASR/local/compute_fbank_aishell.py
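
This file is a symlink into the aishell recipe. Assuming the repository
layout implied by the link target, it could be recreated from the
repository root with:

  ln -sfv ../../../aishell/ASR/local/compute_fbank_aishell.py \
    egs/himia/wuw/local/compute_fbank_aishell.py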

egs/himia/wuw/local/compute_fbank_himia.py (new file, 139 lines)
@@ -0,0 +1,139 @@
#!/usr/bin/env python3
# Copyright 2023 Xiaomi Corp. (authors: Liyong Guo)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This file computes fbank features of the HI_MIA and HI_MIA_CW datasets.
It looks for manifests in the directory data/manifests.
The generated fbank features are saved in data/fbank.
"""
import argparse
import logging
import os
from pathlib import Path

import torch
from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer
from lhotse.recipes.utils import read_manifests_if_cached

from icefall.utils import get_executor, str2bool

# Torch's multithreaded behavior needs to be disabled, or
# it wastes a lot of CPU and slows things down.
# Do this outside of main() in case it needs to take effect
# even when we are not invoking the main (e.g. when spawning subprocesses).
torch.set_num_threads(1)
torch.set_num_interop_threads(1)


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--train-set-channel",
        type=str,
        default="_7_01",
        help="""Channel of the HI_MIA training set, e.g. "_7_01".
        All channels are used if it is set to "all".
        """,
    )
    parser.add_argument(
        "--enable-speed-perturb",
        type=str2bool,
        default=False,
        help="""True to apply speed perturbation to the training set.
        """,
    )
    return parser.parse_args()


def compute_fbank_himia(
    train_set_channel: str = "_7_01",
    enable_speed_perturb: bool = False,
):
    src_dir = Path("data/manifests")
    output_dir = Path("data/fbank")
    num_jobs = min(40, os.cpu_count())
    num_mel_bins = 80

    if train_set_channel == "all":
        dataset_parts = (
            "train",
            "dev",
            "test",
            "cw_test",
        )
    else:
        dataset_parts = (
            f"train{train_set_channel}",
            f"dev{train_set_channel}",
            f"test{train_set_channel}",
            "cw_test",
        )

    manifests = read_manifests_if_cached(
        dataset_parts=dataset_parts, prefix="himia", output_dir=src_dir
    )
    assert manifests is not None

    extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins))

    with get_executor() as ex:  # Initialize the executor only once.
        for partition, m in manifests.items():
            if (output_dir / f"cuts_{partition}.jsonl.gz").is_file():
                logging.info(f"{partition} already exists - skipping.")
                continue
            logging.info(f"Processing {partition}")
            cut_set = CutSet.from_manifests(
                recordings=m["recordings"],
                supervisions=m["supervisions"],
            )
            if "train" in partition and enable_speed_perturb:
                cut_set = (
                    cut_set + cut_set.perturb_speed(0.9) + cut_set.perturb_speed(1.1)
                )
            cut_set = cut_set.resample(16000)
            cut_set = cut_set.compute_and_store_features(
                extractor=extractor,
                storage_path=f"{output_dir}/feats_{partition}",
                # when an executor is specified, make more partitions
                num_jobs=num_jobs if ex is None else 80,
                executor=ex,
                storage_type=LilcomHdf5Writer,
            )
            # `partition` already carries the channel suffix (e.g. "train_7_01")
            # unless --train-set-channel=all, so it is used as-is here.
            cut_set.to_file(output_dir / f"cuts_{partition}.jsonl.gz")


def main():
    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    args = get_args()
    logging.basicConfig(format=formatter, level=logging.INFO)

    compute_fbank_himia(
        train_set_channel=args.train_set_channel,
        enable_speed_perturb=args.enable_speed_perturb,
    )


if __name__ == "__main__":
    main()
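
# For reference, prepare.sh (stage 6 below) invokes this script as follows;
# the values shown are that script's defaults:
#
#   ./local/compute_fbank_himia.py \
#     --train-set-channel=_7_01 \
#     --enable-speed-perturb=False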

egs/himia/wuw/local/compute_fbank_musan.py (new symlink)
@@ -0,0 +1 @@
../../../librispeech/ASR/local/compute_fbank_musan.py

egs/himia/wuw/prepare.sh (new executable file, 189 lines)
@@ -0,0 +1,189 @@
#!/usr/bin/env bash
set -eou pipefail

stage=0
stop_stage=6

# The HI_MIA and aishell datasets are used in this experiment.
# The musan dataset is used for data augmentation.
#
# For aishell dataset downloading and preparation,
# refer to icefall/egs/aishell/ASR/prepare.sh.
#
# For the HI_MIA and HI_MIA_CW datasets,
# we assume dl_dir (download dir) contains the following
# directories and files. If not, they will be downloaded
# by this script automatically.
# Then these files will be extracted to $dl_dir/HiMia/
#
#  - $dl_dir/train.tar.gz
#      HiMia training dataset.
#      From https://www.openslr.org/85
#
#  - $dl_dir/dev.tar.gz
#      HiMia development dataset.
#      From https://www.openslr.org/85
#
#  - $dl_dir/test_v2.tar.gz
#      HiMia test dataset.
#      From https://www.openslr.org/85
#
#  - $dl_dir/data.tgz
#      HiMia confusion words (HI_MIA_CW) test dataset.
#      From https://www.openslr.org/120
#
#  - $dl_dir/resource.tgz
#      Transcripts of the HI_MIA_CW test dataset.
#      From https://www.openslr.org/120
dl_dir=$PWD/download
train_set_channel=_7_01
enable_speed_perturb=False

. shared/parse_options.sh || exit 1

# All files generated by this script are saved in "data".
# You can safely remove "data" and rerun this script to regenerate it.
mkdir -p data
log() {
  # This function is from espnet
  local fname=${BASH_SOURCE[1]##*/}
  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}
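
# For illustration, a top-level call such as `log "Stage 0: Download data"`
# prints a line of this form (timestamp and line number are hypothetical):
#
#   2023-03-16 12:28:59 (prepare.sh:193:main) Stage 0: Download data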
log "dl_dir: $dl_dir"

if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
  log "Stage 0: Download data"

  # If you have pre-downloaded the HI_MIA and HI_MIA_CW datasets to /path/to/himia,
  # you can create a symlink
  #
  #   ln -sfv /path/to/himia $dl_dir/
  #
  if [ ! -f $dl_dir/train.tar.gz ]; then
    lhotse download himia $dl_dir/
  fi

  # If you have pre-downloaded it to /path/to/musan,
  # you can create a symlink
  #
  #   ln -sfv /path/to/musan $dl_dir/
  #
  if [ ! -d $dl_dir/musan ]; then
    lhotse download musan $dl_dir
  fi

  # If you have pre-downloaded it to /path/to/aishell,
  # you can create a symlink
  #
  #   ln -sfv /path/to/aishell $dl_dir/aishell
  #
  # The directory structure is
  #   aishell/
  #   |-- data_aishell
  #   |   |-- transcript
  #   |   `-- wav
  #   `-- resource_aishell
  #       |-- lexicon.txt
  #       `-- speaker.info
  if [ ! -d $dl_dir/aishell/data_aishell/wav/train ]; then
    lhotse download aishell $dl_dir
  fi
fi

if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
  log "Stage 1: Prepare HI_MIA and HI_MIA_CW manifests"
  mkdir -p data/manifests
  if [ ! -e data/manifests/.himia.done ]; then
    lhotse prepare himia $dl_dir/HiMia data/manifests
    touch data/manifests/.himia.done
  fi
fi
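
# After stage 1, data/manifests contains manifests such as
# himia_recordings_train.jsonl.gz and himia_supervisions_train.jsonl.gz,
# which stage 6 below filters by recording channel.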

if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
  log "Stage 2: Prepare musan manifest"
  # We assume that you have downloaded the musan corpus
  # to $dl_dir/musan
  mkdir -p data/manifests
  if [ ! -e data/manifests/.musan.done ]; then
    lhotse prepare musan $dl_dir/musan data/manifests
    touch data/manifests/.musan.done
  fi
fi

if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
  log "Stage 3: Prepare aishell manifest"
  # We assume that you have downloaded the aishell corpus
  # to $dl_dir/aishell
  if [ ! -f data/manifests/.aishell_manifests.done ]; then
    mkdir -p data/manifests
    lhotse prepare aishell $dl_dir/aishell data/manifests
    touch data/manifests/.aishell_manifests.done
  fi
fi

if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
  log "Stage 4: Compute fbank for aishell"
  if [ ! -f data/fbank/.aishell.done ]; then
    mkdir -p data/fbank
    ./local/compute_fbank_aishell.py \
      --enable-speed-perturb=${enable_speed_perturb}
    touch data/fbank/.aishell.done
  fi
fi

if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
  log "Stage 5: Compute fbank for musan"
  mkdir -p data/fbank
  if [ ! -e data/fbank/.musan.done ]; then
    ./local/compute_fbank_musan.py
    touch data/fbank/.musan.done
  fi
fi

if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
  log "Stage 6: Compute fbank for HI_MIA and HI_MIA_CW datasets"
  # The format of train_set_channel is _<microphone position>_<channel>.
  # Microphones 1 to 6 are arrays with 16 channels each.
  # Microphone 7 only has a single channel.
  # So valid examples of train_set_channel are:
  #   _1_01, ..., _1_16
  #   _2_01, ..., _2_16
  #   ...
  #   _6_01, ..., _6_16
  #   _7_01
  for subset in train dev test; do
    for file_type in recordings supervisions; do
      src=data/manifests/himia_${file_type}_${subset}.jsonl.gz
      dst=data/manifests/himia_${file_type}_${subset}${train_set_channel}.jsonl.gz
      # Keep only the entries recorded by the selected channel.
      gunzip -c ${src} | \
        grep ${train_set_channel} | \
        gzip -c > ${dst}
    done
  done

  mkdir -p data/fbank
  if [ ! -e data/fbank/.himia.done ]; then
    ./local/compute_fbank_himia.py \
      --train-set-channel=${train_set_channel} \
      --enable-speed-perturb=${enable_speed_perturb}
    touch data/fbank/.himia.done
  fi

  train_file=data/fbank/cuts_train_himia${train_set_channel}-aishell-shuf.jsonl.gz
  if [ ! -f ${train_file} ]; then
    # SingleCutSampler is preferred for this experiment,
    # so shuffle the combined training cuts here.
    # `grep -v _sp` excludes speed-perturbed cuts.
    cat <(gunzip -c data/fbank/aishell_cuts_train.jsonl.gz) \
        <(gunzip -c data/fbank/cuts_train${train_set_channel}.jsonl.gz) | \
      grep -v _sp | \
      shuf | gzip -c > ${train_file}
  fi
fi
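
# A quick sanity check on the final shuffled training manifest
# (the file name assumes the default _7_01 channel):
#
#   gunzip -c data/fbank/cuts_train_himia_7_01-aishell-shuf.jsonl.gz | head -n 2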