diff --git a/egs/himia/wuw/local/compute_fbank_aishell.py b/egs/himia/wuw/local/compute_fbank_aishell.py
new file mode 120000
index 000000000..f66261581
--- /dev/null
+++ b/egs/himia/wuw/local/compute_fbank_aishell.py
@@ -0,0 +1 @@
+../../../aishell/ASR/local/compute_fbank_aishell.py
\ No newline at end of file
diff --git a/egs/himia/wuw/local/compute_fbank_himia.py b/egs/himia/wuw/local/compute_fbank_himia.py
new file mode 100755
index 000000000..f930a8c4e
--- /dev/null
+++ b/egs/himia/wuw/local/compute_fbank_himia.py
@@ -0,0 +1,139 @@
+#!/usr/bin/env python3
+# Copyright 2023 Xiaomi Corp. (authors: Liyong Guo)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+"""
+This file computes fbank features of the HI_MIA and HI_MIA_CW datasets.
+It looks for manifests in the directory data/manifests.
+
+The generated fbank features are saved in data/fbank.
+"""
+
+import argparse
+import logging
+import os
+from pathlib import Path
+
+import torch
+from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer
+from lhotse.recipes.utils import read_manifests_if_cached
+
+from icefall.utils import get_executor, str2bool
+
+# Torch's multithreaded behavior needs to be disabled, or
+# it wastes a lot of CPU and slows things down.
+# Do this outside of main() in case it needs to take effect
+# even when we are not invoking main() (e.g. when spawning subprocesses).
+torch.set_num_threads(1)
+torch.set_num_interop_threads(1)
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--train-set-channel",
+        type=str,
+        default="_7_01",
+        help="""Channel of the HI_MIA training set, e.g. _7_01.
+        All channels are used if it is set to "all".
+        """,
+    )
+
+    parser.add_argument(
+        "--enable-speed-perturb",
+        type=str2bool,
+        default=False,
+        help="""True to apply 0.9x and 1.1x speed perturbation
+        to the training set.
+        """,
+    )
+    return parser.parse_args()
+
+
+def compute_fbank_himia(
+    train_set_channel: str = "_7_01",
+    enable_speed_perturb: bool = False,
+):
+    src_dir = Path("data/manifests")
+    output_dir = Path("data/fbank")
+    num_jobs = min(40, os.cpu_count())
+    num_mel_bins = 80
+
+    if train_set_channel == "all":
+        dataset_parts = (
+            "train",
+            "dev",
+            "test",
+            "cw_test",
+        )
+    else:
+        dataset_parts = (
+            f"train{train_set_channel}",
+            f"dev{train_set_channel}",
+            f"test{train_set_channel}",
+            "cw_test",
+        )
+    manifests = read_manifests_if_cached(
+        dataset_parts=dataset_parts, prefix="himia", output_dir=src_dir
+    )
+    assert manifests is not None
+
+    extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins))
+
+    with get_executor() as ex:  # Initialize the executor only once.
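+        # For each partition: build a CutSet from the recording and
+        # supervision manifests, optionally add 0.9x/1.1x speed-perturbed
+        # copies of the training data, then compute and store
+        # lilcom-compressed fbank features under data/fbank.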
+        for partition, m in manifests.items():
+            if (output_dir / f"cuts_{partition}.jsonl.gz").is_file():
+                logging.info(f"{partition} already exists - skipping.")
+                continue
+            logging.info(f"Processing {partition}")
+            cut_set = CutSet.from_manifests(
+                recordings=m["recordings"],
+                supervisions=m["supervisions"],
+            )
+            if "train" in partition and enable_speed_perturb:
+                cut_set = (
+                    cut_set + cut_set.perturb_speed(0.9) + cut_set.perturb_speed(1.1)
+                )
+            cut_set = cut_set.resample(16000)
+            cut_set = cut_set.compute_and_store_features(
+                extractor=extractor,
+                storage_path=f"{output_dir}/feats_{partition}",
+                # when an executor is specified, make more partitions
+                num_jobs=num_jobs if ex is None else 80,
+                executor=ex,
+                storage_type=LilcomHdf5Writer,
+            )
+            # `partition` already carries the channel suffix (except for
+            # "cw_test"), so this name matches the skip-check above and
+            # the file names expected by prepare.sh.
+            cut_set.to_file(output_dir / f"cuts_{partition}.jsonl.gz")
+
+
+def main():
+    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
+
+    args = get_args()
+    logging.basicConfig(format=formatter, level=logging.INFO)
+
+    compute_fbank_himia(
+        train_set_channel=args.train_set_channel,
+        enable_speed_perturb=args.enable_speed_perturb,
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/egs/himia/wuw/local/compute_fbank_musan.py b/egs/himia/wuw/local/compute_fbank_musan.py
new file mode 120000
index 000000000..5833f2484
--- /dev/null
+++ b/egs/himia/wuw/local/compute_fbank_musan.py
@@ -0,0 +1 @@
+../../../librispeech/ASR/local/compute_fbank_musan.py
\ No newline at end of file
diff --git a/egs/himia/wuw/prepare.sh b/egs/himia/wuw/prepare.sh
new file mode 100755
index 000000000..bb4f0f36c
--- /dev/null
+++ b/egs/himia/wuw/prepare.sh
@@ -0,0 +1,189 @@
+#!/usr/bin/env bash
+
+set -eou pipefail
+
+stage=0
+stop_stage=6
+
+# The HI_MIA and aishell datasets are used in this experiment.
+# The musan dataset is used for data augmentation.
+#
+# For aishell dataset downloading and preparation,
+# refer to icefall/egs/aishell/ASR/prepare.sh.
+#
+# For the HI_MIA and HI_MIA_CW datasets,
+# we assume dl_dir (download dir) contains the following
+# directories and files. If not, they will be downloaded
+# by this script automatically.
+# These files will then be extracted to $dl_dir/HiMia/.
+#
+# - $dl_dir/train.tar.gz
+#      HiMia training set.
+#      From https://www.openslr.org/85
+#
+# - $dl_dir/dev.tar.gz
+#      HiMia development set.
+#      From https://www.openslr.org/85
+#
+# - $dl_dir/test_v2.tar.gz
+#      HiMia test set.
+#      From https://www.openslr.org/85
+#
+# - $dl_dir/data.tgz
+#      HiMia confusion words (HI_MIA_CW) test set.
+#      From https://www.openslr.org/120
+#
+# - $dl_dir/resource.tgz
+#      Transcripts of the HI_MIA_CW test set.
+#      From https://www.openslr.org/120
+
+dl_dir=$PWD/download
+train_set_channel=_7_01
+enable_speed_perturb=False
+
+. shared/parse_options.sh || exit 1
+
+# All files generated by this script are saved in "data".
+# You can safely remove "data" and rerun this script to regenerate it.
+mkdir -p data
+
+log() {
+  # This function is from espnet
+  local fname=${BASH_SOURCE[1]##*/}
+  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+
+log "dl_dir: $dl_dir"
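+
+# Each stage below is guarded by $stage/$stop_stage, so a subset of the
+# pipeline can be rerun on its own, e.g. (a usage sketch):
+#
+#   ./prepare.sh --stage 4 --stop-stage 5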
+
+if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
+  log "Stage 0: Download data"
+
+  # If you have pre-downloaded the HI_MIA and HI_MIA_CW datasets to
+  # /path/to/himia/, you can create a symlink
+  #
+  #   ln -sfv /path/to/himia $dl_dir/
+  #
+  if [ ! -f $dl_dir/train.tar.gz ]; then
+    lhotse download himia $dl_dir/
+  fi
+
+  # If you have pre-downloaded it to /path/to/musan,
+  # you can create a symlink
+  #
+  #   ln -sfv /path/to/musan $dl_dir/
+  #
+  if [ ! -d $dl_dir/musan ]; then
+    lhotse download musan $dl_dir
+  fi
+
+  # If you have pre-downloaded it to /path/to/aishell,
+  # you can create a symlink
+  #
+  #   ln -sfv /path/to/aishell $dl_dir/aishell
+  #
+  # The directory structure is
+  #   aishell/
+  #   |-- data_aishell
+  #   |   |-- transcript
+  #   |   `-- wav
+  #   `-- resource_aishell
+  #       |-- lexicon.txt
+  #       `-- speaker.info
+
+  if [ ! -d $dl_dir/aishell/data_aishell/wav/train ]; then
+    lhotse download aishell $dl_dir
+  fi
+fi
+
+if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
+  log "Stage 1: Prepare HI_MIA and HI_MIA_CW manifests"
+  mkdir -p data/manifests
+  if [ ! -e data/manifests/.himia.done ]; then
+    lhotse prepare himia $dl_dir/HiMia data/manifests
+    touch data/manifests/.himia.done
+  fi
+fi
+
+if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
+  log "Stage 2: Prepare musan manifest"
+  # We assume that you have downloaded the musan corpus
+  # to $dl_dir/musan
+  mkdir -p data/manifests
+  if [ ! -e data/manifests/.musan.done ]; then
+    lhotse prepare musan $dl_dir/musan data/manifests
+    touch data/manifests/.musan.done
+  fi
+fi
+
+if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
+  log "Stage 3: Prepare aishell manifest"
+  # We assume that you have downloaded the aishell corpus
+  # to $dl_dir/aishell
+  if [ ! -f data/manifests/.aishell_manifests.done ]; then
+    mkdir -p data/manifests
+    lhotse prepare aishell $dl_dir/aishell data/manifests
+    touch data/manifests/.aishell_manifests.done
+  fi
+fi
+
+if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
+  log "Stage 4: Compute fbank for aishell"
+  if [ ! -f data/fbank/.aishell.done ]; then
+    mkdir -p data/fbank
+    ./local/compute_fbank_aishell.py \
+      --enable-speed-perturb=${enable_speed_perturb}
+    touch data/fbank/.aishell.done
+  fi
+fi
+
+if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
+  log "Stage 5: Compute fbank for musan"
+  mkdir -p data/fbank
+  if [ ! -e data/fbank/.musan.done ]; then
+    ./local/compute_fbank_musan.py
+    touch data/fbank/.musan.done
+  fi
+fi
+
+if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
+  log "Stage 6: Compute fbank for the HI_MIA and HI_MIA_CW datasets"
+  # train_set_channel has the format "_<microphone position>_<channel>".
+  # Microphones 1 to 6 are arrays with 16 channels each.
+  # Microphone 7 only has a single channel.
+  # So valid values of train_set_channel are:
+  #   _1_01, ..., _1_16
+  #   _2_01, ..., _2_16
+  #   ...
+  #   _6_01, ..., _6_16
+  #   _7_01
+  for subset in train dev test; do
+    for file_type in recordings supervisions; do
+      src=data/manifests/himia_${file_type}_${subset}.jsonl.gz
+      dst=data/manifests/himia_${file_type}_${subset}${train_set_channel}.jsonl.gz
+      gunzip -c ${src} | \
+        grep "${train_set_channel}" | \
+        gzip -c > ${dst}
+    done
+  done
+
+  mkdir -p data/fbank
+  if [ ! -e data/fbank/.himia.done ]; then
+    ./local/compute_fbank_himia.py \
+      --train-set-channel=${train_set_channel} \
+      --enable-speed-perturb=${enable_speed_perturb}
+    touch data/fbank/.himia.done
+  fi
+
+  train_file=data/fbank/cuts_train_himia${train_set_channel}-aishell-shuf.jsonl.gz
+  if [ ! -f ${train_file} ]; then
+    # SingleCutSampler is preferred for this experiment,
+    # so the training cuts are pre-shuffled on disk here.
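+    # The combined file concatenates the aishell training cuts
+    # (non-wake-word speech) with the HiMia wake-word cuts, drops any
+    # speed-perturbed cuts (IDs containing "_sp"), and shuffles the rest.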
+    cat <(gunzip -c data/fbank/aishell_cuts_train.jsonl.gz) \
+      <(gunzip -c data/fbank/cuts_train${train_set_channel}.jsonl.gz) | \
+      grep -v _sp | \
+      shuf | gzip -c > ${train_file}
+  fi
+
+fi
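+
+# Example full run (a usage sketch; the flags are mapped to the variables
+# defined above by shared/parse_options.sh):
+#
+#   ./prepare.sh --stage 0 --stop-stage 6 \
+#     --train-set-channel _1_01 --enable-speed-perturb True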