From 4e05213f87e60b26314045588f6e0344704605af Mon Sep 17 00:00:00 2001
From: Fangjun Kuang
Date: Sat, 16 Apr 2022 12:51:13 +0800
Subject: [PATCH] Feature extraction code for GigaSpeech.

---
 egs/librispeech/ASR/.gitignore                |   1 +
 .../compute_fbank_gigaspeech_dev_test.py      |  92 ++++++++++
 .../local/compute_fbank_gigaspeech_splits.py  | 168 ++++++++++++++++++
 .../ASR/local/preprocess_gigaspeech.py        |   5 -
 egs/librispeech/ASR/prepare_giga_speech.sh    |  40 +++++
 5 files changed, 301 insertions(+), 5 deletions(-)
 create mode 100644 egs/librispeech/ASR/.gitignore
 create mode 100644 egs/librispeech/ASR/local/compute_fbank_gigaspeech_dev_test.py
 create mode 100644 egs/librispeech/ASR/local/compute_fbank_gigaspeech_splits.py

diff --git a/egs/librispeech/ASR/.gitignore b/egs/librispeech/ASR/.gitignore
new file mode 100644
index 000000000..5592679cc
--- /dev/null
+++ b/egs/librispeech/ASR/.gitignore
@@ -0,0 +1 @@
+log-*
diff --git a/egs/librispeech/ASR/local/compute_fbank_gigaspeech_dev_test.py b/egs/librispeech/ASR/local/compute_fbank_gigaspeech_dev_test.py
new file mode 100644
index 000000000..9f1039893
--- /dev/null
+++ b/egs/librispeech/ASR/local/compute_fbank_gigaspeech_dev_test.py
@@ -0,0 +1,92 @@
+#!/usr/bin/env python3
+# Copyright 2021 Johns Hopkins University (Piotr Żelasko)
+# Copyright 2021 Xiaomi Corp. (Fangjun Kuang)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+from pathlib import Path
+
+import torch
+from lhotse import (
+    CutSet,
+    KaldifeatFbank,
+    KaldifeatFbankConfig,
+)
+
+# Torch's multithreaded behavior needs to be disabled, or
+# it wastes a lot of CPU and slows things down.
+# Do this outside of main() in case it needs to take effect
+# even when we are not invoking main() directly (e.g. when spawning subprocesses).
+torch.set_num_threads(1)
+torch.set_num_interop_threads(1)
+
+
+def compute_fbank_gigaspeech_dev_test():
+    in_out_dir = Path("data/fbank")
+    # number of workers in dataloader
+    num_workers = 20
+
+    # maximum number of audio seconds in a batch
+    batch_duration = 600
+
+    subsets = ("DEV", "TEST")
+
+    device = torch.device("cpu")
+    if torch.cuda.is_available():
+        device = torch.device("cuda", 0)
+    extractor = KaldifeatFbank(KaldifeatFbankConfig(device=device))
+
+    logging.info(f"device: {device}")
+
+    for partition in subsets:
+        cuts_path = in_out_dir / f"cuts_{partition}.jsonl.gz"
+        if cuts_path.is_file():
+            logging.info(f"{cuts_path} exists - skipping")
+            continue
+
+        raw_cuts_path = in_out_dir / f"cuts_{partition}_raw.jsonl.gz"
+
+        logging.info(f"Loading {raw_cuts_path}")
+        cut_set = CutSet.from_file(raw_cuts_path)
+
+        logging.info("Computing features")
+
+        cut_set = cut_set.compute_and_store_features_batch(
+            extractor=extractor,
+            storage_path=f"{in_out_dir}/feats_{partition}",
+            num_workers=num_workers,
+            batch_duration=batch_duration,
+        )
+        cut_set = cut_set.trim_to_supervisions(
+            keep_overlapping=False, min_duration=None
+        )
+
+        logging.info(f"Saving to {cuts_path}")
+        cut_set.to_file(cuts_path)
+        logging.info(f"Saved to {cuts_path}")
+
+
+def main():
+    formatter = (
+        "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
+    )
+    logging.basicConfig(format=formatter, level=logging.INFO)
+
+    compute_fbank_gigaspeech_dev_test()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/egs/librispeech/ASR/local/compute_fbank_gigaspeech_splits.py b/egs/librispeech/ASR/local/compute_fbank_gigaspeech_splits.py
new file mode 100644
index 000000000..13fd9d963
--- /dev/null
+++ b/egs/librispeech/ASR/local/compute_fbank_gigaspeech_splits.py
@@ -0,0 +1,168 @@
+#!/usr/bin/env python3
+# Copyright 2021 Johns Hopkins University (Piotr Żelasko)
+# Copyright 2021 Xiaomi Corp. (Fangjun Kuang)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import logging
+from datetime import datetime
+from pathlib import Path
+
+import torch
+from lhotse import (
+    CutSet,
+    KaldifeatFbank,
+    KaldifeatFbankConfig,
+)
+
+# Torch's multithreaded behavior needs to be disabled, or
+# it wastes a lot of CPU and slows things down.
+# Do this outside of main() in case it needs to take effect
+# even when we are not invoking main() directly (e.g. when spawning subprocesses).
+torch.set_num_threads(1)
+torch.set_num_interop_threads(1)
+
+
+def get_parser():
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+
+    parser.add_argument(
+        "--num-workers",
+        type=int,
+        default=20,
+        help="Number of dataloading workers used for reading the audio.",
+    )
+    parser.add_argument(
+        "--batch-duration",
+        type=float,
+        default=600.0,
+        help="The maximum number of audio seconds in a batch. "
+ "Determines batch size dynamically.", + ) + + parser.add_argument( + "--num-splits", + type=int, + required=True, + help="The number of splits of the XL subset", + ) + + parser.add_argument( + "--start", + type=int, + default=0, + help="Process pieces starting from this number (inclusive).", + ) + + parser.add_argument( + "--stop", + type=int, + default=-1, + help="Stop processing pieces until this number (exclusive).", + ) + return parser + + +def compute_fbank_gigaspeech_splits(args): + num_splits = args.num_splits + output_dir = f"data/fbank/XL_split_{num_splits}" + output_dir = Path(output_dir) + assert output_dir.exists(), f"{output_dir} does not exist!" + + num_digits = len(str(num_splits)) + + start = args.start + stop = args.stop + if stop < start: + stop = num_splits + + stop = min(stop, num_splits) + + device = torch.device("cpu") + if torch.cuda.is_available(): + device = torch.device("cuda", 0) + extractor = KaldifeatFbank(KaldifeatFbankConfig(device=device)) + logging.info(f"device: {device}") + + for i in range(start, stop): + idx = i + logging.info(f"Processing {idx}/{num_splits}") + + cuts_path = output_dir / f"cuts_XL.{idx}.jsonl.gz" + if cuts_path.is_file(): + logging.info(f"{cuts_path} exists - skipping") + continue + + raw_cuts_path = output_dir / f"cuts_XL_raw.{idx}.jsonl.gz" + if not raw_cuts_path.is_file(): + logging.info(f"{raw_cuts_path} does not exist - skipping it") + continue + + logging.info(f"Loading {raw_cuts_path}") + cut_set = CutSet.from_file(raw_cuts_path) + + logging.info("Computing features") + + cut_set = cut_set.compute_and_store_features_batch( + extractor=extractor, + storage_path=f"{output_dir}/feats_XL_{idx}", + num_workers=args.num_workers, + batch_duration=args.batch_duration, + ) + + logging.info("About to split cuts into smaller chunks.") + cut_set = cut_set.trim_to_supervisions( + keep_overlapping=False, min_duration=None + ) + + logging.info(f"Saving to {cuts_path}") + cut_set.to_file(cuts_path) + logging.info(f"Saved to {cuts_path}") + + +def main(): + now = datetime.now() + date_time = now.strftime("%Y-%m-%d-%H-%M-%S") + + log_filename = "log-compute_fbank_gigaspeech_splits" + formatter = ( + "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s" + ) + log_filename = f"{log_filename}-{date_time}" + + logging.basicConfig( + filename=log_filename, + format=formatter, + level=logging.INFO, + filemode="w", + ) + + console = logging.StreamHandler() + console.setLevel(logging.INFO) + console.setFormatter(logging.Formatter(formatter)) + logging.getLogger("").addHandler(console) + + parser = get_parser() + args = parser.parse_args() + logging.info(vars(args)) + + compute_fbank_gigaspeech_splits(args) + + +if __name__ == "__main__": + main() diff --git a/egs/librispeech/ASR/local/preprocess_gigaspeech.py b/egs/librispeech/ASR/local/preprocess_gigaspeech.py index 4168a7185..01229d85a 100644 --- a/egs/librispeech/ASR/local/preprocess_gigaspeech.py +++ b/egs/librispeech/ASR/local/preprocess_gigaspeech.py @@ -101,11 +101,6 @@ def preprocess_giga_speech(): + cut_set.perturb_speed(0.9) + cut_set.perturb_speed(1.1) ) - - logging.info("About to split cuts into smaller chunks.") - cut_set = cut_set.trim_to_supervisions( - keep_overlapping=False, min_duration=None - ) logging.info(f"Saving to {raw_cuts_path}") cut_set.to_file(raw_cuts_path) diff --git a/egs/librispeech/ASR/prepare_giga_speech.sh b/egs/librispeech/ASR/prepare_giga_speech.sh index 49124c4d7..8eec4ac3e 100755 --- a/egs/librispeech/ASR/prepare_giga_speech.sh +++ 
@@ -24,6 +24,15 @@ stop_stage=100
 #   DEV  12 hours
 #   Test 40 hours
 
+# Split the XL subset into this number of pieces.
+# This is to avoid OOM during feature extraction.
+num_splits=2000
+# We use lazy split from lhotse.
+# The XL subset contains 113916 cuts after speed perturbation with factors
+# 0.9 and 1.1. Splitting it into pieces of about 113916 / 2000 = 57 cuts
+# each yields ceil(113916 / 57) = 1999 pieces in total.
+chunk_size=57  # number of cuts in each piece. The last piece may contain fewer cuts.
+
 dl_dir=$PWD/download
 
 . shared/parse_options.sh || exit 1
@@ -107,3 +116,35 @@ if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
     touch data/fbank/.preprocess_complete
   fi
 fi
+
+if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
+  log "Stage 3: Compute features for the DEV and TEST subsets of GigaSpeech (may take 2 minutes)"
+  python3 ./local/compute_fbank_gigaspeech_dev_test.py
+fi
+
+if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
+  log "Stage 4: Split the XL subset into ${num_splits} pieces"
+  split_dir=data/fbank/XL_split_${num_splits}
+  if [ ! -f $split_dir/.split_completed ]; then
+    lhotse split-lazy ./data/fbank/cuts_XL_raw.jsonl.gz $split_dir $chunk_size
+    touch $split_dir/.split_completed
+  fi
+fi
+
+if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
+  log "Stage 5: Compute features for the XL subset"
+  # Note: This script supports --start and --stop options,
+  # so you can use several machines to compute the features in parallel.
+  python3 ./local/compute_fbank_gigaspeech_splits.py \
+    --num-workers $nj \
+    --batch-duration 600 \
+    --num-splits $num_splits
+fi
+
+if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
+  log "Stage 6: Combine features for the XL subset"
+  if [ ! -f data/fbank/cuts_XL.jsonl.gz ]; then
+    pieces=$(find data/fbank/XL_split_${num_splits} -name "cuts_XL.*.jsonl.gz")
+    lhotse combine $pieces data/fbank/cuts_XL.jsonl.gz
+  fi
+fi
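
A note on parallelizing Stage 5: compute_fbank_gigaspeech_splits.py accepts --start (inclusive) and --stop (exclusive), and a --stop smaller than --start is replaced by --num-splits inside the script. Below is a minimal two-machine schedule as a sketch; the boundary 1000 and the worker count 20 are illustrative choices, not values taken from the patch:

    # Machine 1: processes pieces [0, 1000).
    python3 ./local/compute_fbank_gigaspeech_splits.py \
      --num-workers 20 \
      --batch-duration 600 \
      --num-splits 2000 \
      --start 0 \
      --stop 1000

    # Machine 2: processes pieces [1000, 2000).
    # --stop is omitted; its default (-1) is below --start,
    # so the script sets it to --num-splits.
    python3 ./local/compute_fbank_gigaspeech_splits.py \
      --num-workers 20 \
      --batch-duration 600 \
      --num-splits 2000 \
      --start 1000

Since each piece is saved to its own cuts_XL.{idx}.jsonl.gz and existing outputs are skipped, an interrupted run can be restarted with the same arguments; Stage 6 then combines whatever pieces exist once all machines have finished.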