From 317f5ec64eb3c26902aab9cecaea8d3718c61e75 Mon Sep 17 00:00:00 2001
From: Fangjun Kuang
Date: Sun, 28 Nov 2021 13:24:05 +0800
Subject: [PATCH] Compute features for GigaSpeech by splitting the manifest.

---
 egs/gigaspeech/ASR/.gitignore                 |   1 +
 .../compute_fbank_gigaspeech_dev_test.py      |  90 +++++++++++
 .../local/compute_fbank_gigaspeech_splits.py  | 146 ++++++++++++++++++
 .../ASR/local/preprocess_gigaspeech.py        | 113 ++++++++++++++
 egs/gigaspeech/ASR/prepare.sh                 |  80 ++++++----
 5 files changed, 403 insertions(+), 27 deletions(-)
 create mode 100644 egs/gigaspeech/ASR/.gitignore
 create mode 100755 egs/gigaspeech/ASR/local/compute_fbank_gigaspeech_dev_test.py
 create mode 100755 egs/gigaspeech/ASR/local/compute_fbank_gigaspeech_splits.py
 create mode 100755 egs/gigaspeech/ASR/local/preprocess_gigaspeech.py

diff --git a/egs/gigaspeech/ASR/.gitignore b/egs/gigaspeech/ASR/.gitignore
new file mode 100644
index 000000000..5592679cc
--- /dev/null
+++ b/egs/gigaspeech/ASR/.gitignore
@@ -0,0 +1 @@
+log-*
diff --git a/egs/gigaspeech/ASR/local/compute_fbank_gigaspeech_dev_test.py b/egs/gigaspeech/ASR/local/compute_fbank_gigaspeech_dev_test.py
new file mode 100755
index 000000000..59f60939b
--- /dev/null
+++ b/egs/gigaspeech/ASR/local/compute_fbank_gigaspeech_dev_test.py
@@ -0,0 +1,90 @@
+#!/usr/bin/env python3
+# Copyright 2021 Johns Hopkins University (Piotr Żelasko)
+# Copyright 2021 Xiaomi Corp. (Fangjun Kuang)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+from pathlib import Path
+
+import torch
+from lhotse import (
+    CutSet,
+    KaldifeatFbank,
+    KaldifeatFbankConfig,
+    LilcomHdf5Writer,
+)
+
+# Torch's multithreaded behavior needs to be disabled or
+# it wastes a lot of CPU and slows things down.
+# Do this outside of main() in case it needs to take effect
+# even when we are not invoking the main (e.g. when spawning subprocesses).
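+#
+# Note: KaldifeatFbank below is lhotse's wrapper around the kaldifeat
+# package; if kaldifeat is not installed yet, it can be installed with:
+#   pip install kaldifeat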
+torch.set_num_threads(1)
+torch.set_num_interop_threads(1)
+
+
+def compute_fbank_gigaspeech_dev_test():
+    in_out_dir = Path("data/fbank")
+    # number of workers in dataloader
+    num_workers = 20
+
+    # number of seconds in a batch
+    batch_duration = 600
+
+    subsets = ("DEV", "TEST")
+
+    device = torch.device("cpu")
+    if torch.cuda.is_available():
+        device = torch.device("cuda", 0)
+    extractor = KaldifeatFbank(KaldifeatFbankConfig(device=device))
+
+    logging.info(f"device: {device}")
+
+    for partition in subsets:
+        cuts_path = in_out_dir / f"cuts_{partition}.jsonl.gz"
+        if cuts_path.is_file():
+            logging.info(f"{cuts_path} exists - skipping")
+            continue
+
+        raw_cuts_path = in_out_dir / f"cuts_{partition}_raw.jsonl.gz"
+
+        logging.info(f"Loading {raw_cuts_path}")
+        cut_set = CutSet.from_file(raw_cuts_path)
+
+        logging.info("Computing features")
+
+        cut_set = cut_set.compute_and_store_features_batch(
+            extractor=extractor,
+            storage_path=f"{in_out_dir}/feats_{partition}",
+            num_workers=num_workers,
+            batch_duration=batch_duration,
+            storage_type=LilcomHdf5Writer,
+        )
+
+        logging.info(f"Saving to {cuts_path}")
+        cut_set.to_file(cuts_path)
+
+
+def main():
+    formatter = (
+        "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
+    )
+    logging.basicConfig(format=formatter, level=logging.INFO)
+
+    compute_fbank_gigaspeech_dev_test()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/egs/gigaspeech/ASR/local/compute_fbank_gigaspeech_splits.py b/egs/gigaspeech/ASR/local/compute_fbank_gigaspeech_splits.py
new file mode 100755
index 000000000..acbb418ad
--- /dev/null
+++ b/egs/gigaspeech/ASR/local/compute_fbank_gigaspeech_splits.py
@@ -0,0 +1,146 @@
+#!/usr/bin/env python3
+# Copyright 2021 Johns Hopkins University (Piotr Żelasko)
+# Copyright 2021 Xiaomi Corp. (Fangjun Kuang)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import logging
+from datetime import datetime
+from pathlib import Path
+
+import torch
+from lhotse import (
+    CutSet,
+    KaldifeatFbank,
+    KaldifeatFbankConfig,
+    LilcomHdf5Writer,
+)
+
+# Torch's multithreaded behavior needs to be disabled or
+# it wastes a lot of CPU and slows things down.
+# Do this outside of main() in case it needs to take effect
+# even when we are not invoking the main (e.g. when spawning subprocesses).
+torch.set_num_threads(1)
+torch.set_num_interop_threads(1)
+
+
+def get_parser():
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+
+    parser.add_argument(
+        "--num-workers",
+        type=int,
+        default=20,
+        help="Number of dataloading workers used for reading the audio.",
+    )
+    parser.add_argument(
+        "--batch-duration",
+        type=float,
+        default=600.0,
+        help="The maximum number of audio seconds in a batch. "
+        "Determines batch size dynamically.",
+    )
+
+    parser.add_argument(
+        "--num-splits",
+        type=int,
+        required=True,
+        help="The number of splits of the XL subset",
+    )
+    return parser
+
+
+def compute_fbank_gigaspeech_splits(args):
+    num_splits = args.num_splits
+    output_dir = f"data/fbank/XL_split_{num_splits}"
+    output_dir = Path(output_dir)
+    assert output_dir.exists(), f"{output_dir} does not exist!"
+
+    num_digits = len(str(num_splits))
+
+    device = torch.device("cpu")
+    if torch.cuda.is_available():
+        device = torch.device("cuda", 0)
+    extractor = KaldifeatFbank(KaldifeatFbankConfig(device=device))
+    logging.info(f"device: {device}")
+
+    for i in range(num_splits):
+        idx = f"{i + 1}".zfill(num_digits)
+        logging.info(f"Processing {idx}/{num_splits}")
+
+        cuts_path = output_dir / f"cuts_XL.{idx}.jsonl.gz"
+        if cuts_path.is_file():
+            logging.info(f"{cuts_path} exists - skipping")
+            continue
+
+        raw_cuts_path = output_dir / f"cuts_XL_raw.{idx}.jsonl.gz"
+
+        logging.info(f"Loading {raw_cuts_path}")
+        cut_set = CutSet.from_file(raw_cuts_path)
+
+        logging.info("Computing features")
+
+        cut_set = cut_set.compute_and_store_features_batch(
+            extractor=extractor,
+            storage_path=f"{output_dir}/feats_XL_{idx}",
+            num_workers=args.num_workers,
+            batch_duration=args.batch_duration,
+            storage_type=LilcomHdf5Writer,
+        )
+
+        logging.info("About to split cuts into smaller chunks.")
+        cut_set = cut_set.trim_to_supervisions(
+            keep_overlapping=False, min_duration=None
+        )
+
+        logging.info(f"Saving to {cuts_path}")
+        cut_set.to_file(cuts_path)
+        logging.info(f"Saved to {cuts_path}")
+
+
+def main():
+    now = datetime.now()
+    date_time = now.strftime("%Y-%m-%d-%H-%M-%S")
+
+    log_filename = "log-compute_fbank_gigaspeech_splits"
+    formatter = (
+        "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
+    )
+    log_filename = f"{log_filename}-{date_time}"
+
+    logging.basicConfig(
+        filename=log_filename,
+        format=formatter,
+        level=logging.INFO,
+        filemode="w",
+    )
+
+    console = logging.StreamHandler()
+    console.setLevel(logging.INFO)
+    console.setFormatter(logging.Formatter(formatter))
+    logging.getLogger("").addHandler(console)
+
+    parser = get_parser()
+    args = parser.parse_args()
+    logging.info(vars(args))
+
+    compute_fbank_gigaspeech_splits(args)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/egs/gigaspeech/ASR/local/preprocess_gigaspeech.py b/egs/gigaspeech/ASR/local/preprocess_gigaspeech.py
new file mode 100755
index 000000000..0cec82ad5
--- /dev/null
+++ b/egs/gigaspeech/ASR/local/preprocess_gigaspeech.py
@@ -0,0 +1,113 @@
+#!/usr/bin/env python3
+# Copyright 2021 Johns Hopkins University (Piotr Żelasko)
+# Copyright 2021 Xiaomi Corp. (Fangjun Kuang)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
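+
+# This script loads the GigaSpeech DEV/TEST/XL manifests, drops supervisions
+# whose text contains garbage utterance tags (<SIL>, <MUSIC>, <NOISE>, <OTHER>),
+# strips punctuation tags from the remaining transcripts, applies speed
+# perturbation with factors 0.9 and 1.1 to the XL subset, and writes the
+# resulting cut manifests to data/fbank/cuts_{DEV,TEST,XL}_raw.jsonl.gz.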
+
+import logging
+import re
+from pathlib import Path
+
+from lhotse import CutSet, SupervisionSegment
+from lhotse.recipes.utils import read_manifests_if_cached
+
+# Similar text filtering and normalization procedure as in:
+# https://github.com/SpeechColab/GigaSpeech/blob/main/toolkits/kaldi/gigaspeech_data_prep.sh
+
+
+def normalize_text(
+    utt: str,
+    punct_pattern=re.compile(r"<(COMMA|PERIOD|QUESTIONMARK|EXCLAMATIONPOINT)>"),
+    whitespace_pattern=re.compile(r"\s\s+"),
+) -> str:
+    return whitespace_pattern.sub(" ", punct_pattern.sub("", utt))
+
+
+def has_no_oov(
+    sup: SupervisionSegment,
+    oov_pattern=re.compile(r"<(SIL|MUSIC|NOISE|OTHER)>"),
+) -> bool:
+    return oov_pattern.search(sup.text) is None
+
+
+def preprocess_giga_speech():
+    src_dir = Path("data/manifests")
+    output_dir = Path("data/fbank")
+    output_dir.mkdir(exist_ok=True)
+
+    dataset_parts = (
+        "DEV",
+        "TEST",
+        "XL",
+    )
+
+    logging.info("Loading manifest (may take 4 minutes)")
+    manifests = read_manifests_if_cached(
+        dataset_parts=dataset_parts,
+        output_dir=src_dir,
+        prefix="gigaspeech",
+        suffix="jsonl.gz",
+    )
+    assert manifests is not None
+
+    for partition, m in manifests.items():
+        logging.info(f"Processing {partition}")
+        raw_cuts_path = output_dir / f"cuts_{partition}_raw.jsonl.gz"
+        if raw_cuts_path.is_file():
+            logging.info(f"{partition} already exists - skipping")
+            continue
+
+        # Note: this step makes the recipe different from LibriSpeech:
+        # We must filter out some utterances and remove punctuation
+        # to be consistent with Kaldi.
+        logging.info("Filtering OOV utterances from supervisions")
+        m["supervisions"] = m["supervisions"].filter(has_no_oov)
+        logging.info(f"Normalizing text in {partition}")
+        for sup in m["supervisions"]:
+            sup.text = normalize_text(sup.text)
+
+        # Create long-recording cut manifests.
+        logging.info(f"Processing {partition}")
+        cut_set = CutSet.from_manifests(
+            recordings=m["recordings"],
+            supervisions=m["supervisions"],
+        )
+        # Run data augmentation that needs to be done in the
+        # time domain.
+        if partition not in ["DEV", "TEST"]:
+            logging.info(
+                f"Speed perturb for {partition} with factors 0.9 and 1.1 "
+                "(Perturbing may take 8 minutes and saving may take 20 minutes)"
+            )
+            cut_set = (
+                cut_set
+                + cut_set.perturb_speed(0.9)
+                + cut_set.perturb_speed(1.1)
+            )
+        logging.info(f"Saving to {raw_cuts_path}")
+        cut_set.to_file(raw_cuts_path)
+
+
+def main():
+    formatter = (
+        "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
+    )
+    logging.basicConfig(format=formatter, level=logging.INFO)
+
+    preprocess_giga_speech()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/egs/gigaspeech/ASR/prepare.sh b/egs/gigaspeech/ASR/prepare.sh
index 9f10414a2..48a5f880a 100755
--- a/egs/gigaspeech/ASR/prepare.sh
+++ b/egs/gigaspeech/ASR/prepare.sh
@@ -6,6 +6,10 @@ nj=15
 stage=0
 stop_stage=100
 
+# Split the XL subset into this number of pieces.
+# This is to avoid OOM during feature extraction.
+num_splits=1000
+
 # We assume dl_dir (download dir) contains the following
 # directories and files. If not, they will be downloaded
 # by this script automatically.
@@ -30,10 +34,8 @@ dl_dir=$PWD/download
 # It will generate data/lang_bpe_xxx,
 # data/lang_bpe_yyy if the array contains xxx, yyy
 vocab_sizes=(
-  5000
-  # 2000
-  # 1000
-  # 500
+  # 5000
+  500
 )
 
 # All files generated by this script are saved in "data".
@@ -92,7 +94,7 @@ if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
 fi
 
 if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
-  log "Stage 1: Prepare GigaSpeech manifest"
+  log "Stage 1: Prepare GigaSpeech manifest (may take 15 minutes)"
   # We assume that you have downloaded the GigaSpeech corpus
   # to $dl_dir/GigaSpeech
   mkdir -p data/manifests
@@ -109,27 +111,51 @@ if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
 fi
 
 if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
-  log "Stage 3: Compute fbank for GigaSpeech"
-  mkdir -p data/fbank
-  # We assume you have a GPU card and implement CUDA extraction here.
-  # Since without CUDA it would take too much time to compute feats
-  # for L or XL subset, we recommend --precomputed-features False.
-  #
-  # We assume you have install kaldifeat, if not, please install
-  # it using: pip install kaldifeat
-  ./local/compute_fbank_gigaspeech.py --precomputed-features True \
-    --num-workers 4 --batch-duration 600.0 \
-    --context-window 0.0 --context-direction center
+  log "Stage 3: Preprocess GigaSpeech manifest"
+  if [ ! -f data/fbank/.preprocess_complete ]; then
+    python3 ./local/preprocess_gigaspeech.py
+    touch data/fbank/.preprocess_complete
+  fi
 fi
 
 if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
-  log "Stage 4: Compute fbank for musan"
+  log "Stage 4: Compute features for DEV and TEST subsets of GigaSpeech (may take 2 minutes)"
+  python3 ./local/compute_fbank_gigaspeech_dev_test.py
+fi
+
+if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
+  log "Stage 5: Split XL subset into ${num_splits} pieces (may take 30 minutes)"
+  split_dir=data/fbank/XL_split_${num_splits}
+  if [ ! -f $split_dir/.split_completed ]; then
+    lhotse split $num_splits ./data/fbank/cuts_XL_raw.jsonl.gz $split_dir
+    touch $split_dir/.split_completed
+  fi
+fi
+
+if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
+  log "Stage 6: Compute features for XL"
+  python3 ./local/compute_fbank_gigaspeech_splits.py \
+    --num-workers 20 \
+    --batch-duration 600 \
+    --num-splits $num_splits
+fi
+
+if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
+  log "Stage 7: Combine features for XL"
+  if [ ! -f data/fbank/XL_split_${num_splits}/cuts_XL.jsonl.gz ]; then
+    pieces=$(find data/fbank/XL_split_${num_splits} -name "cuts_XL.*.jsonl.gz")
+    lhotse combine $pieces data/fbank/XL_split_${num_splits}/cuts_XL.jsonl.gz
+  fi
+fi
+
+if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then
+  log "Stage 8: Compute fbank for musan"
   mkdir -p data/fbank
   ./local/compute_fbank_musan.py
 fi
 
-if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
-  log "Stage 5: Prepare phone based lang"
+if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then
+  log "Stage 9: Prepare phone based lang"
   lang_dir=data/lang_phone
   mkdir -p $lang_dir
 
@@ -189,8 +215,8 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
   mv $lang_dir/words $lang_dir/words.txt
 fi
 
-if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
-  log "Stage 6: Prepare BPE based lang"
+if [ $stage -le 10 ] && [ $stop_stage -ge 10 ]; then
+  log "Stage 10: Prepare BPE based lang"
 
   for vocab_size in ${vocab_sizes[@]}; do
     lang_dir=data/lang_bpe_${vocab_size}
@@ -220,8 +246,8 @@ if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
   done
 fi
 
-if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
-  log "Stage 7: Prepare bigram P"
+if [ $stage -le 11 ] && [ $stop_stage -ge 11 ]; then
+  log "Stage 11: Prepare bigram P"
 
   for vocab_size in ${vocab_sizes[@]}; do
     lang_dir=data/lang_bpe_${vocab_size}
@@ -251,8 +277,8 @@ if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
   done
 fi
 
-if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then
-  log "Stage 8: Prepare G"
+if [ $stage -le 12 ] && [ $stop_stage -ge 12 ]; then
+  log "Stage 12: Prepare G"
   # We assume you have install kaldilm, if not, please install
   # it using: pip install kaldilm
 
@@ -290,8 +316,8 @@ if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then
   fi
 fi
 
-if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then
-  log "Stage 9: Compile HLG"
+if [ $stage -le 13 ] && [ $stop_stage -ge 13 ]; then
+  log "Stage 13: Compile HLG"
   # ./local/compile_hlg.py --lang-dir data/lang_phone
 
   for vocab_size in ${vocab_sizes[@]}; do