diff --git a/egs/bengaliai_speech/ASR/local/compute_fbank_bengaliai_speech_splits.py b/egs/bengaliai_speech/ASR/local/compute_fbank_bengaliai_speech_splits.py
new file mode 100755
index 000000000..7f5143cfc
--- /dev/null
+++ b/egs/bengaliai_speech/ASR/local/compute_fbank_bengaliai_speech_splits.py
@@ -0,0 +1,152 @@
+#!/usr/bin/env python3
+# Copyright    2023  Xiaomi Corp.  (Yifan Yang)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import logging
+from datetime import datetime
+from pathlib import Path
+
+import torch
+from lhotse import (
+    CutSet,
+    KaldifeatFbank,
+    KaldifeatFbankConfig,
+    LilcomChunkyWriter,
+    set_audio_duration_mismatch_tolerance,
+    set_caching_enabled,
+)
+
+# Torch's multithreaded behavior needs to be disabled, or
+# it wastes a lot of CPU and slows things down.
+# Do this outside of main() in case it needs to take effect
+# even when we are not invoking the main (e.g. when spawning subprocesses).
+torch.set_num_threads(1)
+torch.set_num_interop_threads(1)
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--num-workers",
+        type=int,
+        default=20,
+        help="Number of dataloading workers used for reading the audio.",
+    )
+
+    parser.add_argument(
+        "--batch-duration",
+        type=float,
+        default=600.0,
+        help="The maximum number of audio seconds in a batch. "
+        "Determines batch size dynamically.",
+    )
+
+    parser.add_argument(
+        "--num-splits",
+        type=int,
+        required=True,
+        help="The number of splits of the train subset.",
+    )
+
+    parser.add_argument(
+        "--start",
+        type=int,
+        default=0,
+        help="Process pieces starting from this number (inclusive).",
+    )
+
+    parser.add_argument(
+        "--stop",
+        type=int,
+        default=-1,
+        help="Stop processing pieces at this number (exclusive).",
+    )
+
+    return parser.parse_args()
+
+
+def compute_fbank_bengaliai_speech_splits(args):
+    num_splits = args.num_splits
+    output_dir = "data/fbank/bengaliai_speech_train_split"
+    output_dir = Path(output_dir)
+    assert output_dir.exists(), f"{output_dir} does not exist!"
+
+    num_digits = 8
+
+    start = args.start
+    stop = args.stop
+    if stop < start:
+        stop = num_splits
+
+    stop = min(stop, num_splits)
+
+    device = torch.device("cpu")
+    if torch.cuda.is_available():
+        device = torch.device("cuda", 0)
+    extractor = KaldifeatFbank(KaldifeatFbankConfig(device=device))
+    logging.info(f"device: {device}")
+
+    set_audio_duration_mismatch_tolerance(0.01)  # 10ms tolerance
+    set_caching_enabled(False)
+
+    for i in range(start, stop):
+        idx = f"{i + 1}".zfill(num_digits)
+        logging.info(f"Processing train split: {idx}")
+
+        cuts_path = output_dir / f"bengaliai_speech_cuts_train.{idx}.jsonl.gz"
+        if cuts_path.is_file():
+            logging.info(f"{cuts_path} exists - skipping")
+            continue
+
+        raw_cuts_path = (
+            output_dir / f"bengaliai_speech_cuts_train_raw.{idx}.jsonl.gz"
+        )
+
+        logging.info(f"Loading {raw_cuts_path}")
+        cut_set = CutSet.from_file(raw_cuts_path)
+
+        logging.info("Splitting cuts into smaller chunks.")
+        cut_set = cut_set.trim_to_supervisions(
+            keep_overlapping=False, min_duration=None
+        )
+
+        logging.info("Computing features")
+        cut_set = cut_set.compute_and_store_features_batch(
+            extractor=extractor,
+            storage_path=f"{output_dir}/bengaliai_speech_feats_train_{idx}",
+            num_workers=args.num_workers,
+            batch_duration=args.batch_duration,
+            storage_type=LilcomChunkyWriter,
+            overwrite=True,
+        )
+
+        logging.info(f"Saving to {cuts_path}")
+        cut_set.to_file(cuts_path)
+
+
+def main():
+    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
+
+    logging.basicConfig(format=formatter, level=logging.INFO)
+    args = get_args()
+    logging.info(vars(args))
+    compute_fbank_bengaliai_speech_splits(args)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/egs/bengaliai_speech/ASR/local/compute_fbank_bengaliai_speech_valid_test.py b/egs/bengaliai_speech/ASR/local/compute_fbank_bengaliai_speech_valid_test.py
new file mode 100755
index 000000000..f251d56e6
--- /dev/null
+++ b/egs/bengaliai_speech/ASR/local/compute_fbank_bengaliai_speech_valid_test.py
@@ -0,0 +1,92 @@
+#!/usr/bin/env python3
+# Copyright    2023  Xiaomi Corp.  (authors: Yifan Yang)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+"""
+This file computes fbank features of the Bengali.AI Speech dataset.
+It looks for manifests in the directory data/manifests.
+
+The generated fbank features are saved in data/fbank.
+"""
+
+import argparse
+import logging
+import os
+from pathlib import Path
+from typing import Optional
+
+import torch
+from lhotse import CutSet, KaldifeatFbank, KaldifeatFbankConfig, LilcomChunkyWriter
+
+# Torch's multithreaded behavior needs to be disabled, or
+# it wastes a lot of CPU and slows things down.
+# Do this outside of main() in case it needs to take effect
+# even when we are not invoking the main (e.g. when spawning subprocesses).
+torch.set_num_threads(1)
+torch.set_num_interop_threads(1)
+
+
+def compute_fbank_bengaliai_speech_valid_test():
+    src_dir = Path("data/manifests")
+    output_dir = Path("data/fbank")
+    num_workers = 42
+    batch_duration = 600
+
+    subsets = ("valid", "test")
+
+    device = torch.device("cpu")
+    if torch.cuda.is_available():
+        device = torch.device("cuda", 0)
+    extractor = KaldifeatFbank(KaldifeatFbankConfig(device=device))
+
+    logging.info(f"device: {device}")
+
+    for partition in subsets:
+        cuts_path = output_dir / f"bengaliai_speech_cuts_{partition}.jsonl.gz"
+        if cuts_path.is_file():
+            logging.info(f"{partition} already exists - skipping.")
+            continue
+
+        raw_cuts_path = output_dir / f"bengaliai_speech_cuts_{partition}_raw.jsonl.gz"
+
+        logging.info(f"Loading {raw_cuts_path}")
+        cut_set = CutSet.from_file(raw_cuts_path)
+
+        logging.info("Splitting cuts into smaller chunks")
+        cut_set = cut_set.trim_to_supervisions(
+            keep_overlapping=False, min_duration=None
+        )
+
+        logging.info("Computing features")
+        cut_set = cut_set.compute_and_store_features_batch(
+            extractor=extractor,
+            storage_path=f"{output_dir}/bengaliai_speech_feats_{partition}",
+            num_workers=num_workers,
+            batch_duration=batch_duration,
+            storage_type=LilcomChunkyWriter,
+            overwrite=True,
+        )
+
+        logging.info(f"Saving to {cuts_path}")
+        cut_set.to_file(cuts_path)
+
+
+if __name__ == "__main__":
+    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
+
+    logging.basicConfig(format=formatter, level=logging.INFO)
+    compute_fbank_bengaliai_speech_valid_test()
diff --git a/egs/bengaliai_speech/ASR/prepare.sh b/egs/bengaliai_speech/ASR/prepare.sh
index 45931abc4..eb764a59b 100755
--- a/egs/bengaliai_speech/ASR/prepare.sh
+++ b/egs/bengaliai_speech/ASR/prepare.sh
@@ -135,7 +135,7 @@ if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
       --num-workers $nj \
       --batch-duration 600 \
       --start 0 \
-      --num-splits 2000
+      --num-splits 300
     touch data/fbank/.bengaliai_speech_train.done
   fi
 fi
@@ -159,10 +159,8 @@ if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then
   if [ ! -f $lang_dir/transcript_words.txt ]; then
     log "Generate data for BPE training"
     file=$(
-      find "data/fbank/bengaliai_speech_cuts_dirty_raw.jsonl.gz"
-      find "data/fbank/bengaliai_speech_cuts_dirty_sa_raw.jsonl.gz"
-      find "data/fbank/bengaliai_speech_cuts_clean_raw.jsonl.gz"
-      find "data/fbank/bengaliai_speech_cuts_clean_sa_raw.jsonl.gz"
+      find "data/fbank/bengaliai_speech_cuts_train_raw.jsonl.gz"
+      find "data/fbank/bengaliai_speech_cuts_valid_raw.jsonl.gz"
     )
     gunzip -c ${file} | awk -F '"' '{print $30}' > $lang_dir/transcript_words.txt
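
Usage note (not part of the patch): a minimal sketch of how the two new scripts are invoked, based on the defaults hard-coded above and the updated stage 6 of prepare.sh. The paths and the --num-splits value of 300 come from the patch itself; adjust --num-workers to your machine.

    # Fbanks for the valid/test manifests; input/output paths are
    # hard-coded inside the script (data/fbank/*_raw.jsonl.gz).
    ./local/compute_fbank_bengaliai_speech_valid_test.py

    # Fbanks for the train subset, assuming the raw train cuts were
    # already split 300 ways into data/fbank/bengaliai_speech_train_split.
    ./local/compute_fbank_bengaliai_speech_splits.py \
      --num-workers 20 \
      --batch-duration 600 \
      --num-splits 300 \
      --start 0 \
      --stop -1

With --start/--stop, disjoint ranges of the 300 pieces can be processed in parallel (e.g. on several machines); each piece writes bengaliai_speech_cuts_train.<8-digit index>.jsonl.gz and is skipped on re-runs if that file already exists.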