From a1cdf09655ca5b0e39b3b1c97bad0a8f257bd9cc Mon Sep 17 00:00:00 2001 From: Guo Liyong Date: Fri, 5 Nov 2021 21:25:40 +0800 Subject: [PATCH] use KaldifeatFbank extractor --- .../ASR/conformer_ctc/gigaspeech_datamodule.py | 17 +++++++++++++---- egs/librispeech/ASR/prepare_gigaspeech.py | 8 +++++--- 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/egs/librispeech/ASR/conformer_ctc/gigaspeech_datamodule.py b/egs/librispeech/ASR/conformer_ctc/gigaspeech_datamodule.py index ee3b62a36..0698154ea 100644 --- a/egs/librispeech/ASR/conformer_ctc/gigaspeech_datamodule.py +++ b/egs/librispeech/ASR/conformer_ctc/gigaspeech_datamodule.py @@ -9,7 +9,7 @@ from typing import List, Union from torch.utils.data import DataLoader -from lhotse import CutSet, KaldifeatFbank, FbankConfig, load_manifest +from lhotse import CutSet, KaldifeatFbank, KaldifeatFbankConfig, load_manifest from lhotse.dataset import ( BucketingSampler, CutConcatenate, @@ -261,7 +261,10 @@ class GigaSpeechAsrDataModule(DataModule): train = K2SpeechRecognitionDataset( cut_transforms=transforms, input_strategy=OnTheFlyFeatures( - KaldifeatFbank(FbankConfig(num_mel_bins=80)), + # To avoid unexpected GPU OOM issue during training, + # I think using the cpu version is safer + # KaldifeatFbank(KaldifeatFbankConfig(device='cuda')), + KaldifeatFbank(KaldifeatFbankConfig()), num_workers=self.args.giga_num_workers_inner, ), return_cuts=self.args.giga_return_cuts, @@ -316,7 +319,10 @@ class GigaSpeechAsrDataModule(DataModule): validate = K2SpeechRecognitionDataset( cut_transforms=transforms, input_strategy=OnTheFlyFeatures( - KaldifeatFbank(FbankConfig(num_mel_bins=80)), num_workers=8 + # To avoid unexpected GPU OOM issue during training, + # I think using the cpu version is safer + # KaldifeatFbank(KaldifeatFbankConfig(device='cuda')), num_workers=8 + KaldifeatFbank(KaldifeatFbankConfig()), num_workers=8 ), return_cuts=self.args.giga_return_cuts, ) @@ -357,7 +363,10 @@ class GigaSpeechAsrDataModule(DataModule): logging.debug("About to create test dataset") test = K2SpeechRecognitionDataset( input_strategy=( - OnTheFlyFeatures(KaldifeatFbank(FbankConfig(num_mel_bins=80)), num_workers=8) + # To avoid unexpected GPU OOM issue during training, + # I think using the cpu version is safer + # OnTheFlyFeatures(KaldifeatFbank(KaldifeatFbankConfig(device='cuda')), num_workers=8) + OnTheFlyFeatures(KaldifeatFbank(KaldifeatFbankConfig()), num_workers=8) if self.args.giga_on_the_fly_feats else PrecomputedFeatures() ), diff --git a/egs/librispeech/ASR/prepare_gigaspeech.py b/egs/librispeech/ASR/prepare_gigaspeech.py index 22b5aab30..831ebea50 100755 --- a/egs/librispeech/ASR/prepare_gigaspeech.py +++ b/egs/librispeech/ASR/prepare_gigaspeech.py @@ -15,8 +15,8 @@ import torch from gigaspeech_datamodule import get_context_suffix from lhotse import ( CutSet, - Fbank, - FbankConfig, + KaldifeatFbank, + KaldifeatFbankConfig, LilcomHdf5Writer, SupervisionSegment, combine, @@ -183,7 +183,8 @@ def main(): ctx_suffix = get_context_suffix(args, subparser=False) print("Feature extraction:") - extractor = Fbank(FbankConfig(num_mel_bins=80)) + # extractor = Fbank(FbankConfig(num_mel_bins=80)) + extractor = KaldifeatFbank(KaldifeatFbankConfig(device='cuda')) # default config uses 80 mel bins already with get_executor() as ex: # Initialize the executor only once. for partition, manifests in gigaspeech_manifests.items(): raw_cuts_path = output_dir / f"gigaspeech_cuts_{partition}_raw.jsonl.gz" @@ -268,6 +269,7 @@ def main(): storage_path=f"{output_dir}/feats_gigaspeech_{partition}", batch_duration=args.batch_duration, num_workers=args.num_workers, + storage_type=partial(LilcomHdf5Writer, tick_power=-3), )