From 65fd98174727e97968680cab18ee81948767b276 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Wed, 20 Apr 2022 17:21:31 +0800 Subject: [PATCH] Disable speed perturb for XL subset. --- .../local/compute_fbank_gigaspeech_splits.py | 13 ++++++----- .../ASR/local/preprocess_gigaspeech.py | 23 +++++++++++-------- egs/librispeech/ASR/prepare_giga_speech.sh | 9 ++++---- 3 files changed, 25 insertions(+), 20 deletions(-) diff --git a/egs/librispeech/ASR/local/compute_fbank_gigaspeech_splits.py b/egs/librispeech/ASR/local/compute_fbank_gigaspeech_splits.py index 13fd9d963..a7ed2467d 100644 --- a/egs/librispeech/ASR/local/compute_fbank_gigaspeech_splits.py +++ b/egs/librispeech/ASR/local/compute_fbank_gigaspeech_splits.py @@ -18,15 +18,12 @@ import argparse import logging +import os from datetime import datetime from pathlib import Path import torch -from lhotse import ( - CutSet, - KaldifeatFbank, - KaldifeatFbankConfig, -) +from lhotse import CutSet, KaldifeatFbank, KaldifeatFbankConfig # Torch's multithreaded behavior needs to be disabled or # it wastes a lot of CPU and slow things down. 
@@ -99,8 +96,9 @@ def compute_fbank_gigaspeech_splits(args): extractor = KaldifeatFbank(KaldifeatFbankConfig(device=device)) logging.info(f"device: {device}") + num_digits = 8 # num_digits is fixed by lhotse split-lazy for i in range(start, stop): - idx = i + idx = f"{i + 1}".zfill(num_digits) logging.info(f"Processing {idx}/{num_splits}") cuts_path = output_dir / f"cuts_XL.{idx}.jsonl.gz" @@ -117,6 +115,9 @@ def compute_fbank_gigaspeech_splits(args): cut_set = CutSet.from_file(raw_cuts_path) logging.info("Computing features") + if (output_dir / f"feats_XL_{idx}.lca").exists(): + logging.info(f"Removing {output_dir}/feats_XL_{idx}.lca") + os.remove(output_dir / f"feats_XL_{idx}.lca") cut_set = cut_set.compute_and_store_features_batch( extractor=extractor, diff --git a/egs/librispeech/ASR/local/preprocess_gigaspeech.py b/egs/librispeech/ASR/local/preprocess_gigaspeech.py index 01229d85a..474f7b32f 100644 --- a/egs/librispeech/ASR/local/preprocess_gigaspeech.py +++ b/egs/librispeech/ASR/local/preprocess_gigaspeech.py @@ -91,16 +91,19 @@ def preprocess_giga_speech(): ) # Run data augmentation that needs to be done in the # time domain. - if partition not in ["DEV", "TEST"]: - logging.info( - f"Speed perturb for {partition} with factors 0.9 and 1.1 " - "(Perturbing may take 8 minutes and saving may take 20 minutes)" - ) - cut_set = ( - cut_set - + cut_set.perturb_speed(0.9) - + cut_set.perturb_speed(1.1) - ) + # if partition not in ["DEV", "TEST"]: + # logging.info( + # f"Speed perturb for {partition} with factors 0.9 and 1.1 " + # "(Perturbing may take 8 minutes and saving may take 20 minutes)" + # ) + # cut_set = ( + # cut_set + # + cut_set.perturb_speed(0.9) + # + cut_set.perturb_speed(1.1) + # ) + # + # Note: No need to perturb the training subset as not all of the + # data is going to be used in the training. 
logging.info(f"Saving to {raw_cuts_path}") cut_set.to_file(raw_cuts_path) diff --git a/egs/librispeech/ASR/prepare_giga_speech.sh b/egs/librispeech/ASR/prepare_giga_speech.sh index 16316aa29..26b921eab 100755 --- a/egs/librispeech/ASR/prepare_giga_speech.sh +++ b/egs/librispeech/ASR/prepare_giga_speech.sh @@ -28,10 +28,10 @@ stop_stage=100 # This is to avoid OOM during feature extraction. num_splits=2000 # We use lazy split from lhotse. -# The XL subset contains 113916 cuts after speed perturbing with factors -# 0.9 and 1.1. We want to split it into 2000 splits, so each split -# contains about 113916 / 2000 = 57 cuts. As a result, there will be 1999 splits. -chunk_size=57 # number of cuts in each split. The last split may contain fewer cuts. +# The XL subset (10k hours) contains 37956 cuts without speed perturbing. +# We want to split it into 2000 splits, so each split +# contains about 37956 / 2000 = 19 cuts. As a result, there will be 1998 splits. +chunk_size=19 # number of cuts in each split. The last split may contain fewer cuts. dl_dir=$PWD/download @@ -130,6 +130,7 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then touch $split_dir/.split_completed fi fi + if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then log "Stage 5: Compute features for XL" # Note: The script supports --start and --stop options.