Disable speed perturbation for the XL subset.

This commit is contained in:
Fangjun Kuang 2022-04-20 17:21:31 +08:00
parent e32641d1df
commit 65fd981747
3 changed files with 25 additions and 20 deletions

View File

@ -18,15 +18,12 @@
import argparse import argparse
import logging import logging
import os
from datetime import datetime from datetime import datetime
from pathlib import Path from pathlib import Path
import torch import torch
from lhotse import ( from lhotse import CutSet, KaldifeatFbank, KaldifeatFbankConfig
CutSet,
KaldifeatFbank,
KaldifeatFbankConfig,
)
# Torch's multithreaded behavior needs to be disabled or # Torch's multithreaded behavior needs to be disabled or
# it wastes a lot of CPU and slows things down. # it wastes a lot of CPU and slows things down.
@ -99,8 +96,9 @@ def compute_fbank_gigaspeech_splits(args):
extractor = KaldifeatFbank(KaldifeatFbankConfig(device=device)) extractor = KaldifeatFbank(KaldifeatFbankConfig(device=device))
logging.info(f"device: {device}") logging.info(f"device: {device}")
num_digits = 8 # num_digits is fixed by lhotse split-lazy
for i in range(start, stop): for i in range(start, stop):
idx = i idx = f"{i + 1}".zfill(num_digits)
logging.info(f"Processing {idx}/{num_splits}") logging.info(f"Processing {idx}/{num_splits}")
cuts_path = output_dir / f"cuts_XL.{idx}.jsonl.gz" cuts_path = output_dir / f"cuts_XL.{idx}.jsonl.gz"
@ -117,6 +115,9 @@ def compute_fbank_gigaspeech_splits(args):
cut_set = CutSet.from_file(raw_cuts_path) cut_set = CutSet.from_file(raw_cuts_path)
logging.info("Computing features") logging.info("Computing features")
if (output_dir / f"feats_XL_{idx}.lca").exists():
logging.info(f"Removing {output_dir}/feats_XL_{idx}.lca")
os.remove(output_dir / f"feats_XL_{idx}.lca")
cut_set = cut_set.compute_and_store_features_batch( cut_set = cut_set.compute_and_store_features_batch(
extractor=extractor, extractor=extractor,

View File

@ -91,16 +91,19 @@ def preprocess_giga_speech():
) )
# Run data augmentation that needs to be done in the # Run data augmentation that needs to be done in the
# time domain. # time domain.
if partition not in ["DEV", "TEST"]: # if partition not in ["DEV", "TEST"]:
logging.info( # logging.info(
f"Speed perturb for {partition} with factors 0.9 and 1.1 " # f"Speed perturb for {partition} with factors 0.9 and 1.1 "
"(Perturbing may take 8 minutes and saving may take 20 minutes)" # "(Perturbing may take 8 minutes and saving may take 20 minutes)"
) # )
cut_set = ( # cut_set = (
cut_set # cut_set
+ cut_set.perturb_speed(0.9) # + cut_set.perturb_speed(0.9)
+ cut_set.perturb_speed(1.1) # + cut_set.perturb_speed(1.1)
) # )
#
# Note: No need to perturb the training subset as not all of the
# data is going to be used in the training.
logging.info(f"Saving to {raw_cuts_path}") logging.info(f"Saving to {raw_cuts_path}")
cut_set.to_file(raw_cuts_path) cut_set.to_file(raw_cuts_path)

View File

@ -28,10 +28,10 @@ stop_stage=100
# This is to avoid OOM during feature extraction. # This is to avoid OOM during feature extraction.
num_splits=2000 num_splits=2000
# We use lazy split from lhotse. # We use lazy split from lhotse.
# The XL subset contains 113916 cuts after speed perturbing with factors # The XL subset (10k hours) contains 37956 cuts without speed perturbing.
# 0.9 and 1.1. We want to split it into 2000 splits, so each split # We want to split it into 2000 splits, so each split
# contains about 113916 / 2000 = 57 cuts. As a result, there will be 1999 splits. # contains about 37956 / 2000 = 19 cuts. As a result, there will be 1998 splits.
chunk_size=57 # number of cuts in each split. The last split may contain fewer cuts. chunk_size=19 # number of cuts in each split. The last split may contain fewer cuts.
dl_dir=$PWD/download dl_dir=$PWD/download
@ -130,6 +130,7 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
touch $split_dir/.split_completed touch $split_dir/.split_completed
fi fi
fi fi
if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
log "Stage 5: Compute features for XL" log "Stage 5: Compute features for XL"
# Note: The script supports --start and --stop options. # Note: The script supports --start and --stop options.