From 65fd98174727e97968680cab18ee81948767b276 Mon Sep 17 00:00:00 2001
From: Fangjun Kuang <csukuangfj@gmail.com>
Date: Wed, 20 Apr 2022 17:21:31 +0800
Subject: [PATCH] Disable speed perturbation for XL subset.

---
 .../local/compute_fbank_gigaspeech_splits.py  | 13 ++++++-----
 .../ASR/local/preprocess_gigaspeech.py        | 23 +++++++++++--------
 egs/librispeech/ASR/prepare_giga_speech.sh    |  9 ++++----
 3 files changed, 25 insertions(+), 20 deletions(-)

diff --git a/egs/librispeech/ASR/local/compute_fbank_gigaspeech_splits.py b/egs/librispeech/ASR/local/compute_fbank_gigaspeech_splits.py
index 13fd9d963..a7ed2467d 100644
--- a/egs/librispeech/ASR/local/compute_fbank_gigaspeech_splits.py
+++ b/egs/librispeech/ASR/local/compute_fbank_gigaspeech_splits.py
@@ -18,15 +18,12 @@
 
 import argparse
 import logging
+import os
 from datetime import datetime
 from pathlib import Path
 
 import torch
-from lhotse import (
-    CutSet,
-    KaldifeatFbank,
-    KaldifeatFbankConfig,
-)
+from lhotse import CutSet, KaldifeatFbank, KaldifeatFbankConfig
 
 # Torch's multithreaded behavior needs to be disabled or
 # it wastes a lot of CPU and slow things down.
@@ -99,8 +96,9 @@ def compute_fbank_gigaspeech_splits(args):
     extractor = KaldifeatFbank(KaldifeatFbankConfig(device=device))
     logging.info(f"device: {device}")
 
+    num_digits = 8  # num_digits is fixed by lhotse split-lazy
     for i in range(start, stop):
-        idx = i
+        idx = f"{i + 1}".zfill(num_digits)
         logging.info(f"Processing {idx}/{num_splits}")
 
         cuts_path = output_dir / f"cuts_XL.{idx}.jsonl.gz"
@@ -117,6 +115,9 @@ def compute_fbank_gigaspeech_splits(args):
         cut_set = CutSet.from_file(raw_cuts_path)
 
         logging.info("Computing features")
+        if (output_dir / f"feats_XL_{idx}.lca").exists():
+            logging.info(f"Removing {output_dir}/feats_XL_{idx}.lca")
+            os.remove(output_dir / f"feats_XL_{idx}.lca")
 
         cut_set = cut_set.compute_and_store_features_batch(
             extractor=extractor,
diff --git a/egs/librispeech/ASR/local/preprocess_gigaspeech.py b/egs/librispeech/ASR/local/preprocess_gigaspeech.py
index 01229d85a..474f7b32f 100644
--- a/egs/librispeech/ASR/local/preprocess_gigaspeech.py
+++ b/egs/librispeech/ASR/local/preprocess_gigaspeech.py
@@ -91,16 +91,19 @@ def preprocess_giga_speech():
         )
         # Run data augmentation that needs to be done in the
         # time domain.
-        if partition not in ["DEV", "TEST"]:
-            logging.info(
-                f"Speed perturb for {partition} with factors 0.9 and 1.1 "
-                "(Perturbing may take 8 minutes and saving may take 20 minutes)"
-            )
-            cut_set = (
-                cut_set
-                + cut_set.perturb_speed(0.9)
-                + cut_set.perturb_speed(1.1)
-            )
+        #  if partition not in ["DEV", "TEST"]:
+        #      logging.info(
+        #          f"Speed perturb for {partition} with factors 0.9 and 1.1 "
+        #          "(Perturbing may take 8 minutes and saving may take 20 minutes)"
+        #      )
+        #      cut_set = (
+        #          cut_set
+        #          + cut_set.perturb_speed(0.9)
+        #          + cut_set.perturb_speed(1.1)
+        #      )
+        #
+        # Note: Speed perturbation is disabled for the training subset
+        # because not all of its data is going to be used in training.
         logging.info(f"Saving to {raw_cuts_path}")
         cut_set.to_file(raw_cuts_path)
 
diff --git a/egs/librispeech/ASR/prepare_giga_speech.sh b/egs/librispeech/ASR/prepare_giga_speech.sh
index 16316aa29..26b921eab 100755
--- a/egs/librispeech/ASR/prepare_giga_speech.sh
+++ b/egs/librispeech/ASR/prepare_giga_speech.sh
@@ -28,10 +28,10 @@ stop_stage=100
 # This is to avoid OOM during feature extraction.
 num_splits=2000
 # We use lazy split from lhotse.
-# The XL subset contains 113916 cuts after speed perturbing with factors
-# 0.9 and 1.1. We want to split it into 2000 splits, so each split
-# contains about 113916 / 2000 = 57 cuts. As a result, there will be 1999 splits.
-chunk_size=57 # number of cuts in each split. The last split may contain fewer cuts.
+# The XL subset (10k hours) contains 37956 cuts without speed perturbing.
+# We want to split it into 2000 splits, so each split
+# contains about 37956 / 2000 = 19 cuts. As a result, there will be 1998 splits.
+chunk_size=19 # number of cuts in each split. The last split may contain fewer cuts.
 
 dl_dir=$PWD/download
 
@@ -130,6 +130,7 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
     touch $split_dir/.split_completed
   fi
 fi
+
 if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
   log "Stage 5: Compute features for XL"
   # Note: The script supports --start and --stop options.