From 1c4db887476944bd70f17e92f21a796cb36e146d Mon Sep 17 00:00:00 2001
From: yfy62
Date: Fri, 10 Nov 2023 21:17:35 +0800
Subject: [PATCH] gigaspeech: attach discrete tokens to XL supervisions; add
 preprocess_raw_cuts.py

---
 .../attach_discrete_tokens_to_supervisions.py | 20 ++++++++
 .../local/preprocess_gigaspeech.py            | 49 +++++--------------
 .../local/preprocess_raw_cuts.py              | 18 +++++++
 3 files changed, 49 insertions(+), 38 deletions(-)
 create mode 100755 egs/gigaspeech/wavlm_large_l24_kms2000/local/attach_discrete_tokens_to_supervisions.py
 create mode 100644 egs/gigaspeech/wavlm_large_l24_kms2000/local/preprocess_raw_cuts.py

diff --git a/egs/gigaspeech/wavlm_large_l24_kms2000/local/attach_discrete_tokens_to_supervisions.py b/egs/gigaspeech/wavlm_large_l24_kms2000/local/attach_discrete_tokens_to_supervisions.py
new file mode 100755
index 000000000..bd5742f00
--- /dev/null
+++ b/egs/gigaspeech/wavlm_large_l24_kms2000/local/attach_discrete_tokens_to_supervisions.py
@@ -0,0 +1,20 @@
+import jsonlines
+from tqdm import tqdm
+
+# Each line of the quantized-token file is "<utterance-id> <token string>".
+with open(
+    "/mnt/lustre/sjtu/home/yfy62/discrete_token_data/GigaSpeech/xl/wavlm_large_l21_kms2000/out_quantized_sp1.1"
+) as f:
+    discrete_tokens = f.read().splitlines()
+
+discrete_tokens_info = {}
+for discrete_token in discrete_tokens:
+    utt_id, token_str = discrete_token.split(" ", 1)
+    discrete_tokens_info[utt_id] = token_str
+
+# Attach each utterance's token string to its supervision's "custom" field.
+with jsonlines.open("gigaspeech_supervisions_XL.jsonl") as reader:
+    with jsonlines.open("gigaspeech_supervisions_XL_new.jsonl", mode="w") as writer:
+        for obj in tqdm(reader):
+            obj["custom"] = {"discrete_tokens": discrete_tokens_info[obj["id"]]}
+            writer.write(obj)
diff --git a/egs/gigaspeech/wavlm_large_l24_kms2000/local/preprocess_gigaspeech.py b/egs/gigaspeech/wavlm_large_l24_kms2000/local/preprocess_gigaspeech.py
index 0120b577d..cc169d577 100755
--- a/egs/gigaspeech/wavlm_large_l24_kms2000/local/preprocess_gigaspeech.py
+++ b/egs/gigaspeech/wavlm_large_l24_kms2000/local/preprocess_gigaspeech.py
@@ -21,10 +21,11 @@ import re
 from pathlib import Path
 
 import jsonlines
+from tqdm import tqdm
+
 from lhotse import CutSet, SupervisionSegment
 from lhotse.recipes.utils import read_manifests_if_cached
 from lhotse.serialization import open_best
-from tqdm import tqdm
 
 # Similar text filtering and normalization procedure as in:
 # https://github.com/SpeechColab/GigaSpeech/blob/main/toolkits/kaldi/gigaspeech_data_prep.sh
@@ -39,32 +40,26 @@ def normalize_text(
 
 
 def has_no_oov(
-    sup: SupervisionSegment,
-    oov_pattern=re.compile(r"<(SIL|MUSIC|NOISE|OTHER)>"),
+    sup: SupervisionSegment, oov_pattern=re.compile(r"<(SIL|MUSIC|NOISE|OTHER)>"),
 ) -> bool:
     return oov_pattern.search(sup.text) is None
 
 
 def preprocess_gigaspeech():
-    src_dir = Path("data/manifests")
-    output_dir = Path("data/fbank")
+    # src_dir = Path("data/manifests")
+    # output_dir = Path("data/fbank")
+    src_dir = Path(".")
+    output_dir = Path(".")
     output_dir.mkdir(exist_ok=True)
 
-    dataset_parts = (
-        "DEV",
-        "TEST",
-        "M",
-    )
+    dataset_parts = ("XL",)
 
     prefix = "gigaspeech"
     suffix = "jsonl.gz"
 
     logging.info("Loading manifest (may take 1 minutes)")
     manifests = read_manifests_if_cached(
-        dataset_parts=dataset_parts,
-        output_dir=src_dir,
-        prefix=prefix,
-        suffix=suffix,
+        dataset_parts=dataset_parts, output_dir=src_dir, prefix=prefix, suffix=suffix,
     )
     assert manifests is not None
 
@@ -76,7 +71,7 @@
     )
 
     for partition, m in manifests.items():
-        raw_cuts_path = output_dir / f"{prefix}_cuts_{partition}_raw.jsonl.gz"
+        raw_cuts_path = output_dir / f"{prefix}_cuts_{partition}_raw.jsonl"
         if raw_cuts_path.is_file():
logging.info(f"{partition} already exists - skipping") continue @@ -93,8 +88,7 @@ def preprocess_gigaspeech(): # Create long-recording cut manifests. logging.info(f"Preprocessing {partition}") cut_set = CutSet.from_manifests( - recordings=m["recordings"], - supervisions=m["supervisions"], + recordings=m["recordings"], supervisions=m["supervisions"], ) logging.info("About to split cuts into smaller chunks.") @@ -105,27 +99,6 @@ def preprocess_gigaspeech(): logging.info(f"Saving to {raw_cuts_path}") cut_set.to_file(raw_cuts_path) - for partition in dataset_parts: - cuts_path = output_dir / f"{prefix}_cuts_{partition}.jsonl" - if cuts_path.is_file(): - logging.info(f"{partition} already exists - skipping") - continue - - logging.info(f"Processing {partition}") - raw_cuts_path = output_dir / f"{prefix}_cuts_{partition}_raw.jsonl.gz" - with open_best(raw_cuts_path) as reader, jsonlines.open( - cuts_path, "a" - ) as writer: - for cut in reader: - cut = eval(cut) - cut["custom"] = { - "discrete_tokens": cut["supervisions"][0]["custom"][ - "discrete_tokens" - ] - } - del cut["supervisions"][0]["custom"] - writer.write(cut) - def main(): formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s" diff --git a/egs/gigaspeech/wavlm_large_l24_kms2000/local/preprocess_raw_cuts.py b/egs/gigaspeech/wavlm_large_l24_kms2000/local/preprocess_raw_cuts.py new file mode 100644 index 000000000..0e44223d9 --- /dev/null +++ b/egs/gigaspeech/wavlm_large_l24_kms2000/local/preprocess_raw_cuts.py @@ -0,0 +1,18 @@ +import jsonlines +from tqdm import tqdm + +with jsonlines.open("gigaspeech_cuts_XL_raw.jsonl") as reader: + with jsonlines.open("gigaspeech_cuts_XL.jsonl", mode="w") as writer: + for obj in tqdm(reader): + obj["custom"] = { + "discrete_tokens": obj["supervisions"][0]["custom"]["discrete_tokens"] + } + del obj["supervisions"][0]["custom"] + + # Speed perturb + obj["duration"] /= 1.1 + obj["supervisions"][0]["duration"] /= 1.1 + obj["id"] += "_sp1.1" + obj["supervisions"][0]["id"] += "_sp1.1" + + writer.write(obj)