yfy62 2023-11-10 21:17:35 +08:00
parent 4a9ea0ffc6
commit 1c4db88747
3 changed files with 49 additions and 38 deletions

View File

@@ -0,0 +1,20 @@
+import jsonlines
+from tqdm import tqdm
+
+with open(
+    "/mnt/lustre/sjtu/home/yfy62/discrete_token_data/GigaSpeech/xl/wavlm_large_l21_kms2000/out_quantized_sp1.1"
+) as f:
+    discrete_tokens = f.read().splitlines()
+
+discrete_tokens_info = {}
+for discrete_token in discrete_tokens:
+    discrete_token = discrete_token.split(" ", 1)
+    discrete_tokens_info[discrete_token[0]] = discrete_token[1]
+
+with jsonlines.open("gigaspeech_supervisions_XL.jsonl") as reader:
+    with jsonlines.open("gigaspeech_supervisions_XL_new.jsonl", mode="w") as writer:
+        for obj in tqdm(reader):
+            obj["custom"] = {"discrete_tokens": discrete_tokens_info[obj["id"]]}
+            writer.write(obj)
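
Each line of the quantized-token file pairs an utterance id with its token string, so split(" ", 1) splits only at the first space and keeps the token string intact. A minimal sketch of that parsing step with a made-up input line (the real file's contents are an assumption here):

line = "POD0000000001_S0000001 21 21 837 1104 1104"
utt_id, tokens = line.split(" ", 1)  # split once: id vs. the full token string
assert utt_id == "POD0000000001_S0000001"
assert tokens == "21 21 837 1104 1104"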

View File

@@ -21,10 +21,11 @@ import re
 from pathlib import Path

 import jsonlines
+from tqdm import tqdm
+
 from lhotse import CutSet, SupervisionSegment
 from lhotse.recipes.utils import read_manifests_if_cached
 from lhotse.serialization import open_best
-from tqdm import tqdm

 # Similar text filtering and normalization procedure as in:
 # https://github.com/SpeechColab/GigaSpeech/blob/main/toolkits/kaldi/gigaspeech_data_prep.sh
@@ -39,32 +40,26 @@ def normalize_text(

 def has_no_oov(
-    sup: SupervisionSegment,
-    oov_pattern=re.compile(r"<(SIL|MUSIC|NOISE|OTHER)>"),
+    sup: SupervisionSegment, oov_pattern=re.compile(r"<(SIL|MUSIC|NOISE|OTHER)>"),
 ) -> bool:
     return oov_pattern.search(sup.text) is None

 def preprocess_gigaspeech():
-    src_dir = Path("data/manifests")
-    output_dir = Path("data/fbank")
+    # src_dir = Path("data/manifests")
+    # output_dir = Path("data/fbank")
+    src_dir = Path(".")
+    output_dir = Path(".")
     output_dir.mkdir(exist_ok=True)

-    dataset_parts = (
-        "DEV",
-        "TEST",
-        "M",
-    )
+    dataset_parts = ("XL",)
     prefix = "gigaspeech"
     suffix = "jsonl.gz"

     logging.info("Loading manifest (may take 1 minutes)")
     manifests = read_manifests_if_cached(
-        dataset_parts=dataset_parts,
-        output_dir=src_dir,
-        prefix=prefix,
-        suffix=suffix,
+        dataset_parts=dataset_parts, output_dir=src_dir, prefix=prefix, suffix=suffix,
     )

     assert manifests is not None
@@ -76,7 +71,7 @@ def preprocess_gigaspeech():
     )

     for partition, m in manifests.items():
-        raw_cuts_path = output_dir / f"{prefix}_cuts_{partition}_raw.jsonl.gz"
+        raw_cuts_path = output_dir / f"{prefix}_cuts_{partition}_raw.jsonl"
         if raw_cuts_path.is_file():
             logging.info(f"{partition} already exists - skipping")
             continue
@@ -93,8 +88,7 @@ def preprocess_gigaspeech():
         # Create long-recording cut manifests.
         logging.info(f"Preprocessing {partition}")
         cut_set = CutSet.from_manifests(
-            recordings=m["recordings"],
-            supervisions=m["supervisions"],
+            recordings=m["recordings"], supervisions=m["supervisions"],
         )

         logging.info("About to split cuts into smaller chunks.")
@@ -105,27 +99,6 @@ def preprocess_gigaspeech():
         logging.info(f"Saving to {raw_cuts_path}")
         cut_set.to_file(raw_cuts_path)

-    for partition in dataset_parts:
-        cuts_path = output_dir / f"{prefix}_cuts_{partition}.jsonl"
-        if cuts_path.is_file():
-            logging.info(f"{partition} already exists - skipping")
-            continue
-
-        logging.info(f"Processing {partition}")
-        raw_cuts_path = output_dir / f"{prefix}_cuts_{partition}_raw.jsonl.gz"
-        with open_best(raw_cuts_path) as reader, jsonlines.open(
-            cuts_path, "a"
-        ) as writer:
-            for cut in reader:
-                cut = eval(cut)
-                cut["custom"] = {
-                    "discrete_tokens": cut["supervisions"][0]["custom"][
-                        "discrete_tokens"
-                    ]
-                }
-                del cut["supervisions"][0]["custom"]
-                writer.write(cut)

 def main():
     formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
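
The deleted loop above parsed each JSONL line with eval, which executes arbitrary Python and can also fail on JSON literals such as true or null. A hypothetical rewrite of just that parse-and-rewrite step using json.loads (file names borrowed from the scripts in this commit; this sketch is not part of the commit itself):

import json

with open("gigaspeech_cuts_XL_raw.jsonl") as reader, open(
    "gigaspeech_cuts_XL.jsonl", "w"
) as writer:
    for line in reader:
        cut = json.loads(line)  # parses JSON data only; never executes code
        cut["custom"] = {
            "discrete_tokens": cut["supervisions"][0]["custom"]["discrete_tokens"]
        }
        del cut["supervisions"][0]["custom"]
        writer.write(json.dumps(cut) + "\n")

The commit instead moves this step into the standalone script below, which reads with jsonlines and avoids eval entirely.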

View File

@@ -0,0 +1,18 @@
+import jsonlines
+from tqdm import tqdm
+
+with jsonlines.open("gigaspeech_cuts_XL_raw.jsonl") as reader:
+    with jsonlines.open("gigaspeech_cuts_XL.jsonl", mode="w") as writer:
+        for obj in tqdm(reader):
+            obj["custom"] = {
+                "discrete_tokens": obj["supervisions"][0]["custom"]["discrete_tokens"]
+            }
+            del obj["supervisions"][0]["custom"]
+
+            # Speed perturb
+            obj["duration"] /= 1.1
+            obj["supervisions"][0]["duration"] /= 1.1
+            obj["id"] += "_sp1.1"
+            obj["supervisions"][0]["id"] += "_sp1.1"
+
+            writer.write(obj)
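
Dividing the durations by 1.1 reflects playback at 1.1x speed: an 11.0 s cut becomes 11.0 / 1.1 = 10.0 s, and the "_sp1.1" id suffix follows the convention lhotse uses for speed-perturbed cuts. A small self-check of that arithmetic (the helper name is hypothetical, not part of the commit):

def apply_speed_perturb(obj: dict, factor: float = 1.1) -> dict:
    # Audio played at `factor`x speed is shorter by that same factor.
    obj["duration"] /= factor
    obj["supervisions"][0]["duration"] /= factor
    suffix = f"_sp{factor}"  # e.g. "_sp1.1", matching the script above
    obj["id"] += suffix
    obj["supervisions"][0]["id"] += suffix
    return obj

cut = {"id": "c1", "duration": 11.0, "supervisions": [{"id": "c1-s0", "duration": 11.0}]}
cut = apply_speed_perturb(cut)
assert abs(cut["duration"] - 10.0) < 1e-9  # allow for float rounding
assert cut["id"] == "c1_sp1.1"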