Modified prepare_transcripts.py and prepare_lexicon.py of tedlium3 recipe (#567)

2025-12-11 06:55:27 +00:00 · 2022-09-09 21:32:49 -05:00 · 2022-09-09 21:32:49 -05:00 · 9e24642faf
commit 9e24642faf
parent e18fa78c3a
2 changed files with 16 additions and 22 deletions
--- a/egs/tedlium3/ASR/local/prepare_lexicon.py
+++ b/egs/tedlium3/ASR/local/prepare_lexicon.py
@@ -23,8 +23,8 @@ consisting of supervisions_train.json and does the following:
 1. Generate lexicon_words.txt.
 """
 import lhotse
 import argparse
 import json
 import logging
 from pathlib import Path
@@ -60,16 +60,13 @@ def prepare_lexicon(manifests_dir: str, lang_dir: str):
    """
    words = set()
    supervisions_train = Path(manifests_dir) / "supervisions_train.json"
    lexicon = Path(lang_dir) / "lexicon_words.txt"
-
+    sups = lhotse.load_manifest(
-    logging.info(f"Loading {supervisions_train}!")
+        f"{manifests_dir}/tedlium_supervisions_train.jsonl.gz"
-    with open(supervisions_train, "r") as load_f:
+    )
-        load_dicts = json.load(load_f)
+    for s in sups:
        for load_dict in load_dicts:
            text = load_dict["text"]
        # list the words units and filter the empty item
-            words_list = list(filter(None, text.split()))
+        words_list = list(filter(None, s.text.split()))
        for word in words_list:
            if word not in words and word != "<unk>":
--- a/egs/tedlium3/ASR/local/prepare_transcripts.py
+++ b/egs/tedlium3/ASR/local/prepare_transcripts.py
@@ -23,8 +23,8 @@ consisting of supervisions_train.json and does the following:
 1. Generate train.text.
 """
 import lhotse
 import argparse
 import json
 import logging
 from pathlib import Path
@@ -60,15 +60,12 @@ def prepare_transcripts(manifests_dir: str, lang_dir: str):
    """
    texts = []
    supervisions_train = Path(manifests_dir) / "supervisions_train.json"
    train_text = Path(lang_dir) / "train.text"
-
+    sups = lhotse.load_manifest(
-    logging.info(f"Loading {supervisions_train}!")
+        f"{manifests_dir}/tedlium_supervisions_train.jsonl.gz"
-    with open(supervisions_train, "r") as load_f:
+    )
-        load_dicts = json.load(load_f)
+    for s in sups:
-        for load_dict in load_dicts:
+        texts.append(s.text)
            text = load_dict["text"]
            texts.append(text)
    with open(train_text, "w") as f:
        for text in texts: