Fix problem with generating lexicon

The manifest file is stored in gzip-compressed JSON Lines (.jsonl.gz) format, not as a plain JSON file.
2025-09-18 21:44:18 +00:00 · 2022-06-30 17:32:32 +08:00 · 2022-06-30 17:32:32 +08:00 · d550125fc1
commit d550125fc1
parent 662b9c2e2e
1 changed files with 5 additions and 4 deletions
--- a/egs/timit/ASR/local/prepare_lexicon.py
+++ b/egs/timit/ASR/local/prepare_lexicon.py
@@ -58,15 +58,16 @@ def prepare_lexicon(manifests_dir: str, lang_dir: str):
    Return:
      The lexicon.txt file and the train.text in lang_dir.
    """
+    import gzip
    phones = set()
-    supervisions_train = Path(manifests_dir) / "supervisions_TRAIN.json"
+    supervisions_train = Path(manifests_dir) / "timit_supervisions_TRAIN.jsonl.gz"
    lexicon = Path(lang_dir) / "lexicon.txt"
    logging.info(f"Loading {supervisions_train}!")
-    with open(supervisions_train, "r") as load_f:
+    with gzip.open(supervisions_train, "r") as load_f:
-        load_dicts = json.load(load_f)
+        for line in load_f.readlines():
-        for load_dict in load_dicts:
+            load_dict = json.loads(line)
            text = load_dict["text"]
            # list the phone units and filter the empty item
            phones_list = list(filter(None, text.split()))