Fix TIMIT lexicon generation bug (#456)

2025-12-11 06:55:27 +00:00 · 2022-06-30 19:13:46 +08:00 · 2022-06-30 19:13:46 +08:00 · ac9fe5342b
commit ac9fe5342b
parent d80f29e662
1 changed files with 8 additions and 4 deletions
--- a/egs/timit/ASR/local/prepare_lexicon.py
+++ b/egs/timit/ASR/local/prepare_lexicon.py
@@ -58,15 +58,19 @@ def prepare_lexicon(manifests_dir: str, lang_dir: str):
    Return:
      The lexicon.txt file and the train.text in lang_dir.
    """
+    import gzip
    phones = set()
-    supervisions_train = Path(manifests_dir) / "supervisions_TRAIN.json"
+    supervisions_train = (
+        Path(manifests_dir) / "timit_supervisions_TRAIN.jsonl.gz"
+    )
    lexicon = Path(lang_dir) / "lexicon.txt"
    logging.info(f"Loading {supervisions_train}!")
-    with open(supervisions_train, "r") as load_f:
+    with gzip.open(supervisions_train, "r") as load_f:
-        load_dicts = json.load(load_f)
+        for line in load_f.readlines():
-        for load_dict in load_dicts:
+            load_dict = json.loads(line)
            text = load_dict["text"]
            # list the phone units and filter the empty item
            phones_list = list(filter(None, text.split()))