From ac9fe5342b5bcf2dbba4c4fbb85fdf49920e6dea Mon Sep 17 00:00:00 2001 From: Tiance Wang Date: Thu, 30 Jun 2022 19:13:46 +0800 Subject: [PATCH] Fix TIMIT lexicon generation bug (#456) --- egs/timit/ASR/local/prepare_lexicon.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/egs/timit/ASR/local/prepare_lexicon.py b/egs/timit/ASR/local/prepare_lexicon.py index f0168ebd6..04023a9ab 100644 --- a/egs/timit/ASR/local/prepare_lexicon.py +++ b/egs/timit/ASR/local/prepare_lexicon.py @@ -58,15 +58,19 @@ def prepare_lexicon(manifests_dir: str, lang_dir: str): Return: The lexicon.txt file and the train.text in lang_dir. """ + import gzip + phones = set() - supervisions_train = Path(manifests_dir) / "supervisions_TRAIN.json" + supervisions_train = ( + Path(manifests_dir) / "timit_supervisions_TRAIN.jsonl.gz" + ) lexicon = Path(lang_dir) / "lexicon.txt" logging.info(f"Loading {supervisions_train}!") - with open(supervisions_train, "r") as load_f: - load_dicts = json.load(load_f) - for load_dict in load_dicts: + with gzip.open(supervisions_train, "r") as load_f: + for line in load_f.readlines(): + load_dict = json.loads(line) text = load_dict["text"] # list the phone units and filter the empty item phones_list = list(filter(None, text.split()))