diff --git a/egs/tedlium3/ASR/local/prepare_lexicon.py b/egs/tedlium3/ASR/local/prepare_lexicon.py index 59377b5aa..35dd332e8 100755 --- a/egs/tedlium3/ASR/local/prepare_lexicon.py +++ b/egs/tedlium3/ASR/local/prepare_lexicon.py @@ -23,8 +23,8 @@ consisting of supervisions_train.json and does the following: 1. Generate lexicon_words.txt. """ +import lhotse import argparse -import json import logging from pathlib import Path @@ -60,20 +60,17 @@ def prepare_lexicon(manifests_dir: str, lang_dir: str): """ words = set() - supervisions_train = Path(manifests_dir) / "supervisions_train.json" lexicon = Path(lang_dir) / "lexicon_words.txt" + sups = lhotse.load_manifest( + f"{manifests_dir}/tedlium_supervisions_train.jsonl.gz" + ) + for s in sups: + # list the words units and filter the empty item + words_list = list(filter(None, s.text.split())) - logging.info(f"Loading {supervisions_train}!") - with open(supervisions_train, "r") as load_f: - load_dicts = json.load(load_f) - for load_dict in load_dicts: - text = load_dict["text"] - # list the words units and filter the empty item - words_list = list(filter(None, text.split())) - - for word in words_list: - if word not in words and word != "": - words.add(word) + for word in words_list: + if word not in words and word != "": + words.add(word) with open(lexicon, "w") as f: for word in sorted(words): diff --git a/egs/tedlium3/ASR/local/prepare_transcripts.py b/egs/tedlium3/ASR/local/prepare_transcripts.py index 416264ea0..1039ac5bb 100755 --- a/egs/tedlium3/ASR/local/prepare_transcripts.py +++ b/egs/tedlium3/ASR/local/prepare_transcripts.py @@ -23,8 +23,8 @@ consisting of supervisions_train.json and does the following: 1. Generate train.text. """ +import lhotse import argparse -import json import logging from pathlib import Path @@ -60,15 +60,12 @@ def prepare_transcripts(manifests_dir: str, lang_dir: str): """ texts = [] - supervisions_train = Path(manifests_dir) / "supervisions_train.json" train_text = Path(lang_dir) / "train.text" - - logging.info(f"Loading {supervisions_train}!") - with open(supervisions_train, "r") as load_f: - load_dicts = json.load(load_f) - for load_dict in load_dicts: - text = load_dict["text"] - texts.append(text) + sups = lhotse.load_manifest( + f"{manifests_dir}/tedlium_supervisions_train.jsonl.gz" + ) + for s in sups: + texts.append(s.text) with open(train_text, "w") as f: for text in texts: