Fix TIMIT lexicon generation bug (#456)

This commit is contained in:
Tiance Wang 2022-06-30 19:13:46 +08:00 committed by GitHub
parent d80f29e662
commit ac9fe5342b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@@ -58,15 +58,19 @@ def prepare_lexicon(manifests_dir: str, lang_dir: str):
Return: Return:
The lexicon.txt file and the train.text in lang_dir. The lexicon.txt file and the train.text in lang_dir.
""" """
import gzip
phones = set() phones = set()
supervisions_train = Path(manifests_dir) / "supervisions_TRAIN.json" supervisions_train = (
Path(manifests_dir) / "timit_supervisions_TRAIN.jsonl.gz"
)
lexicon = Path(lang_dir) / "lexicon.txt" lexicon = Path(lang_dir) / "lexicon.txt"
logging.info(f"Loading {supervisions_train}!") logging.info(f"Loading {supervisions_train}!")
with open(supervisions_train, "r") as load_f: with gzip.open(supervisions_train, "r") as load_f:
load_dicts = json.load(load_f) for line in load_f.readlines():
for load_dict in load_dicts: load_dict = json.loads(line)
text = load_dict["text"] text = load_dict["text"]
# list the phone units and filter the empty item # list the phone units and filter the empty item
phones_list = list(filter(None, text.split())) phones_list = list(filter(None, text.split()))