mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-09-18 21:44:18 +00:00
Fix problem with generating lexicon
The manifest file is in gzip-compressed JSON Lines (.jsonl.gz) format, not plain JSON, so it must be opened with gzip and parsed one line at a time.
This commit is contained in:
parent
662b9c2e2e
commit
d550125fc1
@@ -58,15 +58,16 @@ def prepare_lexicon(manifests_dir: str, lang_dir: str):
|
|||||||
Return:
|
Return:
|
||||||
The lexicon.txt file and the train.text in lang_dir.
|
The lexicon.txt file and the train.text in lang_dir.
|
||||||
"""
|
"""
|
||||||
|
import gzip
|
||||||
phones = set()
|
phones = set()
|
||||||
|
|
||||||
supervisions_train = Path(manifests_dir) / "supervisions_TRAIN.json"
|
supervisions_train = Path(manifests_dir) / "timit_supervisions_TRAIN.jsonl.gz"
|
||||||
lexicon = Path(lang_dir) / "lexicon.txt"
|
lexicon = Path(lang_dir) / "lexicon.txt"
|
||||||
|
|
||||||
logging.info(f"Loading {supervisions_train}!")
|
logging.info(f"Loading {supervisions_train}!")
|
||||||
with open(supervisions_train, "r") as load_f:
|
with gzip.open(supervisions_train, "r") as load_f:
|
||||||
load_dicts = json.load(load_f)
|
for line in load_f.readlines():
|
||||||
for load_dict in load_dicts:
|
load_dict = json.loads(line)
|
||||||
text = load_dict["text"]
|
text = load_dict["text"]
|
||||||
# list the phone units and filter the empty item
|
# list the phone units and filter the empty item
|
||||||
phones_list = list(filter(None, text.split()))
|
phones_list = list(filter(None, text.split()))
|
||||||
|
Loading…
x
Reference in New Issue
Block a user