mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-09 10:02:22 +00:00
Fix TIMIT lexicon generation bug (#456)
This commit is contained in:
parent
d80f29e662
commit
ac9fe5342b
@ -58,15 +58,19 @@ def prepare_lexicon(manifests_dir: str, lang_dir: str):
|
|||||||
Return:
|
Return:
|
||||||
The lexicon.txt file and the train.text in lang_dir.
|
The lexicon.txt file and the train.text in lang_dir.
|
||||||
"""
|
"""
|
||||||
|
import gzip
|
||||||
|
|
||||||
phones = set()
|
phones = set()
|
||||||
|
|
||||||
supervisions_train = Path(manifests_dir) / "supervisions_TRAIN.json"
|
supervisions_train = (
|
||||||
|
Path(manifests_dir) / "timit_supervisions_TRAIN.jsonl.gz"
|
||||||
|
)
|
||||||
lexicon = Path(lang_dir) / "lexicon.txt"
|
lexicon = Path(lang_dir) / "lexicon.txt"
|
||||||
|
|
||||||
logging.info(f"Loading {supervisions_train}!")
|
logging.info(f"Loading {supervisions_train}!")
|
||||||
with open(supervisions_train, "r") as load_f:
|
with gzip.open(supervisions_train, "r") as load_f:
|
||||||
load_dicts = json.load(load_f)
|
for line in load_f.readlines():
|
||||||
for load_dict in load_dicts:
|
load_dict = json.loads(line)
|
||||||
text = load_dict["text"]
|
text = load_dict["text"]
|
||||||
# list the phone units and filter the empty item
|
# list the phone units and filter the empty item
|
||||||
phones_list = list(filter(None, text.split()))
|
phones_list = list(filter(None, text.split()))
|
||||||
|
Loading…
x
Reference in New Issue
Block a user