mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-09 01:52:41 +00:00
Modified prepare_transcripts.py and prepare_lexicon.py of tedlium3 recipe (#567)
This commit is contained in:
parent
e18fa78c3a
commit
9e24642faf
@ -23,8 +23,8 @@ consisting of supervisions_train.json and does the following:
|
||||
1. Generate lexicon_words.txt.
|
||||
|
||||
"""
|
||||
import lhotse
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
@ -60,20 +60,17 @@ def prepare_lexicon(manifests_dir: str, lang_dir: str):
|
||||
"""
|
||||
words = set()
|
||||
|
||||
supervisions_train = Path(manifests_dir) / "supervisions_train.json"
|
||||
lexicon = Path(lang_dir) / "lexicon_words.txt"
|
||||
sups = lhotse.load_manifest(
|
||||
f"{manifests_dir}/tedlium_supervisions_train.jsonl.gz"
|
||||
)
|
||||
for s in sups:
|
||||
# list the words units and filter the empty item
|
||||
words_list = list(filter(None, s.text.split()))
|
||||
|
||||
logging.info(f"Loading {supervisions_train}!")
|
||||
with open(supervisions_train, "r") as load_f:
|
||||
load_dicts = json.load(load_f)
|
||||
for load_dict in load_dicts:
|
||||
text = load_dict["text"]
|
||||
# list the words units and filter the empty item
|
||||
words_list = list(filter(None, text.split()))
|
||||
|
||||
for word in words_list:
|
||||
if word not in words and word != "<unk>":
|
||||
words.add(word)
|
||||
for word in words_list:
|
||||
if word not in words and word != "<unk>":
|
||||
words.add(word)
|
||||
|
||||
with open(lexicon, "w") as f:
|
||||
for word in sorted(words):
|
||||
|
@ -23,8 +23,8 @@ consisting of supervisions_train.json and does the following:
|
||||
1. Generate train.text.
|
||||
|
||||
"""
|
||||
import lhotse
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
@ -60,15 +60,12 @@ def prepare_transcripts(manifests_dir: str, lang_dir: str):
|
||||
"""
|
||||
texts = []
|
||||
|
||||
supervisions_train = Path(manifests_dir) / "supervisions_train.json"
|
||||
train_text = Path(lang_dir) / "train.text"
|
||||
|
||||
logging.info(f"Loading {supervisions_train}!")
|
||||
with open(supervisions_train, "r") as load_f:
|
||||
load_dicts = json.load(load_f)
|
||||
for load_dict in load_dicts:
|
||||
text = load_dict["text"]
|
||||
texts.append(text)
|
||||
sups = lhotse.load_manifest(
|
||||
f"{manifests_dir}/tedlium_supervisions_train.jsonl.gz"
|
||||
)
|
||||
for s in sups:
|
||||
texts.append(s.text)
|
||||
|
||||
with open(train_text, "w") as f:
|
||||
for text in texts:
|
||||
|
Loading…
x
Reference in New Issue
Block a user