Modified prepare_transcripts.py and prepare_lexicon.py of tedlium3 recipe (#567)

This commit is contained in:
shcxlee 2022-09-09 21:32:49 -05:00 committed by GitHub
parent e18fa78c3a
commit 9e24642faf
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 16 additions and 22 deletions

View File

@ -23,8 +23,8 @@ consisting of supervisions_train.json and does the following:
1. Generate lexicon_words.txt. 1. Generate lexicon_words.txt.
""" """
import lhotse
import argparse import argparse
import json
import logging import logging
from pathlib import Path from pathlib import Path
@ -60,20 +60,17 @@ def prepare_lexicon(manifests_dir: str, lang_dir: str):
""" """
words = set() words = set()
supervisions_train = Path(manifests_dir) / "supervisions_train.json"
lexicon = Path(lang_dir) / "lexicon_words.txt" lexicon = Path(lang_dir) / "lexicon_words.txt"
sups = lhotse.load_manifest(
f"{manifests_dir}/tedlium_supervisions_train.jsonl.gz"
)
for s in sups:
# list the words units and filter the empty item
words_list = list(filter(None, s.text.split()))
logging.info(f"Loading {supervisions_train}!") for word in words_list:
with open(supervisions_train, "r") as load_f: if word not in words and word != "<unk>":
load_dicts = json.load(load_f) words.add(word)
for load_dict in load_dicts:
text = load_dict["text"]
# list the words units and filter the empty item
words_list = list(filter(None, text.split()))
for word in words_list:
if word not in words and word != "<unk>":
words.add(word)
with open(lexicon, "w") as f: with open(lexicon, "w") as f:
for word in sorted(words): for word in sorted(words):

View File

@ -23,8 +23,8 @@ consisting of supervisions_train.json and does the following:
1. Generate train.text. 1. Generate train.text.
""" """
import lhotse
import argparse import argparse
import json
import logging import logging
from pathlib import Path from pathlib import Path
@ -60,15 +60,12 @@ def prepare_transcripts(manifests_dir: str, lang_dir: str):
""" """
texts = [] texts = []
supervisions_train = Path(manifests_dir) / "supervisions_train.json"
train_text = Path(lang_dir) / "train.text" train_text = Path(lang_dir) / "train.text"
sups = lhotse.load_manifest(
logging.info(f"Loading {supervisions_train}!") f"{manifests_dir}/tedlium_supervisions_train.jsonl.gz"
with open(supervisions_train, "r") as load_f: )
load_dicts = json.load(load_f) for s in sups:
for load_dict in load_dicts: texts.append(s.text)
text = load_dict["text"]
texts.append(text)
with open(train_text, "w") as f: with open(train_text, "w") as f:
for text in texts: for text in texts: