Modified prepare_transcripts.py and prepare_lexicon.py of tedlium3 recipe (#567)

This commit is contained in:
shcxlee 2022-09-09 21:32:49 -05:00 committed by GitHub
parent e18fa78c3a
commit 9e24642faf
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 16 additions and 22 deletions

View File

@ -23,8 +23,8 @@ consisting of supervisions_train.json and does the following:
1. Generate lexicon_words.txt.
"""
import lhotse
import argparse
import json
import logging
from pathlib import Path
@ -60,20 +60,17 @@ def prepare_lexicon(manifests_dir: str, lang_dir: str):
"""
words = set()
supervisions_train = Path(manifests_dir) / "supervisions_train.json"
lexicon = Path(lang_dir) / "lexicon_words.txt"
sups = lhotse.load_manifest(
f"{manifests_dir}/tedlium_supervisions_train.jsonl.gz"
)
for s in sups:
# list the words units and filter the empty item
words_list = list(filter(None, s.text.split()))
logging.info(f"Loading {supervisions_train}!")
with open(supervisions_train, "r") as load_f:
load_dicts = json.load(load_f)
for load_dict in load_dicts:
text = load_dict["text"]
# list the words units and filter the empty item
words_list = list(filter(None, text.split()))
for word in words_list:
if word not in words and word != "<unk>":
words.add(word)
for word in words_list:
if word not in words and word != "<unk>":
words.add(word)
with open(lexicon, "w") as f:
for word in sorted(words):

View File

@ -23,8 +23,8 @@ consisting of supervisions_train.json and does the following:
1. Generate train.text.
"""
import lhotse
import argparse
import json
import logging
from pathlib import Path
@ -60,15 +60,12 @@ def prepare_transcripts(manifests_dir: str, lang_dir: str):
"""
texts = []
supervisions_train = Path(manifests_dir) / "supervisions_train.json"
train_text = Path(lang_dir) / "train.text"
logging.info(f"Loading {supervisions_train}!")
with open(supervisions_train, "r") as load_f:
load_dicts = json.load(load_f)
for load_dict in load_dicts:
text = load_dict["text"]
texts.append(text)
sups = lhotse.load_manifest(
f"{manifests_dir}/tedlium_supervisions_train.jsonl.gz"
)
for s in sups:
texts.append(s.text)
with open(train_text, "w") as f:
for text in texts: