mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-09 10:02:22 +00:00
Modified prepare_transcripts.py and prepare_lexicon.py of tedlium3 recipe (#567)
This commit is contained in:
parent
e18fa78c3a
commit
9e24642faf
@ -23,8 +23,8 @@ consisting of supervisions_train.json and does the following:
|
|||||||
1. Generate lexicon_words.txt.
|
1. Generate lexicon_words.txt.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
import lhotse
|
||||||
import argparse
|
import argparse
|
||||||
import json
|
|
||||||
import logging
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
@ -60,16 +60,13 @@ def prepare_lexicon(manifests_dir: str, lang_dir: str):
|
|||||||
"""
|
"""
|
||||||
words = set()
|
words = set()
|
||||||
|
|
||||||
supervisions_train = Path(manifests_dir) / "supervisions_train.json"
|
|
||||||
lexicon = Path(lang_dir) / "lexicon_words.txt"
|
lexicon = Path(lang_dir) / "lexicon_words.txt"
|
||||||
|
sups = lhotse.load_manifest(
|
||||||
logging.info(f"Loading {supervisions_train}!")
|
f"{manifests_dir}/tedlium_supervisions_train.jsonl.gz"
|
||||||
with open(supervisions_train, "r") as load_f:
|
)
|
||||||
load_dicts = json.load(load_f)
|
for s in sups:
|
||||||
for load_dict in load_dicts:
|
|
||||||
text = load_dict["text"]
|
|
||||||
# list the words units and filter the empty item
|
# list the words units and filter the empty item
|
||||||
words_list = list(filter(None, text.split()))
|
words_list = list(filter(None, s.text.split()))
|
||||||
|
|
||||||
for word in words_list:
|
for word in words_list:
|
||||||
if word not in words and word != "<unk>":
|
if word not in words and word != "<unk>":
|
||||||
|
@ -23,8 +23,8 @@ consisting of supervisions_train.json and does the following:
|
|||||||
1. Generate train.text.
|
1. Generate train.text.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
import lhotse
|
||||||
import argparse
|
import argparse
|
||||||
import json
|
|
||||||
import logging
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
@ -60,15 +60,12 @@ def prepare_transcripts(manifests_dir: str, lang_dir: str):
|
|||||||
"""
|
"""
|
||||||
texts = []
|
texts = []
|
||||||
|
|
||||||
supervisions_train = Path(manifests_dir) / "supervisions_train.json"
|
|
||||||
train_text = Path(lang_dir) / "train.text"
|
train_text = Path(lang_dir) / "train.text"
|
||||||
|
sups = lhotse.load_manifest(
|
||||||
logging.info(f"Loading {supervisions_train}!")
|
f"{manifests_dir}/tedlium_supervisions_train.jsonl.gz"
|
||||||
with open(supervisions_train, "r") as load_f:
|
)
|
||||||
load_dicts = json.load(load_f)
|
for s in sups:
|
||||||
for load_dict in load_dicts:
|
texts.append(s.text)
|
||||||
text = load_dict["text"]
|
|
||||||
texts.append(text)
|
|
||||||
|
|
||||||
with open(train_text, "w") as f:
|
with open(train_text, "w") as f:
|
||||||
for text in texts:
|
for text in texts:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user