icefall/egs/seame/ASR/local/prepare_transcripts.py
2024-04-05 09:58:02 -04:00

54 lines
1.1 KiB
Python
Executable File

# Copyright 2023 Johns Hopkins University (Amir Hussein)
#!/usr/bin/python
"""
This script prepares transcript_words.txt from cutset
"""
from lhotse import CutSet
import argparse
import logging
import pdb
from pathlib import Path
import os
def get_parser():
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument(
"--cut",
type=str,
default="",
help="Cutset file",
)
parser.add_argument(
"--langdir",
type=str,
default="",
help="name of the lang-dir",
)
return parser
def main():
parser = get_parser()
args = parser.parse_args()
logging.info("Reading the cuts")
cuts = CutSet.from_file(args.cut)
langdir = Path(args.langdir)
if not os.path.exists(langdir):
os.makedirs(langdir)
with open(langdir / "transcript_words.txt", 'w') as txt:
for c in cuts:
#breakpoint()
text = c.supervisions[0].text
txt.write(text + '\n')
if __name__ == "__main__":
main()