icefall/egs/mls_english/ASR/local/utils/generate_transcript.py
2025-04-11 10:30:08 +09:00

78 lines
2.1 KiB
Python

#!/usr/bin/env python3
# Copyright 2022 The University of Electro-Communications (Author: Teo Wen Shen) # noqa
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import logging
from pathlib import Path
from lhotse import CutSet
from asr_datamodule import MLSEnglishHFAsrDataModule
from tqdm import tqdm
def get_args():
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
# parser.add_argument(
# "train_cut", metavar="train-cut", type=Path, help="Path to the train cut"
# )
parser.add_argument(
"--lang-dir",
type=Path,
default=Path("data/lang"),
help=(
"Name of lang dir. "
"If not set, this will default to data/lang"
),
)
return parser.parse_args()
def main():
args = get_args()
logging.basicConfig(
format=("%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"),
level=logging.INFO,
)
mls_english_corpus = MLSEnglishHFAsrDataModule(args)
mls_english_corpus.load_hf_dataset("/root/datasets/parler-tts--mls_eng")
train_cuts = mls_english_corpus.train_cuts()
logging.info(f"Creating transcript from MLS English train cut.")
def generate_text(train_cuts):
for cut in tqdm(train_cuts):
for sup in cut.supervisions:
yield sup.text + "\n"
with open(args.lang_dir / "transcript.txt", "w") as file:
file.writelines(generate_text(train_cuts))
logging.info("Done.")
if __name__ == "__main__":
main()