icefall/egs/mls_english/ASR/local/utils/generate_transcript.py
2025-04-16 08:05:05 +09:00

92 lines
2.7 KiB
Python

#!/usr/bin/env python3
# Copyright 2022 The University of Electro-Communications (Author: Teo Wen Shen) # noqa
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import logging
from pathlib import Path
from typing import Optional
from lhotse import CutSet
from tqdm import tqdm
def get_args():
    """Parse the command-line options of the transcript generator.

    Returns:
        The ``argparse.Namespace`` produced from ``sys.argv`` with the
        attributes ``dataset_path`` (str), ``lang_dir`` (Path) and
        ``split`` (str).
    """
    # Each entry is (flag, keyword arguments forwarded to add_argument).
    option_table = (
        (
            "--dataset-path",
            {
                "type": str,
                "default": "parler-tts/mls_eng",
                "help": "Path to HuggingFace MLS English dataset (name or local path)",
            },
        ),
        (
            "--lang-dir",
            {
                "type": Path,
                "default": Path("data/lang"),
                "help": "Directory to store output transcripts",
            },
        ),
        (
            "--split",
            {
                "type": str,
                "default": "train",
                "help": "Dataset split to use for generating transcripts (train/dev/test)",
            },
        ),
    )

    cli = argparse.ArgumentParser(
        description="Generate transcripts for BPE training from MLS English dataset",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)

    return cli.parse_args()
def generate_transcript_from_cuts(cuts: CutSet, output_file: Path) -> None:
    """Write the text of every supervision in ``cuts`` to ``output_file``.

    One line is written per supervision, in iteration order. Supervisions
    whose ``text`` is ``None`` are skipped so that the literal string
    "None" never ends up in the transcript.

    Args:
        cuts: Cuts whose supervisions supply the transcript text.
        output_file: Destination text file; overwritten if it already exists.
    """
    # Pin the encoding explicitly: without it open() falls back to the
    # platform's locale encoding, which is not UTF-8 everywhere.
    with open(output_file, "w", encoding="utf-8") as f:
        for cut in tqdm(cuts, desc="Processing cuts"):
            for sup in cut.supervisions:
                if sup.text is None:
                    # Formatting None would emit a bogus "None" line.
                    continue
                f.write(f"{sup.text}\n")
def main():
    """Load the requested dataset split and dump its transcripts to disk.

    Steps, in order: parse the command line, configure INFO-level logging,
    create the language directory if needed, load the split with
    ``CutSet.from_huggingface_dataset`` (using the "transcript" column as
    text), and write ``transcript.txt`` inside the language directory.

    Raises:
        Exception: Any error raised while loading the dataset is logged
            and then re-raised unchanged.
    """
    args = get_args()

    log_format = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    logging.basicConfig(format=log_format, level=logging.INFO)

    # Make sure the destination directory exists before anything is written.
    lang_dir = args.lang_dir
    lang_dir.mkdir(parents=True, exist_ok=True)
    output_file = lang_dir / "transcript.txt"

    logging.info(f"Loading {args.split} split from dataset: {args.dataset_path}")
    load_options = {"split": args.split, "text_key": "transcript"}
    try:
        cuts = CutSet.from_huggingface_dataset(args.dataset_path, **load_options)
    except Exception as e:
        # Record the failure in the log, then let the caller see the error.
        logging.error(f"Failed to load dataset: {e}")
        raise

    logging.info(f"Generating transcript to {output_file}")
    generate_transcript_from_cuts(cuts, output_file)
    logging.info("Transcript generation completed")
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()