mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-11 19:12:30 +00:00
78 lines
2.1 KiB
Python
78 lines
2.1 KiB
Python
#!/usr/bin/env python3
|
|
# Copyright 2022 The University of Electro-Communications (Author: Teo Wen Shen) # noqa
|
|
#
|
|
# See ../../../../LICENSE for clarification regarding multiple authors
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
|
|
import argparse
|
|
import logging
|
|
from pathlib import Path
|
|
|
|
from lhotse import CutSet
|
|
from asr_datamodule import MLSEnglishHFAsrDataModule
|
|
|
|
from tqdm import tqdm
|
|
|
|
def get_args(argv=None):
    """Parse the command-line options of this script.

    Args:
        argv: Optional list of argument strings to parse. When left as
            ``None`` the arguments are taken from ``sys.argv[1:]``, which
            is what a plain ``get_args()`` call has always done.

    Returns:
        The ``argparse.Namespace`` produced by the parser. Its ``lang_dir``
        attribute is a ``pathlib.Path`` and defaults to ``data/lang``.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    parser.add_argument(
        "--lang-dir",
        type=Path,
        default=Path("data/lang"),
        help=(
            "Name of lang dir. "
            "If not set, this will default to data/lang"
        ),
    )

    return parser.parse_args(argv)
|
|
|
|
|
|
def main():
    """Dump the text of every training supervision to a transcript file.

    The command-line arguments are parsed, logging is configured at INFO
    level, the MLS English data module is built from the parsed arguments
    and asked to load its Hugging Face dataset, and then the text of each
    supervision in the train cuts is written, one per line, to
    ``<lang-dir>/transcript.txt``.
    """
    args = get_args()
    logging.basicConfig(
        format=("%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"),
        level=logging.INFO,
    )

    mls_english_corpus = MLSEnglishHFAsrDataModule(args)
    # NOTE(review): the dataset location is hard-coded to a path under /root;
    # consider exposing it as a command-line option.
    mls_english_corpus.load_hf_dataset("/root/datasets/parler-tts--mls_eng")

    train_cuts = mls_english_corpus.train_cuts()

    logging.info("Creating transcript from MLS English train cut.")

    def generate_text(train_cuts):
        # Stream the lines lazily so the whole transcript is never held in
        # memory; tqdm reports progress over the cuts as they are consumed.
        for cut in tqdm(train_cuts):
            for sup in cut.supervisions:
                yield sup.text + "\n"

    # Create the output directory up front so a fresh checkout does not fail
    # with FileNotFoundError when the transcript is opened for writing.
    args.lang_dir.mkdir(parents=True, exist_ok=True)

    # Pin the encoding so the transcript does not depend on the locale.
    with open(args.lang_dir / "transcript.txt", "w", encoding="utf-8") as file:
        file.writelines(generate_text(train_cuts))

    logging.info("Done.")
|
|
|
|
|
|
# Run the transcript generation only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()
|