mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-09 01:52:41 +00:00
191 lines
5.8 KiB
Python
Executable File
191 lines
5.8 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
# Copyright 2022 Xiaomi Corp. (authors: Zengwei Yao)
|
|
#
|
|
# See ../../../../LICENSE for clarification regarding multiple authors
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
|
|
"""
|
|
This file adds alignments from https://github.com/CorentinJ/librispeech-alignments # noqa
|
|
to the existing fbank features dir (e.g., data/fbank)
|
|
and saves the cuts to a new dir (e.g., data/fbank_ali).
|
|
"""
|
|
|
|
import argparse
|
|
import logging
|
|
import zipfile
|
|
from pathlib import Path
|
|
from typing import List
|
|
|
|
from lhotse import CutSet, load_manifest_lazy
|
|
from lhotse.recipes.librispeech import parse_alignments
|
|
from lhotse.utils import is_module_available
|
|
|
|
# Google Drive location of the alignments archive; used as the default
# `alignments_url` of `download_alignments`.
LIBRISPEECH_ALIGNMENTS_URL = (
    "https://drive.google.com/uc?id=1WYfgr31T-PPwMcxuAq09XZfHQO5Mw8fE"
)
|
|
|
|
# LibriSpeech parts processed by default in `add_alignment`.
DATASET_PARTS = [
    "dev-clean",
    "dev-other",
    "test-clean",
    "test-other",
    "train-clean-100",
    "train-clean-360",
    "train-other-500",
]
|
|
|
|
|
|
def get_parser() -> argparse.ArgumentParser:
    """
    Build the command-line parser of this script.

    Returns:
      A parser with the options `--alignments-dir`, `--cuts-in-dir` and
      `--cuts-out-dir`. All of them are strings with defaults, and the
      defaults are shown in the `--help` output.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    parser.add_argument(
        "--alignments-dir",
        type=str,
        default="data/alignment",
        help="The dir to save alignments.",
    )

    parser.add_argument(
        "--cuts-in-dir",
        type=str,
        default="data/fbank",
        help="The dir of the existing cuts without alignments.",
    )

    parser.add_argument(
        "--cuts-out-dir",
        type=str,
        default="data/fbank_ali",
        help="The dir to save the new cuts with alignments",
    )

    return parser
|
|
|
|
|
|
def download_alignments(
    target_dir: str, alignments_url: str = LIBRISPEECH_ALIGNMENTS_URL
):
    """
    Download and extract the alignments.

    Note: If you can not access drive.google.com, you could download the file
    `LibriSpeech-Alignments.zip` from huggingface:
    https://huggingface.co/Zengwei/librispeech-alignments
    and extract the zip file manually.

    Modified from https://github.com/lhotse-speech/lhotse/blob/master/lhotse/recipes/librispeech.py  # noqa

    Args:
      target_dir:
        The dir to save alignments. It is created if it does not exist.
      alignments_url:
        The URL of alignments.

    Raises:
      ImportError:
        If the zip file has to be downloaded but `gdown` is not installed.
    """
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    # Marker file created after a successful extraction; when it is present
    # neither the download nor the extraction is repeated.
    completed_detector = target_dir / ".ali_completed"
    if completed_detector.is_file():
        logging.info("The alignment files already exist.")
        return

    ali_zip_path = target_dir / "LibriSpeech-Alignments.zip"
    if not ali_zip_path.is_file():
        # Raise explicitly instead of using `assert`, which is removed when
        # Python runs with -O.
        if not is_module_available("gdown"):
            raise ImportError(
                'To download LibriSpeech alignments, please install "pip install gdown"'  # noqa
            )
        # Imported lazily so that gdown is only required when a download
        # is actually needed.
        import gdown

        gdown.download(alignments_url, output=str(ali_zip_path))

    with zipfile.ZipFile(str(ali_zip_path)) as f:
        f.extractall(path=target_dir)
    completed_detector.touch()
|
|
|
|
|
|
def add_alignment(
    alignments_dir: str,
    cuts_in_dir: str = "data/fbank",
    cuts_out_dir: str = "data/fbank_ali",
    dataset_parts: List[str] = DATASET_PARTS,
) -> None:
    """
    Add alignment info to existing cuts.

    For every part, the manifest `librispeech_cuts_{part}.jsonl.gz` is read
    from `cuts_in_dir`, each supervision of each cut gets an `alignment`
    attribute of the form `{"word": ...}`, and the cuts are written to a file
    with the same name in `cuts_out_dir`. A part is skipped when its input
    manifest is missing or its output manifest already exists.

    Args:
      alignments_dir:
        The dir of the alignments. The files are searched recursively under
        `alignments_dir/LibriSpeech/{part}` with the pattern
        `*.alignment.txt`.
      cuts_in_dir:
        The dir of the existing cuts.
      cuts_out_dir:
        The dir to save the new cuts with alignments. It is created if it
        does not exist.
      dataset_parts:
        Librispeech parts to add alignments.
    """
    alignments_dir = Path(alignments_dir)
    cuts_in_dir = Path(cuts_in_dir)
    cuts_out_dir = Path(cuts_out_dir)
    cuts_out_dir.mkdir(parents=True, exist_ok=True)

    for part in dataset_parts:
        logging.info(f"Processing {part}")

        cuts_in_path = cuts_in_dir / f"librispeech_cuts_{part}.jsonl.gz"
        if not cuts_in_path.is_file():
            logging.info(f"{cuts_in_path} does not exist - skipping.")
            continue
        cuts_out_path = cuts_out_dir / f"librispeech_cuts_{part}.jsonl.gz"
        if cuts_out_path.is_file():
            logging.info(f"{part} already exists - skipping.")
            continue

        # parse alignments: merge the entries of all alignment files of this
        # part into a single mapping
        alignments = {}
        part_ali_dir = alignments_dir / "LibriSpeech" / part
        for ali_path in part_ali_dir.rglob("*.alignment.txt"):
            alignments.update(parse_alignments(ali_path))
        logging.info(f"{part} has {len(alignments)} cuts with alignments.")

        # add alignment attribute and write out
        cuts_in = load_manifest_lazy(cuts_in_path)
        with CutSet.open_writer(cuts_out_path) as writer:
            for cut in cuts_in:
                for supervision in cut.supervisions:
                    # Only the part of the id before the first "_" is used
                    # as the lookup key (presumably this drops suffixes that
                    # were appended to the original utterance id - confirm).
                    origin_id = supervision.id.split("_")[0]
                    ali = alignments.get(origin_id)
                    if ali is None:
                        logging.warning(f"{origin_id} does not have alignment.")
                        # Fall back to an empty alignment so that every
                        # supervision carries the attribute.
                        ali = []
                    supervision.alignment = {"word": ali}
                writer.write(cut, flush=True)
|
|
|
|
|
|
def main():
    """
    Run the script: configure logging, parse the command-line arguments,
    download the alignments into `--alignments-dir`, and write the cuts
    with alignments to `--cuts-out-dir`.
    """
    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    logging.basicConfig(format=formatter, level=logging.INFO)

    parser = get_parser()
    args = parser.parse_args()
    logging.info(vars(args))

    download_alignments(args.alignments_dir)
    add_alignment(args.alignments_dir, args.cuts_in_dir, args.cuts_out_dir)
|
|
|
|
|
|
# Only run when the file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
|