# icefall/egs/librispeech/ASR/local/add_alignment_librispeech.py

#!/usr/bin/env python3
# Copyright    2022  Xiaomi Corp.        (authors: Zengwei Yao)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


"""
This file adds alignments from https://github.com/CorentinJ/librispeech-alignments  # noqa
to the cuts in the existing fbank features dir (e.g., data/fbank)
and saves the resulting cuts to a new dir (e.g., data/fbank_ali).
"""

import argparse
import logging
import zipfile
from pathlib import Path
from typing import List

from lhotse import CutSet, load_manifest_lazy
from lhotse.recipes.librispeech import parse_alignments
from lhotse.utils import is_module_available

# Default download location (Google Drive) of the alignments archive.
LIBRISPEECH_ALIGNMENTS_URL = (
    "https://drive.google.com/uc"
    "?id=1WYfgr31T-PPwMcxuAq09XZfHQO5Mw8fE"
)

# LibriSpeech parts that are processed by default.
DATASET_PARTS = [
    # dev / test parts
    *("dev-clean", "dev-other", "test-clean", "test-other"),
    # train parts
    *("train-clean-100", "train-clean-360", "train-other-500"),
]


def get_parser():
    """
    Build the command-line parser of this script.

    Returns:
      An `argparse.ArgumentParser` with the string options
      `--alignments-dir`, `--cuts-in-dir` and `--cuts-out-dir`.
      Default values are appended to the help texts by the formatter.
    """
    # (option flag, default value, help text), in registration order.
    dir_options = (
        (
            "--alignments-dir",
            "data/alignment",
            "The dir to save alignments.",
        ),
        (
            "--cuts-in-dir",
            "data/fbank",
            "The dir of the existing cuts without alignments.",
        ),
        (
            "--cuts-out-dir",
            "data/fbank_ali",
            "The dir to save the new cuts with alignments",
        ),
    )

    arg_parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    for flag, default_dir, description in dir_options:
        arg_parser.add_argument(
            flag, type=str, default=default_dir, help=description
        )
    return arg_parser


def download_alignments(
    target_dir: str, alignments_url: str = LIBRISPEECH_ALIGNMENTS_URL
):
    """
    Download and extract the alignments.

    Modified from https://github.com/lhotse-speech/lhotse/blob/master/lhotse/recipes/librispeech.py  # noqa

    Nothing is done when the marker file `.ali_completed` already exists in
    `target_dir`. The download step alone is skipped when the file
    `LibriSpeech-Alignments.zip` is already present in `target_dir`.

    Note: If you can not access drive.google.com, you could download the file
    `LibriSpeech-Alignments.zip` from huggingface:
    https://huggingface.co/Zengwei/librispeech-alignments
    and extract the zip file manually.

    Args:
      target_dir:
        The dir to save alignments.
      alignments_url:
        The URL of alignments.

    Raises:
      ImportError:
        If the zip file has to be downloaded but `gdown` is not installed.
      FileNotFoundError:
        If the zip file is still missing after the download attempt.
    """
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    completed_detector = target_dir / ".ali_completed"
    if completed_detector.is_file():
        logging.info("The alignment files already exist.")
        return

    ali_zip_path = target_dir / "LibriSpeech-Alignments.zip"
    if not ali_zip_path.is_file():
        # Raise explicitly rather than assert: asserts are removed when
        # Python runs with -O.
        if not is_module_available("gdown"):
            raise ImportError(
                'To download LibriSpeech alignments, please install "pip install gdown"'  # noqa
            )
        import gdown

        gdown.download(alignments_url, output=str(ali_zip_path))
        # NOTE(review): depending on its version, gdown may return without
        # raising when the download fails, so check the result here instead
        # of failing later with a bare error from zipfile.
        if not ali_zip_path.is_file():
            raise FileNotFoundError(
                f"Failed to download {alignments_url} to {ali_zip_path}. "
                "You could download LibriSpeech-Alignments.zip manually from "
                "https://huggingface.co/Zengwei/librispeech-alignments"
            )

    with zipfile.ZipFile(str(ali_zip_path)) as f:
        f.extractall(path=target_dir)
    # Reached only if the extraction did not raise.
    completed_detector.touch()


def add_alignment(
    alignments_dir: str,
    cuts_in_dir: str = "data/fbank",
    cuts_out_dir: str = "data/fbank_ali",
    dataset_parts: List[str] = DATASET_PARTS,
):
    """
    Add alignment info to existing cuts.

    For every part, the cuts are read from
    `cuts_in_dir/librispeech_cuts_{part}.jsonl.gz` and written to the file
    of the same name in `cuts_out_dir`. A part is skipped when its input file
    does not exist or its output file already exists. The output is first
    written to a temporary file in `cuts_out_dir` and renamed only after all
    cuts were written, so an interrupted run does not leave a truncated file
    that a later run would skip as already done.

    Each supervision gets `alignment = {"word": ali}`, where `ali` is looked
    up by the supervision id truncated at its first "_". Supervisions without
    an alignment get an empty list and a warning is logged.

    Args:
      alignments_dir:
        The dir of the alignments.
      cuts_in_dir:
        The dir of the existing cuts.
      cuts_out_dir:
        The dir to save the new cuts with alignments.
      dataset_parts:
        Librispeech parts to add alignments.
    """
    alignments_dir = Path(alignments_dir)
    cuts_in_dir = Path(cuts_in_dir)
    cuts_out_dir = Path(cuts_out_dir)
    cuts_out_dir.mkdir(parents=True, exist_ok=True)

    for part in dataset_parts:
        logging.info(f"Processing {part}")

        cuts_in_path = cuts_in_dir / f"librispeech_cuts_{part}.jsonl.gz"
        if not cuts_in_path.is_file():
            logging.info(f"{cuts_in_path} does not exist - skipping.")
            continue
        cuts_out_path = cuts_out_dir / f"librispeech_cuts_{part}.jsonl.gz"
        if cuts_out_path.is_file():
            logging.info(f"{part} already exists - skipping.")
            continue

        # parse alignments
        alignments = {}
        part_ali_dir = alignments_dir / "LibriSpeech" / part
        for ali_path in part_ali_dir.rglob("*.alignment.txt"):
            alignments.update(parse_alignments(ali_path))
        logging.info(f"{part} has {len(alignments)} cuts with alignments.")

        # The temporary name keeps the ".jsonl.gz" suffix (presumably the
        # writer picks the output format from it - confirm against lhotse)
        # and does not start with "librispeech_cuts_".
        cuts_tmp_path = cuts_out_dir / f".tmp_librispeech_cuts_{part}.jsonl.gz"

        # add alignment attribute and write out
        cuts_in = load_manifest_lazy(cuts_in_path)
        with CutSet.open_writer(cuts_tmp_path) as writer:
            for cut in cuts_in:
                for supervision in cut.supervisions:
                    origin_id = supervision.id.split("_")[0]
                    ali = alignments.get(origin_id)
                    if ali is None:
                        logging.warning(f"{origin_id} does not have alignment.")
                        ali = []
                    supervision.alignment = {"word": ali}
                writer.write(cut, flush=True)

        # Publish the finished file under its final name.
        cuts_tmp_path.replace(cuts_out_path)


def main():
    """
    Script entry point.

    Configures logging at INFO level, parses the command-line options,
    logs them, then downloads the alignments and adds them to the cuts.
    """
    logging.basicConfig(
        format="%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s",
        level=logging.INFO,
    )

    options = get_parser().parse_args()
    logging.info(vars(options))

    ali_dir = options.alignments_dir
    download_alignments(ali_dir)
    add_alignment(ali_dir, options.cuts_in_dir, options.cuts_out_dir)


# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()