# icefall/egs/librispeech/ASR/local/download_lm.py

#!/usr/bin/env python3
# Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


"""
This file downloads the following LibriSpeech LM files:

    - 3-gram.pruned.1e-7.arpa.gz
    - 4-gram.arpa.gz
    - librispeech-vocab.txt
    - librispeech-lexicon.txt
    - librispeech-lm-norm.txt.gz

from http://www.openslr.org/resources/11
and saves them in the user-provided directory.

Files are not re-downloaded if they already exist.

Usage:
    ./local/download_lm.py --out-dir ./download/lm
"""

import argparse
import gzip
import logging
import os
import shutil
from pathlib import Path

from tqdm.auto import tqdm


# This function is copied from lhotse
def tqdm_urlretrieve_hook(t):
    """Build a ``reporthook`` callback that forwards progress to a tqdm bar.

    The caller owns the tqdm instance and must close() it (or leave its
    ``with`` block) once the transfer is finished.

    Example
    -------
    >>> from urllib.request import urlretrieve
    >>> with tqdm(...) as t:
    ...     reporthook = tqdm_urlretrieve_hook(t)
    ...     urlretrieve(..., reporthook=reporthook)

    Source: https://github.com/tqdm/tqdm/blob/master/examples/tqdm_wget.py
    """
    # Block count reported by the previous callback invocation.
    previous_blocks = 0

    def update_to(b=1, bsize=1, tsize=None):
        """
        b  : int, optional
            Number of blocks transferred so far [default: 1].
        bsize  : int, optional
            Size of each block (in tqdm units) [default: 1].
        tsize  : int, optional
            Total size (in tqdm units). If [default: None] or -1,
            remains unchanged.
        """
        nonlocal previous_blocks

        total_is_known = tsize is not None and tsize != -1
        if total_is_known:
            t.total = tsize

        # The hook receives a running block count, whereas the bar wants an
        # increment, so report only the blocks added since the last call.
        increment = (b - previous_blocks) * bsize
        outcome = t.update(increment)
        previous_blocks = b
        return outcome

    return update_to


# This function is copied from lhotse
def urlretrieve_progress(url, filename=None, data=None, desc=None):
    """
    Drop-in variant of urllib.request.urlretrieve that shows a tqdm progress
    bar while the download is running.
    The "desc" argument is the user-readable label displayed next to the bar;
    every other argument is passed to urlretrieve unchanged, and its return
    value is returned as is.
    """
    import urllib.request

    bar_options = dict(
        unit="B",
        unit_scale=True,
        unit_divisor=1024,
        miniters=1,
        desc=desc,
    )
    with tqdm(**bar_options) as progress_bar:
        result = urllib.request.urlretrieve(
            url=url,
            filename=filename,
            reporthook=tqdm_urlretrieve_hook(progress_bar),
            data=data,
        )
        return result


def get_args():
    """Parse the command-line arguments of this script.

    Returns:
      An ``argparse.Namespace`` whose ``out_dir`` attribute holds the value
      passed via ``--out-dir``.
    """
    parser = argparse.ArgumentParser()
    # The option has no usable default: without it ``out_dir`` would be None
    # and the script would only fail later with an unhelpful TypeError, so let
    # argparse reject the invocation with a proper usage message instead.
    parser.add_argument(
        "--out-dir",
        type=str,
        required=True,
        help="Output directory.",
    )

    args = parser.parse_args()
    return args


def main(out_dir: str):
    """Download the LibriSpeech LM files and decompress the gzipped ones.

    Every file is fetched from http://www.openslr.org/resources/11 unless it
    is already present in ``out_dir``. Each ``.gz`` file is additionally
    decompressed next to the archive unless the decompressed file is already
    present.

    Args:
      out_dir:
        Directory that receives the files. It is created if it does not
        exist yet.
    """
    url = "http://www.openslr.org/resources/11"
    out_dir = Path(out_dir)
    # urlretrieve() does not create missing parent directories.
    out_dir.mkdir(parents=True, exist_ok=True)

    files_to_download = (
        "3-gram.pruned.1e-7.arpa.gz",
        "4-gram.arpa.gz",
        "librispeech-vocab.txt",
        "librispeech-lexicon.txt",
        "librispeech-lm-norm.txt.gz",
    )

    for f in tqdm(files_to_download, desc="Downloading LibriSpeech LM files"):
        filename = out_dir / f
        if filename.is_file():
            logging.info(f"{filename} already exists - skipping")
        else:
            # Download under a temporary name and rename only on success, so
            # that an interrupted transfer is never mistaken for a complete
            # file by the existence check on the next run.
            partial = filename.with_name(filename.name + ".part")
            urlretrieve_progress(
                f"{url}/{f}",
                filename=partial,
                desc=f"Downloading {filename}",
            )
            partial.replace(filename)

        # Look at the suffix of the file itself: a substring test on the whole
        # path would also fire for a ".gz" inside a directory name.
        if filename.suffix != ".gz":
            continue

        unzipped = filename.with_suffix("")
        if unzipped.is_file():
            logging.info(f"{unzipped} already exist - skipping")
            continue

        # Same temporary-name scheme as for the download, so a half-written
        # decompressed file is not skipped as "already existing" later.
        partial = unzipped.with_name(unzipped.name + ".part")
        with gzip.open(filename, "rb") as f_in, open(partial, "wb") as f_out:
            shutil.copyfileobj(f_in, f_out)
        partial.replace(unzipped)


if __name__ == "__main__":
    # Configure logging first so that every message below, including the ones
    # emitted from main(), carries a timestamp and its source location.
    logging.basicConfig(
        format="%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s",
        level=logging.INFO,
    )

    cli_args = get_args()
    target_dir = cli_args.out_dir
    logging.info(f"out_dir: {target_dir}")

    main(out_dir=target_dir)