mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-08 09:32:20 +00:00
147 lines
4.4 KiB
Python
Executable File
147 lines
4.4 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang)
|
|
#
|
|
# See ../../../../LICENSE for clarification regarding multiple authors
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
|
|
"""
This file downloads the following LibriSpeech LM files:

    - 3-gram.pruned.1e-7.arpa.gz
    - 4-gram.arpa.gz
    - librispeech-vocab.txt
    - librispeech-lexicon.txt
    - librispeech-lm-norm.txt.gz

from http://www.openslr.org/resources/11
and saves them in the user-provided directory.
Each downloaded .gz file is also decompressed next to its archive.

Files are not re-downloaded if they already exist.

Usage:
    ./local/download_lm.py --out-dir ./download/lm
"""
|
|
|
|
import argparse
|
|
import gzip
|
|
import logging
|
|
import os
|
|
import shutil
|
|
from pathlib import Path
|
|
|
|
from tqdm.auto import tqdm
|
|
|
|
|
|
# This function is copied from lhotse
|
|
def tqdm_urlretrieve_hook(t):
    """Build a ``reporthook`` callback that drives the progress bar ``t``.

    The returned callable has the signature urlretrieve expects from its
    ``reporthook`` argument and forwards the progress to ``t.update()``.
    The caller remains responsible for closing ``t`` (a ``with`` block is
    the easiest way).

    Example
    -------
    >>> from urllib.request import urlretrieve
    >>> with tqdm(...) as t:
    ...     reporthook = tqdm_urlretrieve_hook(t)
    ...     urlretrieve(..., reporthook=reporthook)

    Source: https://github.com/tqdm/tqdm/blob/master/examples/tqdm_wget.py
    """
    # Block count reported by the previous call; the bar is advanced by the
    # difference between consecutive reports.
    blocks_seen = 0

    def update_to(b=1, bsize=1, tsize=None):
        """
        b : int, optional
            Number of blocks transferred so far [default: 1].
        bsize : int, optional
            Size of each block (in tqdm units) [default: 1].
        tsize : int, optional
            Total size (in tqdm units). If [default: None] or -1,
            remains unchanged.
        """
        nonlocal blocks_seen

        total_unknown = tsize in (None, -1)
        if not total_unknown:
            t.total = tsize

        new_blocks = b - blocks_seen
        displayed = t.update(new_blocks * bsize)
        blocks_seen = b
        return displayed

    return update_to
|
|
|
|
|
|
# This function is copied from lhotse
|
|
def urlretrieve_progress(url, filename=None, data=None, desc=None):
    """
    Download ``url`` the same way urllib.request.urlretrieve does, while
    showing a tqdm progress bar.

    ``url``, ``filename`` and ``data`` are forwarded unchanged to
    urlretrieve and its return value is handed back as is. ``desc`` is the
    user-readable label displayed next to the progress bar.
    """
    from urllib import request

    bar_options = dict(
        unit="B",
        unit_scale=True,
        unit_divisor=1024,
        miniters=1,
        desc=desc,
    )
    with tqdm(**bar_options) as progress:
        return request.urlretrieve(
            url=url,
            filename=filename,
            reporthook=tqdm_urlretrieve_hook(progress),
            data=data,
        )
|
|
|
|
|
|
def get_args():
    """Parse the command-line arguments of this script.

    Returns:
      The argparse namespace; its ``out_dir`` attribute holds the value
      given to ``--out-dir``.

    ``--out-dir`` is mandatory: without it ``out_dir`` would be ``None`` and
    the script would only fail later, with an unhelpful ``TypeError``, when
    the value is turned into a path. Making the option required lets
    argparse print a usage message and exit instead.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--out-dir",
        type=str,
        required=True,
        help="Output directory.",
    )

    return parser.parse_args()
|
|
|
|
|
|
def main(out_dir: str):
    """Download the LibriSpeech LM files into ``out_dir`` and unpack the
    gzipped ones.

    Args:
      out_dir:
        Directory that receives the downloaded files. It is created if it
        does not exist yet.

    A file that is already present is neither downloaded nor decompressed
    again; a message is logged instead.
    """
    url = "http://www.openslr.org/resources/11"
    out_dir = Path(out_dir)
    # urlretrieve opens the destination for writing and fails when the
    # directory is missing, so make sure it exists first.
    out_dir.mkdir(parents=True, exist_ok=True)

    files_to_download = (
        "3-gram.pruned.1e-7.arpa.gz",
        "4-gram.arpa.gz",
        "librispeech-vocab.txt",
        "librispeech-lexicon.txt",
        "librispeech-lm-norm.txt.gz",
    )

    for f in tqdm(files_to_download, desc="Downloading LibriSpeech LM files"):
        filename = out_dir / f
        if not filename.is_file():
            # Download under a temporary name and rename on success, so an
            # interrupted transfer is never mistaken for a complete file by
            # the "already exists" check of a later run.
            partial = filename.with_name(filename.name + ".part")
            urlretrieve_progress(
                f"{url}/{f}",
                filename=partial,
                desc=f"Downloading {filename}",
            )
            os.replace(partial, filename)
        else:
            logging.info(f"{filename} already exists - skipping")

        # Look at the file's own suffix only: a substring test on the whole
        # path would also fire for plain files when the output directory
        # name happens to contain ".gz".
        if filename.suffix == ".gz":
            unzipped = filename.with_suffix("")
            if not unzipped.is_file():
                # Same temporary-name approach as for the download above.
                partial = unzipped.with_name(unzipped.name + ".part")
                with gzip.open(filename, "rb") as f_in:
                    with open(partial, "wb") as f_out:
                        shutil.copyfileobj(f_in, f_out)
                os.replace(partial, unzipped)
            else:
                logging.info(f"{unzipped} already exist - skipping")
|
|
|
|
|
|
if __name__ == "__main__":
    # Log records carry a timestamp, the level and the emitting file/line.
    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    logging.basicConfig(level=logging.INFO, format=formatter)

    # Parse the command line, report the chosen directory, then run.
    args = get_args()
    logging.info(f"out_dir: {args.out_dir}")
    main(out_dir=args.out_dir)
|