icefall/egs/librispeech/ASR/local/download_lm.py
Fangjun Kuang 5a0b9bcb23
Refactoring (#4)
* Fix an error in TDNN-LSTM training.

* WIP: Refactoring

* Refactor transformer.py

* Remove unused code.

* Minor fixes.
2021-08-04 14:53:02 +08:00

83 lines
2.1 KiB
Python
Executable File

#!/usr/bin/env python3
# Copyright (c) 2021 Xiaomi Corporation (authors: Fangjun Kuang)
"""
This file downloads the following LibriSpeech LM files:
- 3-gram.pruned.1e-7.arpa.gz
- 4-gram.arpa.gz
- librispeech-vocab.txt
- librispeech-lexicon.txt
from http://www.openslr.org/resources/11
and saves them in the user-provided directory.
Files are not re-downloaded if they already exist.
Usage:
./local/download_lm.py --out-dir ./download/lm
"""
import argparse
import gzip
import logging
import os
import shutil
from pathlib import Path
from lhotse.utils import urlretrieve_progress
from tqdm.auto import tqdm
def get_args():
    """Parse the command-line arguments of this script.

    Returns:
      An ``argparse.Namespace`` whose ``out_dir`` attribute holds the value
      passed via ``--out-dir``.
    """
    parser = argparse.ArgumentParser()
    # The option is mandatory: without it ``out_dir`` would be None and the
    # script would only fail later, with a far less helpful error.
    parser.add_argument(
        "--out-dir",
        type=str,
        required=True,
        help="Output directory.",
    )
    return parser.parse_args()
def main(out_dir: str):
    """Download the LibriSpeech LM files and decompress the gzipped ones.

    Every file is fetched from http://www.openslr.org/resources/11 unless a
    file with the same name already exists in ``out_dir``. Each ``*.gz``
    file is then decompressed next to the archive (same name without the
    ``.gz`` suffix) unless the decompressed file already exists.

    Args:
      out_dir:
        Directory in which the files are stored. It is created, together
        with any missing parents, if it does not exist yet.
    """
    url = "http://www.openslr.org/resources/11"
    out_dir = Path(out_dir)
    # Nothing below creates the directory, so make sure it exists before
    # anything is written into it.
    out_dir.mkdir(parents=True, exist_ok=True)

    files_to_download = (
        "3-gram.pruned.1e-7.arpa.gz",
        "4-gram.arpa.gz",
        "librispeech-vocab.txt",
        "librispeech-lexicon.txt",
    )

    for f in tqdm(files_to_download, desc="Downloading LibriSpeech LM files"):
        filename = out_dir / f
        if not filename.is_file():
            # Download under a temporary name and rename only on success, so
            # that an interrupted transfer is never mistaken for a complete
            # file by the "already exists" check on a later run.
            partial = filename.with_name(filename.name + ".part")
            urlretrieve_progress(
                f"{url}/{f}",
                filename=partial,
                desc=f"Downloading {filename}",
            )
            os.replace(partial, filename)
        else:
            logging.info(f"{filename} already exists - skipping")

        # Look at the file's own suffix only: a substring test on the whole
        # path would also match an output directory containing ".gz".
        if filename.suffix != ".gz":
            continue

        unzipped = filename.with_suffix("")
        if unzipped.is_file():
            logging.info(f"{unzipped} already exist - skipping")
            continue

        # Same write-then-rename scheme as for the download, so a partially
        # decompressed file is never left under the final name.
        partial = unzipped.with_name(unzipped.name + ".part")
        with gzip.open(filename, "rb") as f_in, open(partial, "wb") as f_out:
            shutil.copyfileobj(f_in, f_out)
        os.replace(partial, unzipped)
if __name__ == "__main__":
    # Log records carry a timestamp, the level, and the emitting file:line.
    log_format = (
        "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    )
    logging.basicConfig(level=logging.INFO, format=log_format)

    # Echo the chosen output directory before starting the downloads.
    cli_args = get_args()
    logging.info(f"out_dir: {cli_args.out_dir}")
    main(out_dir=cli_args.out_dir)