mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-09 18:12:19 +00:00
51 lines
1.4 KiB
Python
Executable File
51 lines
1.4 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
|
|
# Copyright (c) 2021 Xiaomi Corporation (authors: Fangjun Kuang)
|
|
"""
|
|
This file downloads librispeech LM files to data/lm
|
|
"""
|
|
|
|
import gzip
|
|
import os
|
|
import shutil
|
|
from pathlib import Path
|
|
|
|
from lhotse.utils import urlretrieve_progress
|
|
from tqdm.auto import tqdm
|
|
|
|
|
|
def _remove_if_present(path: Path) -> None:
    """Delete ``path`` when it exists; do nothing otherwise."""
    if path.exists():
        path.unlink()


def _fetch(source_url: str, destination: Path) -> None:
    """Download ``source_url`` to ``destination`` via a temporary ``.part`` file."""
    partial = destination.with_name(destination.name + ".part")
    try:
        urlretrieve_progress(
            source_url,
            filename=partial,
            desc=f"Downloading {destination}",
        )
        # Rename only after the transfer finished, so an interrupted run never
        # leaves a truncated file that a later run would mistake for complete.
        partial.replace(destination)
    finally:
        _remove_if_present(partial)


def _gunzip(archive: Path, output: Path) -> None:
    """Decompress ``archive`` into ``output`` via a temporary ``.part`` file."""
    partial = output.with_name(output.name + ".part")
    try:
        with gzip.open(archive, "rb") as f_in, open(partial, "wb") as f_out:
            shutil.copyfileobj(f_in, f_out)
        partial.replace(output)
    finally:
        _remove_if_present(partial)


def download_lm():
    """Download the LibriSpeech LM files into ``data/lm`` and unpack the archives.

    Each file listed below is fetched from the OpenSLR resource directory
    unless it is already present. Files ending in ``.gz`` are additionally
    decompressed next to the archive (same name without the ``.gz`` suffix)
    unless the decompressed file already exists.

    The target directory is created when missing. Downloads and decompression
    go through a temporary ``.part`` file that is renamed on success, so an
    interrupted run does not leave a partial file that would be skipped later.
    """
    url = "http://www.openslr.org/resources/11"
    target_dir = Path("data/lm")
    # The download needs an existing directory to write into.
    target_dir.mkdir(parents=True, exist_ok=True)

    files_to_download = (
        "3-gram.pruned.1e-7.arpa.gz",
        "4-gram.arpa.gz",
        "librispeech-vocab.txt",
        "librispeech-lexicon.txt",
    )

    for f in tqdm(files_to_download, desc="Downloading LibriSpeech LM files"):
        filename = target_dir / f
        if filename.is_file():
            print(f"{filename} already exists - skipping")
        else:
            _fetch(f"{url}/{f}", filename)

        # Check the real suffix; a substring test on the whole path would also
        # match ".gz" appearing anywhere else in it.
        if filename.suffix != ".gz":
            continue

        unzip_file = filename.with_suffix("")
        if unzip_file.is_file():
            print(f"{unzip_file} already exist - skipping")
        else:
            _gunzip(filename, unzip_file)
|
|
|
|
|
|
# Script entry point: fetch the LM files only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    download_lm()
|