From 40eed7446080ef1471484f2fb0450467dd9332ac Mon Sep 17 00:00:00 2001
From: Fangjun Kuang
Date: Thu, 15 Jul 2021 21:09:14 +0800
Subject: [PATCH] Download LM for LibriSpeech.

---
 .gitignore                                 |  1 +
 egs/librispeech/ASR/README.md              |  4 +
 egs/librispeech/ASR/local/download_lm.py   | 43 ++++++++++
 egs/librispeech/ASR/local/parse_options.sh | 97 ++++++++++++++++++++++
 egs/librispeech/ASR/prepare.sh             | 26 ++++++
 5 files changed, 171 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 egs/librispeech/ASR/README.md
 create mode 100755 egs/librispeech/ASR/local/download_lm.py
 create mode 100755 egs/librispeech/ASR/local/parse_options.sh
 create mode 100755 egs/librispeech/ASR/prepare.sh

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 000000000..1269488f7
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+data
diff --git a/egs/librispeech/ASR/README.md b/egs/librispeech/ASR/README.md
new file mode 100644
index 000000000..71c333aaf
--- /dev/null
+++ b/egs/librispeech/ASR/README.md
@@ -0,0 +1,4 @@
+
+Run `./prepare.sh` to prepare the data.
+
+Run `./xxx_train.py` (to be added) to train a model.
diff --git a/egs/librispeech/ASR/local/download_lm.py b/egs/librispeech/ASR/local/download_lm.py
new file mode 100755
index 000000000..fd6713ce8
--- /dev/null
+++ b/egs/librispeech/ASR/local/download_lm.py
@@ -0,0 +1,66 @@
+#!/usr/bin/env python3
+
+# Copyright (c) 2021 Xiaomi Corporation (authors: Fangjun Kuang)
+
+import gzip
+import os
+import shutil
+from pathlib import Path
+
+from lhotse.utils import urlretrieve_progress
+from tqdm.auto import tqdm
+
+
+def download_lm(target_dir="data/lm"):
+    """Download the LibriSpeech LM files and unpack the gzipped ones.
+
+    Files that already exist in ``target_dir`` are neither downloaded nor
+    decompressed again, so the function can be re-run safely.
+
+    Args:
+      target_dir:
+        Directory that receives the files; created if it does not exist.
+    """
+    url = "http://www.openslr.org/resources/11"
+    target_dir = Path(target_dir)
+    # Make the script usable on its own, not only after `mkdir -p data/lm`.
+    target_dir.mkdir(parents=True, exist_ok=True)
+
+    files_to_download = (
+        "3-gram.pruned.1e-7.arpa.gz",
+        "4-gram.arpa.gz",
+        "librispeech-vocab.txt",
+        "librispeech-lexicon.txt",
+    )
+
+    for f in tqdm(files_to_download, desc="Downloading LibriSpeech LM files"):
+        filename = target_dir / f
+        if not filename.is_file():
+            # Fetch under a temporary name and rename when done, so that an
+            # interrupted download is not mistaken for a complete file.
+            partial = filename.with_name(filename.name + ".part")
+            urlretrieve_progress(
+                f"{url}/{f}",
+                filename=partial,
+                desc=f"Downloading {filename}",
+            )
+            os.replace(partial, filename)
+
+        # Test the real extension: a substring match would also fire on a
+        # ".gz" that appears elsewhere in the path.
+        if filename.suffix != ".gz":
+            continue
+
+        unzip_file = filename.with_suffix("")
+        if unzip_file.is_file():
+            continue
+
+        # Same temporary-name approach for the decompressed output.
+        partial = unzip_file.with_name(unzip_file.name + ".part")
+        with gzip.open(filename, "rb") as f_in, open(partial, "wb") as f_out:
+            shutil.copyfileobj(f_in, f_out)
+        os.replace(partial, unzip_file)
+
+
+if __name__ == "__main__":
+    download_lm()
diff --git a/egs/librispeech/ASR/local/parse_options.sh b/egs/librispeech/ASR/local/parse_options.sh
new file mode 100755
index 000000000..71fb9e5ea
--- /dev/null
+++ b/egs/librispeech/ASR/local/parse_options.sh
@@ -0,0 +1,97 @@
+#!/usr/bin/env bash
+
+# Copyright 2012 Johns Hopkins University (Author: Daniel Povey);
+# Arnab Ghoshal, Karel Vesely
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Parse command-line options.
+# To be sourced by another script (as in ". parse_options.sh").
+# Option format is: --option-name arg
+# and shell variable "option_name" gets set to value "arg."
+# The exception is --help, which takes no arguments, but prints the
+# $help_message variable (if defined).
+
+
+###
+### The --config file options have lower priority to command line
+### options, so we need to import them first...
+###
+
+# Now import all the configs specified by command-line, in left-to-right order
+for ((argpos=1; argpos<$#; argpos++)); do
+  if [ "${!argpos}" == "--config" ]; then
+    argpos_plus1=$((argpos+1))
+    config=${!argpos_plus1}
+    [ ! -r "$config" ] && echo "$0: missing config '$config'" && exit 1
+    . "$config" # source the config file.
+  fi
+done
+
+
+###
+### Now we process the command line options
+###
+while true; do
+  [ -z "${1:-}" ] && break; # break if there are no arguments
+  case "$1" in
+    # On --help, print $help_message (set by the enclosing script) and exit.
+    # ${help_message:-} keeps this working when the caller uses `set -u`.
+    --help|-h) if [ -z "${help_message:-}" ]; then echo "No help found." 1>&2;
+      else printf "$help_message\n" 1>&2 ; fi;
+      exit 0 ;;
+    --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'"
+      exit 1 ;;
+    # If the first command-line argument begins with "--" (e.g. --foo-bar),
+    # then work out the variable name as $name, which will equal "foo_bar".
+    --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`;
+      # Next we test whether the variable in question is undefined-- if so it's
+      # an invalid option and we die. Note: $0 evaluates to the name of the
+      # enclosing script.
+      # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar
+      # is undefined. We then have to wrap this test inside "eval" because
+      # foo_bar is itself inside a variable ($name).
+      eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;
+
+      oldval="`eval echo \\$$name`";
+      # Work out whether we seem to be expecting a Boolean argument.
+      if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then
+        was_bool=true;
+      else
+        was_bool=false;
+      fi
+
+      # Set the variable to the right value-- the escaped quotes make it work if
+      # the option had spaces, like --cmd "queue.pl -sync y"
+      eval $name=\"$2\";
+
+      # Check that Boolean-valued arguments are really Boolean.
+      if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
+        echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
+        exit 1;
+      fi
+      shift 2;
+      ;;
+    *) break;
+  esac
+done
+
+
+# Check for an empty argument to the --cmd option, which can easily occur as a
+# result of scripting errors.
+[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1;
+
+
+true; # so this script returns exit code 0.
diff --git a/egs/librispeech/ASR/prepare.sh b/egs/librispeech/ASR/prepare.sh
new file mode 100755
index 000000000..861ded0ac
--- /dev/null
+++ b/egs/librispeech/ASR/prepare.sh
@@ -0,0 +1,28 @@
+#!/usr/bin/env bash
+
+
+set -eou pipefail
+
+# Only stages N with stage <= N <= stop_stage are run. Both values can be
+# overridden on the command line: --stage N --stop-stage M.
+stage=-1
+stop_stage=100
+
+. local/parse_options.sh || exit 1
+
+mkdir -p data
+
+if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
+  echo "stage -1: Download LM"
+  mkdir -p data/lm
+  ./local/download_lm.py
+fi
+
+if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
+  echo "stage 0: Download data"
+
+  # If you have pre-downloaded it in /path/to/LibriSpeech
+  # Just run: ln -sfv /path/to/LibriSpeech data/
+  mkdir -p data/LibriSpeech
+  # TODO
+fi