diff --git a/egs/librispeech/ASR/local/download_data.py b/egs/librispeech/ASR/local/download_data.py deleted file mode 100755 index b9e6232fe..000000000 --- a/egs/librispeech/ASR/local/download_data.py +++ /dev/null @@ -1,21 +0,0 @@ -#!/usr/bin/env python3 - -""" -This file downloads the librispeech dataset -to the directory data/LibriSpeech. - -It's compatible with kaldi's egs/librispeech/s5/local/download_and_untar.sh . -""" - - -from lhotse.recipes import download_librispeech - - -def download_data(): - target_dir = "data" - - download_librispeech(target_dir=target_dir, dataset_parts="librispeech") - - -if __name__ == "__main__": - download_data() diff --git a/egs/librispeech/ASR/local/prepare_librispeech_manifest.py b/egs/librispeech/ASR/local/prepare_librispeech_manifest.py deleted file mode 100755 index 357f6e6ea..000000000 --- a/egs/librispeech/ASR/local/prepare_librispeech_manifest.py +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env python3 - -""" -This file generates manifests for the librispeech dataset. -It expects the dataset is saved in data/LibriSpeech -and the generated manifests are saved in data/manifests. -""" - -import os -from pathlib import Path - -from lhotse.recipes import prepare_librispeech - - -def prepare_librispeech_mainfest(): - corpus_dir = Path("data/LibriSpeech") - output_dir = Path("data/manifests") - num_jobs = min(15, os.cpu_count()) - - librispeech_manifests = prepare_librispeech( - corpus_dir=corpus_dir, - dataset_parts="auto", - output_dir=output_dir, - num_jobs=num_jobs, - ) - - -if __name__ == "__main__": - prepare_librispeech_mainfest() diff --git a/egs/librispeech/ASR/local/prepare_musan_manifest.py b/egs/librispeech/ASR/local/prepare_musan_manifest.py deleted file mode 100755 index 43b983979..000000000 --- a/egs/librispeech/ASR/local/prepare_musan_manifest.py +++ /dev/null @@ -1,22 +0,0 @@ -#!/usr/bin/env python3 - -""" -This file generates manifests for the musan dataset. -It expects the dataset is saved in data/musan -and the generated manifests are saved in data/manifests. -""" - -from pathlib import Path - -from lhotse.recipes import prepare_musan - - -def prepare_musan_mainfest(): - corpus_dir = Path("data/musan") - output_dir = Path("data/manifests") - - prepare_musan(corpus_dir=corpus_dir, output_dir=output_dir) - - -if __name__ == "__main__": - prepare_musan_mainfest() diff --git a/egs/librispeech/ASR/prepare.sh b/egs/librispeech/ASR/prepare.sh index 89ec47673..1602b9203 100755 --- a/egs/librispeech/ASR/prepare.sh +++ b/egs/librispeech/ASR/prepare.sh @@ -2,6 +2,7 @@ set -eou pipefail +nj=15 stage=-1 stop_stage=100 @@ -28,7 +29,7 @@ if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then if [ ! -f data/LibriSpeech/train-other-500/.completed ]; then # It's compatible with kaldi's egs/librispeech/s5/local/download_and_untar.sh - ./local/download_data.py + lhotse download librispeech --full data fi # If you have pre-downloaded it to /path/to/musan, @@ -36,8 +37,8 @@ if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then # # ln -s /path/to/musan data/ # - if [ ! -e data/musan ]; then - wget https://www.openslr.org/resources/17/musan.tar.gz + if [ ! -f data/musan/.musan_completed ]; then + lhotse download musan data fi fi @@ -46,7 +47,7 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then # We assume that you have downloaded the librispeech corpus # to data/LibriSpeech mkdir -p data/manifests - ./local/prepare_librispeech_manifest.py + lhotse prepare librispeech -j $nj data/LibriSpeech data/manifests fi if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then @@ -54,7 +55,7 @@ if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then # We assume that you have downloaded the musan corpus # to data/musan mkdir -p data/manifests - ./local/prepare_musan_manifest.py + lhotse prepare musan data/musan data/manifests fi if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then