diff --git a/egs/mls_english/ASR/local/utils/download_mls_english.py b/egs/mls_english/ASR/local/utils/download_mls_english.py
new file mode 100644
index 000000000..82cddb08f
--- /dev/null
+++ b/egs/mls_english/ASR/local/utils/download_mls_english.py
@@ -0,0 +1,64 @@
+"""Download the MLS English dataset from the Hugging Face Hub."""
+
+import argparse
+import os
+import sys
+
+from huggingface_hub import snapshot_download
+
+
+def download_dataset(dl_dir):
+    """Download the MLS English dataset to `$dl_dir/mls_english`.
+
+    Args:
+      dl_dir: Base download directory; created if it does not exist.
+
+    If the download fails, the error is printed to stderr and the process
+    exits with status 1.
+    """
+    repo_id = 'parler-tts/mls_eng'
+    local_dataset_dir = os.path.join(dl_dir, 'mls_english')
+
+    print(f"Attempting to download '{repo_id}' to '{local_dataset_dir}'...")
+
+    # Ensure the parent directory exists
+    os.makedirs(dl_dir, exist_ok=True)
+
+    try:
+        # snapshot_download handles LFS and large files robustly.
+        # local_dir_use_symlinks=False is generally safer for datasets,
+        # especially on network file systems or if you intend to move the
+        # data. NOTE(review): recent huggingface_hub releases deprecate it.
+        snapshot_download(
+            repo_id=repo_id,
+            repo_type="dataset",
+            local_dir=local_dataset_dir,
+            local_dir_use_symlinks=False,
+        )
+    except Exception as e:
+        # Top-level boundary of a command-line helper: report the error and
+        # signal failure to the caller through the exit status.
+        print(f"Error downloading dataset '{repo_id}': {e}", file=sys.stderr)
+        sys.exit(1)
+    else:
+        print(f"Successfully downloaded '{repo_id}' to '{local_dataset_dir}'")
+
+
+def main():
+    """Parse the command line and download the dataset into `--dl-dir`."""
+    parser = argparse.ArgumentParser(
+        description="Download MLS English dataset from Hugging Face."
+    )
+    parser.add_argument(
+        "--dl-dir",
+        type=str,
+        required=True,
+        help="The base directory where the 'mls_english' dataset will be downloaded.",
+    )
+    args = parser.parse_args()
+
+    download_dataset(args.dl_dir)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/egs/mls_english/ASR/prepare.sh b/egs/mls_english/ASR/prepare.sh
index 91dbedda5..9cfd314b2 100755
--- a/egs/mls_english/ASR/prepare.sh
+++ b/egs/mls_english/ASR/prepare.sh
@@ -33,11 +33,31 @@ log "Starting MLS English data preparation"
 
 if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
   log "Stage 0: Download MLS English dataset"
-  if [ ! -d $dl_dir/mls_english ]; then
-    if ! git clone git@hf.co:datasets/parler-tts/mls_eng $dl_dir/mls_english; then
-      log "Failed to download MLS English dataset"
+  # Check if huggingface_hub is installed
+  if ! python -c "import huggingface_hub" > /dev/null 2>&1; then
+    log "huggingface_hub Python library not found. Installing it now..."
+    # Fall back to --break-system-packages for Debian/Ubuntu environments
+    # where a plain pip install might fail without it
+    if ! python -m pip install huggingface_hub &&
+       ! python -m pip install huggingface_hub --break-system-packages; then
+      log "Failed to install huggingface_hub. Please install it manually: pip install huggingface_hub"
+      exit 1
+    fi
+    log "huggingface_hub installed successfully."
+  fi
+
+  # Use a marker file, not the directory itself, to decide whether the
+  # download finished: an interrupted download leaves a partial directory
+  # behind, and it must be re-run rather than skipped.
+  if [ ! -e "$dl_dir/.mls_english.done" ]; then
+    log "Download not marked complete for $dl_dir/mls_english. Starting download..."
+    if ! python ./local/utils/download_mls_english.py --dl-dir "$dl_dir"; then
+      log "Failed to download MLS English dataset via download_mls_english.py"
       exit 1
     fi
+    touch "$dl_dir/.mls_english.done"
+  else
+    log "Dataset already exists at $dl_dir/mls_english. Skipping download."
   fi
 fi
 