Use the huggingface_hub library to download mls_english

This commit is contained in:
Kinan Martin 2025-05-22 09:15:12 +09:00
parent f3f04fa626
commit 1f11ba4d28
2 changed files with 64 additions and 3 deletions

View File

@ -0,0 +1,45 @@
import argparse
import os
import sys
from huggingface_hub import snapshot_download
def download_dataset(dl_dir):
    """Fetch the MLS English dataset snapshot from the Hugging Face Hub.

    The ``parler-tts/mls_eng`` dataset repository is downloaded into
    ``<dl_dir>/mls_english``. ``dl_dir`` itself is created beforehand when it
    does not exist yet.

    Args:
        dl_dir: Base directory under which ``mls_english`` is placed.

    Side effects:
        Prints progress to stdout. If the download raises, the error is
        printed to stderr and the process terminates with exit status 1.
    """
    hub_repo = 'parler-tts/mls_eng'
    target_dir = os.path.join(dl_dir, 'mls_english')

    print(f"Attempting to download '{hub_repo}' to '{target_dir}'...")

    # Only the base directory is created here; the download call is handed
    # the dataset directory path itself.
    os.makedirs(dl_dir, exist_ok=True)

    # Request real files instead of symlinks in the target directory.
    # NOTE(review): newer huggingface_hub releases may treat
    # `local_dir_use_symlinks` as deprecated -- confirm against the pinned
    # version before changing it.
    download_options = {
        "repo_id": hub_repo,
        "repo_type": "dataset",
        "local_dir": target_dir,
        "local_dir_use_symlinks": False,
    }

    try:
        snapshot_download(**download_options)
        print(f"Successfully downloaded '{hub_repo}' to '{target_dir}'")
    except Exception as err:
        # Script-level boundary: report the failure and signal it to the
        # calling shell through a non-zero exit status.
        print(f"Error downloading dataset '{hub_repo}': {err}", file=sys.stderr)
        sys.exit(1)
if __name__ == "__main__":
    # Command-line entry point: the only option is the mandatory base
    # download directory, which is passed straight to download_dataset().
    cli = argparse.ArgumentParser(description="Download MLS English dataset from Hugging Face.")
    cli.add_argument(
        "--dl-dir",
        required=True,
        type=str,
        help="The base directory where the 'mls_english' dataset will be downloaded.",
    )
    options = cli.parse_args()
    download_dataset(options.dl_dir)

View File

@ -33,11 +33,27 @@ log "Starting MLS English data preparation"
if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
log "Stage 0: Download MLS English dataset" log "Stage 0: Download MLS English dataset"
if [ ! -d $dl_dir/mls_english ]; then # Check if huggingface_hub is installed
if ! git clone git@hf.co:datasets/parler-tts/mls_eng $dl_dir/mls_english; then if ! python -c "import huggingface_hub" &> /dev/null; then
log "Failed to download MLS English dataset" log "huggingface_hub Python library not found. Installing it now..."
# Using --break-system-packages for Debian/Ubuntu environments where pip install might fail without it
python -m pip install huggingface_hub || \
python -m pip install huggingface_hub --break-system-packages || { \
log "Failed to install huggingface_hub. Please install it manually: pip install huggingface_hub"; \
exit 1; \
}
log "huggingface_hub installed successfully."
fi
# Check if the dataset already exists to avoid re-downloading
if [ ! -d "$dl_dir/mls_english" ]; then
log "Dataset not found at $dl_dir/mls_english. Starting download..."
if ! python ./local/utils/download_mls_english.py --dl-dir "$dl_dir"; then
log "Failed to download MLS English dataset via download_mls_english.py"
exit 1 exit 1
fi fi
else
log "Dataset already exists at $dl_dir/mls_english. Skipping download."
fi fi
fi fi