mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-09 18:12:19 +00:00
use huggingface_hub library to download mls_english
This commit is contained in:
parent
f3f04fa626
commit
1f11ba4d28
45
egs/mls_english/ASR/local/utils/download_mls_english.py
Normal file
45
egs/mls_english/ASR/local/utils/download_mls_english.py
Normal file
@ -0,0 +1,45 @@
|
|||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
from huggingface_hub import snapshot_download
|
||||||
|
|
||||||
|
def download_dataset(dl_dir):
    """Fetch the MLS English dataset from the Hugging Face Hub.

    The ``parler-tts/mls_eng`` dataset repository is downloaded into
    ``<dl_dir>/mls_english``. ``dl_dir`` itself is created first when it
    does not exist yet.

    Args:
      dl_dir: Base directory under which the ``mls_english`` folder is placed.

    If the download raises, the error is printed to stderr and the process
    exits with status 1.
    """
    hub_repo = "parler-tts/mls_eng"
    target_dir = os.path.join(dl_dir, "mls_english")

    print(f"Attempting to download '{hub_repo}' to '{target_dir}'...")

    # The base directory must exist before the download starts.
    os.makedirs(dl_dir, exist_ok=True)

    # Symlinks are disabled on purpose: plain files are the safer choice on
    # network file systems and when the data may later be moved elsewhere.
    download_options = {
        "repo_id": hub_repo,
        "repo_type": "dataset",
        "local_dir": target_dir,
        "local_dir_use_symlinks": False,
    }

    try:
        snapshot_download(**download_options)
        print(f"Successfully downloaded '{hub_repo}' to '{target_dir}'")
    except Exception as err:
        # Script boundary: report the failure and signal it through the exit code.
        print(f"Error downloading dataset '{hub_repo}': {err}", file=sys.stderr)
        sys.exit(1)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Command-line entry point: the only option is the required base
    # download directory, which is passed straight to download_dataset().
    dl_dir_help = (
        "The base directory where the 'mls_english' dataset will be downloaded."
    )

    cli = argparse.ArgumentParser(
        description="Download MLS English dataset from Hugging Face."
    )
    cli.add_argument("--dl-dir", type=str, required=True, help=dl_dir_help)
    options = cli.parse_args()

    download_dataset(options.dl_dir)
|
@ -33,11 +33,27 @@ log "Starting MLS English data preparation"
|
|||||||
|
|
||||||
if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
|
if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
|
||||||
log "Stage 0: Download MLS English dataset"
|
log "Stage 0: Download MLS English dataset"
|
||||||
if [ ! -d $dl_dir/mls_english ]; then
|
# Check if huggingface_hub is installed
|
||||||
if ! git clone git@hf.co:datasets/parler-tts/mls_eng $dl_dir/mls_english; then
|
if ! python -c "import huggingface_hub" &> /dev/null; then
|
||||||
log "Failed to download MLS English dataset"
|
log "huggingface_hub Python library not found. Installing it now..."
|
||||||
|
# Using --break-system-packages for Debian/Ubuntu environments where pip install might fail without it
|
||||||
|
python -m pip install huggingface_hub || \
|
||||||
|
python -m pip install huggingface_hub --break-system-packages || { \
|
||||||
|
log "Failed to install huggingface_hub. Please install it manually: pip install huggingface_hub"; \
|
||||||
|
exit 1; \
|
||||||
|
}
|
||||||
|
log "huggingface_hub installed successfully."
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Check if the dataset already exists to avoid re-downloading
|
||||||
|
if [ ! -d "$dl_dir/mls_english" ]; then
|
||||||
|
log "Dataset not found at $dl_dir/mls_english. Starting download..."
|
||||||
|
if ! python ./local/utils/download_mls_english.py --dl-dir "$dl_dir"; then
|
||||||
|
log "Failed to download MLS English dataset via download_mls_english.py"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
else
|
||||||
|
log "Dataset already exists at $dl_dir/mls_english. Skipping download."
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user