diff --git a/egs/mls_english/ASR/local/display_manifest_statistics.py b/egs/mls_english/ASR/local/display_manifest_statistics.py
new file mode 100644
index 000000000..b128a08e0
--- /dev/null
+++ b/egs/mls_english/ASR/local/display_manifest_statistics.py
@@ -0,0 +1,58 @@
+#!/usr/bin/env python3
+# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang)
+# 2022 The University of Electro-Communications (author: Teo Wen Shen)  # noqa
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+from pathlib import Path
+
+from lhotse import CutSet, load_manifest
+
+ARGPARSE_DESCRIPTION = """
+This file displays duration statistics of utterances in a manifest.
+You can use the displayed values to choose the minimum/maximum duration
+to remove short and long utterances during training.
+
+See the function `remove_short_and_long_utt()` in
+pruned_transducer_stateless5/train.py for usage.
+"""
+
+
+def get_parser():
+    parser = argparse.ArgumentParser(
+        description=ARGPARSE_DESCRIPTION,
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+
+    parser.add_argument("--manifest-dir", type=Path, help="Path to cutset manifests")
+
+    return parser.parse_args()
+
+
+def main():
+    args = get_parser()
+
+    for part in ["dev", "test", "train"]:
+        path = args.manifest_dir / f"mls_eng_cuts_{part}.jsonl.gz"
+        cuts: CutSet = load_manifest(path)
+
+        print("\n---------------------------------\n")
+        print(path.name + ":")
+        cuts.describe()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/egs/mls_english/ASR/prepare.sh b/egs/mls_english/ASR/prepare.sh
index 9cfd314b2..fe16e367a 100755
--- a/egs/mls_english/ASR/prepare.sh
+++ b/egs/mls_english/ASR/prepare.sh
@@ -20,40 +20,40 @@ dl_dir=$PWD/download
 
 # All files generated by this script are saved in "data".
 mkdir -p data
-mkdir -p data/audio # Add this line
+mkdir -p data/audio
 mkdir -p data/manifests
 mkdir -p data/lang
 
 log() {
-  local fname=${BASH_SOURCE[1]##*/}
-  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+    local fname=${BASH_SOURCE[1]##*/}
+    echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
 }
 
 log "Starting MLS English data preparation"
 
 if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
-  log "Stage 0: Download MLS English dataset"
-  # Check if huggingface_hub is installed
-  if ! python -c "import huggingface_hub" &> /dev/null; then
-    log "huggingface_hub Python library not found. Installing it now..."
-    # Using --break-system-packages for Debian/Ubuntu environments where pip install might fail without it
-    python -m pip install huggingface_hub || \
-    python -m pip install huggingface_hub --break-system-packages || { \
-      log "Failed to install huggingface_hub. Please install it manually: pip install huggingface_hub"; \
-      exit 1; \
-    }
-    log "huggingface_hub installed successfully."
-  fi
+    log "Stage 0: Download MLS English dataset"
+    # Check if huggingface_hub is installed
+    if ! python -c "import huggingface_hub" &> /dev/null; then
+        log "huggingface_hub Python library not found. Installing it now..."
+        # Using --break-system-packages for Debian/Ubuntu environments where pip install might fail without it
+        python -m pip install huggingface_hub || \
+        python -m pip install huggingface_hub --break-system-packages || { \
+            log "Failed to install huggingface_hub. Please install it manually: pip install huggingface_hub"; \
+            exit 1; \
+        }
+        log "huggingface_hub installed successfully."
+    fi
 
     # Check if the dataset already exists to avoid re-downloading
     if [ ! -d "$dl_dir/mls_english" ]; then
-    log "Dataset not found at $dl_dir/mls_english. Starting download..."
-    if ! python ./local/utils/download_mls_english.py --dl-dir "$dl_dir"; then
-      log "Failed to download MLS English dataset via download_mls_english.py"
-      exit 1
-    fi
+        log "Dataset not found at $dl_dir/mls_english. Starting download..."
+        if ! python ./local/utils/download_mls_english.py --dl-dir "$dl_dir"; then
+            log "Failed to download MLS English dataset via download_mls_english.py"
+            exit 1
+        fi
     else
-    log "Dataset already exists at $dl_dir/mls_english. Skipping download."
+        log "Dataset already exists at $dl_dir/mls_english. Skipping download."
     fi
 fi
 
@@ -69,45 +69,51 @@
 # fi
 
 if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
-  log "Stage 1: Compute MLS English fbank"
-  if [ ! -e data/manifests/.mls_english-validated.done ]; then
-    python local/compute_fbank_mls_english.py \
-      --manifest-dir data/manifests \
-      --audio-dir data/audio \
-      --dl-dir $dl_dir/mls_english
-      # --dl-dir /root/datasets/parler-tts--mls_eng
-    python local/validate_manifest.py --manifest data/manifests/mls_eng_cuts_train.jsonl.gz
-    python local/validate_manifest.py --manifest data/manifests/mls_eng_cuts_dev.jsonl.gz
-    python local/validate_manifest.py --manifest data/manifests/mls_eng_cuts_test.jsonl.gz
-    touch data/manifests/.mls_english-validated.done
-  fi
+    log "Stage 1: Compute MLS English fbank"
+    if [ ! -e data/manifests/.mls_english-validated.done ]; then
+        python local/compute_fbank_mls_english.py \
+            --manifest-dir data/manifests \
+            --audio-dir data/audio \
+            --dl-dir $dl_dir/mls_english
+        # --dl-dir /root/datasets/parler-tts--mls_eng
+        python local/validate_manifest.py --manifest data/manifests/mls_eng_cuts_train.jsonl.gz
+        python local/validate_manifest.py --manifest data/manifests/mls_eng_cuts_dev.jsonl.gz
+        python local/validate_manifest.py --manifest data/manifests/mls_eng_cuts_test.jsonl.gz
+        touch data/manifests/.mls_english-validated.done
+    fi
 fi
 
 if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
-  log "Stage 2: Prepare transcript for BPE training"
-  if [ ! -f data/lang/transcript.txt ]; then
-    log "Generating transcripts for BPE training"
-    python local/utils/generate_transcript.py \
-      --dataset-path $dl_dir/mls_english \
-      --lang-dir data/lang \
-      --split train
-  fi
+    log "Stage 2: Prepare transcript for BPE training"
+    if [ ! -f data/lang/transcript.txt ]; then
+        log "Generating transcripts for BPE training"
+        python local/utils/generate_transcript.py \
+            --dataset-path $dl_dir/mls_english \
+            --lang-dir data/lang \
+            --split train
+    fi
 fi
 
 if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
-  log "Stage 3: Prepare BPE tokenizer"
-  for vocab_size in ${vocab_sizes[@]}; do
-    log "Training BPE model with vocab_size=${vocab_size}"
-    bpe_dir=data/lang/bpe_${vocab_size}
-    mkdir -p $bpe_dir
+    log "Stage 3: Prepare BPE tokenizer"
+    for vocab_size in ${vocab_sizes[@]}; do
+        log "Training BPE model with vocab_size=${vocab_size}"
+        bpe_dir=data/lang/bpe_${vocab_size}
+        mkdir -p $bpe_dir
 
-    if [ ! -f $bpe_dir/bpe.model ]; then
-      python local/train_bpe_model.py \
-        --lang-dir $bpe_dir \
-        --vocab-size $vocab_size \
-        --transcript data/lang/transcript.txt
-    fi
-  done
+        if [ ! -f $bpe_dir/bpe.model ]; then
+            python local/train_bpe_model.py \
+                --lang-dir $bpe_dir \
+                --vocab-size $vocab_size \
+                --transcript data/lang/transcript.txt
+        fi
+    done
 fi
 
-log "MLS English data preparation completed successfully"
\ No newline at end of file
+if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
+    log "Stage 4: Show manifest statistics"
+    python local/display_manifest_statistics.py --manifest-dir data/manifests > data/manifests/manifest_statistics.txt
+    cat data/manifests/manifest_statistics.txt
+fi
+
+log "MLS English data preparation completed successfully"
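
Note (reviewer sketch, not part of the patch): the statistics that stage 4 writes to data/manifests/manifest_statistics.txt are meant to guide the choice of minimum/maximum cut durations for the filter referenced in the new script's docstring (`remove_short_and_long_utt()` in pruned_transducer_stateless5/train.py). Below is a minimal, self-contained sketch of such a filter; the script name, the duration bounds, and the use of load_manifest_lazy are illustrative assumptions, not code taken from this patch or from train.py.

#!/usr/bin/env python3
# sketch_duration_filter.py -- illustrative sketch only; not part of this patch.
# The bounds below are hypothetical placeholders: pick real values after
# reading data/manifests/manifest_statistics.txt produced by stage 4.
from lhotse import CutSet, load_manifest_lazy

MIN_DURATION = 1.0  # seconds; hypothetical lower bound
MAX_DURATION = 20.0  # seconds; hypothetical upper bound


def remove_short_and_long_utt(c) -> bool:
    # Keep a cut only if its duration lies within the chosen bounds.
    return MIN_DURATION <= c.duration <= MAX_DURATION


def main() -> None:
    # Load the training cuts lazily and drop utterances outside the bounds,
    # which is how the printed duration statistics are typically consumed
    # before the cuts are handed to a dataloader.
    cuts: CutSet = load_manifest_lazy("data/manifests/mls_eng_cuts_train.jsonl.gz")
    kept = cuts.filter(remove_short_and_long_utt)
    print(f"Kept {sum(1 for _ in kept)} cuts between {MIN_DURATION}s and {MAX_DURATION}s")


if __name__ == "__main__":
    main()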