Mirror of https://github.com/k2-fsa/icefall.git (synced 2025-08-09 10:02:22 +00:00)
instead of on-the-fly features, precompute fbank and manifests in prepare.sh

commit 4f743993ef (parent 4e2a4fdcd8)
@@ -94,6 +94,7 @@ def get_args():
     )
     parser.add_argument("-m", "--manifest-dir", type=Path)
     parser.add_argument("-a", "--audio-dir", type=Path)
+    parser.add_argument("-d", "--dl-dir", type=Path)
     return parser.parse_args()
 
 
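For context, a minimal sketch of what get_args() would presumably look like after this hunk; the parser construction and description string are assumptions, and only the three Path arguments come from the diff itself.

import argparse
from pathlib import Path


def get_args():
    # Parser setup outside the hunk is assumed; the arguments below are from the diff.
    parser = argparse.ArgumentParser(description="Compute fbank for MLS English")
    parser.add_argument("-m", "--manifest-dir", type=Path)
    parser.add_argument("-a", "--audio-dir", type=Path)
    parser.add_argument("-d", "--dl-dir", type=Path)  # new: local copy of the downloaded MLS English dataset
    return parser.parse_args()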
@@ -114,7 +115,7 @@ def main():
         )
         return
     else:
-        mls_eng_hf_dataset_path = "/root/datasets/parler-tts--mls_eng"
+        mls_eng_hf_dataset_path = args.dl_dir  # "/root/datasets/parler-tts--mls_eng"
         cut_sets = make_cutset_blueprints(mls_eng_hf_dataset_path)
         for part, cut_set in cut_sets:
             logging.info(f"Processing {part}")
@@ -125,8 +126,8 @@ def main():
                 storage_type=LilcomChunkyWriter,
             )
 
-            # cut_set.save_audios(args.audio_dir)
-            # cut_set.to_file(args.manifest_dir / f"mls_eng_cuts_{part}.jsonl.gz")
+            cut_set = cut_set.save_audios(args.audio_dir / part)  # makes new cutset that uses paths to actual audio files
+            cut_set.to_file(args.manifest_dir / f"mls_eng_cuts_{part}.jsonl.gz")
 
         logging.info("All fbank computed for MLS English.")
         (args.manifest_dir / ".mls-eng-fbank.done").touch()
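Taken together, the hunks above turn the script into a precompute pass over each dataset split. A minimal sketch of that flow with lhotse (not the recipe code itself; the 80-bin fbank config and the feats_{part} storage path are assumptions for illustration):

from pathlib import Path

from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter


def precompute_part(cut_set: CutSet, part: str, manifest_dir: Path, audio_dir: Path) -> None:
    # Extract fbank features once and store them compressed on disk.
    cut_set = cut_set.compute_and_store_features(
        extractor=Fbank(FbankConfig(num_mel_bins=80)),  # assumed config
        storage_path=(manifest_dir / f"feats_{part}").as_posix(),  # assumed path
        storage_type=LilcomChunkyWriter,
    )
    # save_audios() writes the audio and returns a *new* CutSet whose recordings
    # point at those files, which is why the diff reassigns cut_set.
    cut_set = cut_set.save_audios(audio_dir / part)
    # Persist the manifest so later stages read precomputed features and audio paths.
    cut_set.to_file(manifest_dir / f"mls_eng_cuts_{part}.jsonl.gz")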
@@ -41,19 +41,42 @@ if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
   fi
 fi
 
+if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
+  log "Stage 1: Prepare MLS English manifests and compute fbank"
+  # We assume that you have downloaded the MLS English corpus
+  # to $dl_dir/mls_english
+  mkdir -p data/manifests
+  if [ ! -e data/mls_english.done ]; then
+    # lhotse prepare mls_english -j $nj $dl_dir/mls_english data/manifests
+    python local/compute_fbank_mls_english.py --manifest-dir data/manifests --audio-dir data/audio --dl-dir $dl_dir/mls_english
+    touch data/manifests/.mls_english.done
+  fi
+fi
+
+if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
+  log "Stage 2: Validate MLS English manifests"
+  if [ ! -e data/manifests/.mls_english-validated.done ]; then
+    python local/validate_manifest.py --manifest data/manifests/mls_english_cuts_train.jsonl.gz
+    python local/validate_manifest.py --manifest data/manifests/mls_english_cuts_dev.jsonl.gz
+    python local/validate_manifest.py --manifest data/manifests/mls_english_cuts_test.jsonl.gz
+    touch data/manifests/.mls_english-validated.done
+  fi
+fi
+
 mkdir -p data/lang
 lang_dir=data/lang
 
-if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
-  log "Stage 1: Prepare transcript for BPE training"
+if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
+  log "Stage 3: Prepare transcript for BPE training"
   if [ ! -f $lang_dir/transcript.txt ]; then
     log "Generating transcripts for BPE training"
     ./local/utils/generate_transcript.py --lang-dir $lang_dir
   fi
 fi
 
-if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
-  log "Stage 2: Prepare BPE tokenizer"
+if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
+  log "Stage 4: Prepare BPE tokenizer"
 
   for vocab_size in ${vocab_sizes[@]}; do
     log "Training BPE model with vocab_size=${vocab_size}"
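Downstream, the point of precomputing is that training can read these manifests directly instead of extracting features on the fly. A hedged usage sketch (the training data module is not part of this diff; the class names are standard lhotse APIs and the path follows the filename written by the script above):

from lhotse import CutSet
from lhotse.dataset import K2SpeechRecognitionDataset
from lhotse.dataset.input_strategies import PrecomputedFeatures

# Load the cuts written by compute_fbank_mls_english.py.
cuts = CutSet.from_file("data/manifests/mls_eng_cuts_train.jsonl.gz")

# With fbank already stored on disk, the dataset can use PrecomputedFeatures
# rather than OnTheFlyFeatures.
dataset = K2SpeechRecognitionDataset(input_strategy=PrecomputedFeatures())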