mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-09 01:52:41 +00:00
Restore the mls_english recipe's compute_fbank_mls_english.py and prepare.sh to their versions from commit 547f5c5
This commit is contained in:
parent
e34f2dbb2a
commit
daff070d68
@ -120,6 +120,11 @@ def main():
|
|||||||
cut_sets = make_cutset_blueprints(mls_eng_hf_dataset_path)
|
cut_sets = make_cutset_blueprints(mls_eng_hf_dataset_path)
|
||||||
for part, cut_set in cut_sets:
|
for part, cut_set in cut_sets:
|
||||||
logging.info(f"Processing {part}")
|
logging.info(f"Processing {part}")
|
||||||
|
cut_set = cut_set.save_audios(
|
||||||
|
num_jobs=num_jobs,
|
||||||
|
storage_path=(args.audio_dir / part).as_posix(),
|
||||||
|
) # makes new cutset that loads audio from paths to actual audio files
|
||||||
|
|
||||||
cut_set = cut_set.compute_and_store_features(
|
cut_set = cut_set.compute_and_store_features(
|
||||||
extractor=extractor,
|
extractor=extractor,
|
||||||
num_jobs=num_jobs,
|
num_jobs=num_jobs,
|
||||||
@ -127,7 +132,6 @@ def main():
|
|||||||
storage_type=LilcomChunkyWriter,
|
storage_type=LilcomChunkyWriter,
|
||||||
)
|
)
|
||||||
|
|
||||||
cut_set = cut_set.save_audios(args.audio_dir / part) # makes new cutset that uses paths to actual audio files
|
|
||||||
cut_set.to_file(args.manifest_dir / f"mls_eng_cuts_{part}.jsonl.gz")
|
cut_set.to_file(args.manifest_dir / f"mls_eng_cuts_{part}.jsonl.gz")
|
||||||
|
|
||||||
logging.info("All fbank computed for MLS English.")
|
logging.info("All fbank computed for MLS English.")
|
||||||
|
@ -60,9 +60,9 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
|
|||||||
--audio-dir data/audio \
|
--audio-dir data/audio \
|
||||||
--dl-dir $dl_dir/mls_english
|
--dl-dir $dl_dir/mls_english
|
||||||
# --dl-dir /root/datasets/parler-tts--mls_eng
|
# --dl-dir /root/datasets/parler-tts--mls_eng
|
||||||
python local/validate_manifest.py --manifest data/manifests/mls_english_cuts_train.jsonl.gz
|
python local/validate_manifest.py --manifest data/manifests/mls_eng_cuts_train.jsonl.gz
|
||||||
python local/validate_manifest.py --manifest data/manifests/mls_english_cuts_dev.jsonl.gz
|
python local/validate_manifest.py --manifest data/manifests/mls_eng_cuts_dev.jsonl.gz
|
||||||
python local/validate_manifest.py --manifest data/manifests/mls_english_cuts_test.jsonl.gz
|
python local/validate_manifest.py --manifest data/manifests/mls_eng_cuts_test.jsonl.gz
|
||||||
touch data/manifests/.mls_english-validated.done
|
touch data/manifests/.mls_english-validated.done
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
@ -71,7 +71,10 @@ if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
|
|||||||
log "Stage 2: Prepare transcript for BPE training"
|
log "Stage 2: Prepare transcript for BPE training"
|
||||||
if [ ! -f data/lang/transcript.txt ]; then
|
if [ ! -f data/lang/transcript.txt ]; then
|
||||||
log "Generating transcripts for BPE training"
|
log "Generating transcripts for BPE training"
|
||||||
./local/utils/generate_transcript.py --lang-dir data/lang
|
python local/utils/generate_transcript.py \
|
||||||
|
--dataset-path $dl_dir/mls_english \
|
||||||
|
--lang-dir data/lang \
|
||||||
|
--split train
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
@ -83,7 +86,7 @@ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
|
|||||||
mkdir -p $bpe_dir
|
mkdir -p $bpe_dir
|
||||||
|
|
||||||
if [ ! -f $bpe_dir/bpe.model ]; then
|
if [ ! -f $bpe_dir/bpe.model ]; then
|
||||||
./local/train_bpe_model.py \
|
python local/train_bpe_model.py \
|
||||||
--lang-dir $bpe_dir \
|
--lang-dir $bpe_dir \
|
||||||
--vocab-size $vocab_size \
|
--vocab-size $vocab_size \
|
||||||
--transcript data/lang/transcript.txt
|
--transcript data/lang/transcript.txt
|
||||||
|
Loading…
x
Reference in New Issue
Block a user