fix prepare.sh

This commit is contained in:
Your Name 2024-11-11 22:45:22 -08:00
parent 77560cd5e8
commit de469c0b65

View File

@@ -132,11 +132,12 @@ fi
if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
log "Stage 4: Extract speech tokens." log "Stage 4: Extract speech tokens."
mkdir -p $tokens_dir
for subset in small medium large; do for subset in small medium large; do
log "Extract speech tokens for subset: $subset" if [ ! -e $tokens_dir/libriheavy_${subset}.jsonl.gz ]; then
output_dir=$tokens_dir/libriheavy_${subset} echo $tokens_dir/libriheavy_${subset}.jsonl.gz
mkdir -p $tokens_dir log "Extract speech tokens for subset: $subset"
if [ ! -e $tokens_dir/.extract_completed ]; then output_dir=$tokens_dir/libriheavy_${subset}
torchrun --nproc_per_node=8 \ torchrun --nproc_per_node=8 \
--nnodes=1 \ --nnodes=1 \
--rdzv_id=2024 \ --rdzv_id=2024 \
@@ -148,8 +149,8 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
--output_dir $output_dir \ --output_dir $output_dir \
--batch_size 32 \ --batch_size 32 \
--model "speech_tokenizer_v1" --model "speech_tokenizer_v1"
cat $output_dir/part* | gzip > $output_dir/libriheavy_${subset}.jsonl.gz && rm -rf $output_dir cat $output_dir/part* | gzip > $output_dir/libriheavy_${subset}.jsonl.gz && rm -rf $output_dir
touch $output_dir/.extract_completed
fi fi
done done
fi fi