diff --git a/egs/librispeech/ASR/conformer_ctc/run-multi-node-multi-gpu.sh b/egs/librispeech/ASR/conformer_ctc/run-multi-node-multi-gpu.sh
new file mode 100755
index 000000000..ab0a7e79f
--- /dev/null
+++ b/egs/librispeech/ASR/conformer_ctc/run-multi-node-multi-gpu.sh
@@ -0,0 +1,113 @@
+#!/usr/bin/env bash
+#
+# This script is the entry point to start model training
+# with multi-node multi-GPU.
+#
+# Read the usage instructions for how to run this script.
+
+set -e
+
+cur_dir=$(cd "$(dirname "$BASH_SOURCE")" && pwd)
+
+# DDP related parameters
+master_addr=
+node_rank=
+num_nodes=
+master_port=1234
+
+# Training script parameters
+# You can add more if you like
+#
+# Use ./conformer_ctc/train.py --help to see more
+#
+max_duration=200
+bucketing_sampler=1
+full_libri=1
+start_epoch=0
+num_epochs=2
+exp_dir=conformer_ctc/exp3
+lang_dir=data/lang_bpe_500
+
+. $cur_dir/../shared/parse_options.sh
+
+function usage() {
+  echo "Usage: "
+  echo ""
+  echo "  $0 \\"
+  echo "    --master-addr <IP> \\"
+  echo "    --master-port <PORT> \\"
+  echo "    --node-rank <RANK> \\"
+  echo "    --num-nodes <NUM-NODES>"
+  echo ""
+  echo "  --master-addr  The IP address of the master node."
+  echo "  --master-port  The port of the master node."
+  echo "  --node-rank    Rank of this node."
+  echo "  --num-nodes    Number of nodes in DDP training."
+  echo ""
+  echo "Usage example:"
+  echo "Suppose you want to use DDP with two machines:"
+  echo "  (1) Machine 1 has 4 GPUs. You want to use"
+  echo "      GPU 0, 1, and 3 for training."
+  echo "      IP of machine 1 is: 10.177.41.71"
+  echo "  (2) Machine 2 has 4 GPUs. You want to use"
+  echo "      GPU 0, 2, and 3 for training."
+  echo "      IP of machine 2 is: 10.177.41.72"
+  echo "You want to select machine 1 as the master node and"
+  echo "assume that the port 1234 is free on machine 1."
+  echo ""
+  echo "On machine 1, you run:"
+  echo ""
+  echo "  export CUDA_VISIBLE_DEVICES=\"0,1,3\""
+  echo "  ./conformer_ctc/run-multi-node-multi-gpu.sh --master-addr 10.177.41.71 --master-port 1234 --node-rank 0 --num-nodes 2"
+  echo ""
+  echo "On machine 2, you run:"
+  echo ""
+  echo "  export CUDA_VISIBLE_DEVICES=\"0,2,3\""
+  echo "  ./conformer_ctc/run-multi-node-multi-gpu.sh --master-addr 10.177.41.71 --master-port 1234 --node-rank 1 --num-nodes 2"
+  exit 1
+}
+
+default='\033[0m'
+bold='\033[1m'
+red='\033[31m'
+
+function error() {
+  printf "${bold}${red}[ERROR]${default} $1\n"
+}
+
+[ ! -z "$CUDA_VISIBLE_DEVICES" ] || ( echo; error "Please set CUDA_VISIBLE_DEVICES"; echo; usage )
+[ ! -z "$master_addr" ] || ( echo; error "Please set --master-addr"; echo; usage )
+[ ! -z "$master_port" ] || ( echo; error "Please set --master-port"; echo; usage )
+[ ! -z "$node_rank" ] || ( echo; error "Please set --node-rank"; echo; usage )
+[ ! -z "$num_nodes" ] || ( echo; error "Please set --num-nodes"; echo; usage )
+
+# Number of GPUs this node has
+num_gpus=$(python3 -c "s=\"$CUDA_VISIBLE_DEVICES\"; print(len(s.split(',')))")
+
+echo "CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES"
+echo "num_gpus: $num_gpus"
+echo "master_addr: $master_addr"
+
+export MASTER_ADDR=$master_addr
+export MASTER_PORT=$master_port
+
+set -x
+
+python -m torch.distributed.launch \
+  --use_env \
+  --nproc_per_node $num_gpus \
+  --nnodes $num_nodes \
+  --node_rank $node_rank \
+  --master_addr $master_addr \
+  --master_port $master_port \
+  \
+  $cur_dir/train.py \
+  --use-multi-node true \
+  --master-port $master_port \
+  --max-duration $max_duration \
+  --bucketing-sampler $bucketing_sampler \
+  --full-libri $full_libri \
+  --start-epoch $start_epoch \
+  --num-epochs $num_epochs \
+  --exp-dir $exp_dir \
+  --lang-dir $lang_dir
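
For reference, the wrapper above only prepares the environment: it exports MASTER_ADDR/MASTER_PORT, and torch.distributed.launch with --use_env sets RANK, WORLD_SIZE and LOCAL_RANK for each spawned process. The snippet below is a minimal sketch (not the actual conformer_ctc/train.py, whose --use-multi-node handling is not shown here) of how a launched script typically joins the process group from that environment:

import os

import torch
import torch.distributed as dist


def setup_ddp() -> int:
    # With the default init_method="env://", init_process_group reads
    # MASTER_ADDR, MASTER_PORT, RANK and WORLD_SIZE from the environment.
    dist.init_process_group(backend="nccl")

    # LOCAL_RANK is exported by torch.distributed.launch when --use_env is
    # given; it selects the GPU this process should use on its node.
    local_rank = int(os.environ["LOCAL_RANK"])
    torch.cuda.set_device(local_rank)
    return local_rank


if __name__ == "__main__":
    local_rank = setup_ddp()
    print(f"rank {dist.get_rank()}/{dist.get_world_size()} is using GPU {local_rank}")
    dist.destroy_process_group()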