mirror of https://github.com/k2-fsa/icefall.git

Add missing files.

This commit is contained in:
parent c688161f44
commit de42201893

egs/librispeech/ASR/conformer_ctc/run-multi-node-multi-gpu.sh (executable file, 113 lines added)

@@ -0,0 +1,113 @@
#!/usr/bin/env bash
#
# This script is the entry point to start model training
# with multi-node multi-GPU.
#
# Read the usage instructions for how to run this script.

set -e

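# Directory containing this script, so that train.py and
# ../shared/parse_options.sh can be found no matter where
# the script is invoked from.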
cur_dir=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)

# DDP related parameters
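# master_addr/master_port identify the rendezvous endpoint that every node
# connects to for DDP initialization; node_rank is 0 on the master node
# and 1, 2, ... on the other nodes.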
master_addr=
node_rank=
num_nodes=
master_port=1234

# Training script parameters
# You can add more if you like
#
# Use ./conformer_ctc/train.py --help to see more
#
max_duration=200
bucketing_sampler=1
full_libri=1
start_epoch=0
num_epochs=2
exp_dir=conformer_ctc/exp3
lang_dir=data/lang_bpe_500

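# parse_options.sh is the Kaldi-style option parser shared by the icefall
# recipes: a flag such as "--master-addr 10.177.41.71" on the command line
# sets the variable master_addr above (dashes become underscores).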
. $cur_dir/../shared/parse_options.sh

function usage() {
  echo "Usage: "
  echo ""
  echo " $0 \\"
  echo "   --master-addr <IP of master> \\"
  echo "   --master-port <Port of master> \\"
  echo "   --node-rank <rank of this node> \\"
  echo "   --num-nodes <Number of nodes>"
  echo ""
  echo " --master-addr   The IP address of the master node."
  echo " --master-port   The port of the master node."
  echo " --node-rank     Rank of this node."
  echo " --num-nodes     Number of nodes in DDP training."
  echo ""
  echo "Usage example:"
  echo "Suppose you want to use DDP with two machines:"
  echo " (1) Machine 1 has 4 GPUs. You want to use"
  echo "     GPU 0, 1, and 3 for training."
  echo "     IP of machine 1 is: 10.177.41.71"
  echo " (2) Machine 2 has 4 GPUs. You want to use"
  echo "     GPU 0, 2, and 3 for training."
  echo "     IP of machine 2 is: 10.177.41.72"
  echo "You want to select machine 1 as the master node and"
  echo "assume that the port 1234 is free on machine 1."
  echo ""
  echo "On machine 1, you run:"
  echo ""
  echo "  export CUDA_VISIBLE_DEVICES=\"0,1,3\""
  echo "  ./conformer_ctc/run-multi-node-multi-gpu.sh --master-addr 10.177.41.71 --master-port 1234 --node-rank 0 --num-nodes 2"
  echo ""
  echo "On machine 2, you run:"
  echo ""
  echo "  export CUDA_VISIBLE_DEVICES=\"0,2,3\""
  echo "  ./conformer_ctc/run-multi-node-multi-gpu.sh --master-addr 10.177.41.71 --master-port 1234 --node-rank 1 --num-nodes 2"
  exit 1
}

default='\033[0m'
bold='\033[1m'
red='\033[31m'

function error() {
  printf "${bold}${red}[ERROR]${default} $1\n"
}

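# Sanity checks. usage() runs in a subshell here, so its "exit 1" only leaves
# the subshell; the script still stops because the failing command trips
# "set -e" above.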
[ ! -z "$CUDA_VISIBLE_DEVICES" ] || ( echo; error "Please set CUDA_VISIBLE_DEVICES"; echo; usage )
[ ! -z "$master_addr" ] || ( echo; error "Please set --master-addr"; echo; usage )
[ ! -z "$master_port" ] || ( echo; error "Please set --master-port"; echo; usage )
[ ! -z "$node_rank" ] || ( echo; error "Please set --node-rank"; echo; usage )
[ ! -z "$num_nodes" ] || ( echo; error "Please set --num-nodes"; echo; usage )

# Number of GPUs this node will use, i.e., the number of entries
# in CUDA_VISIBLE_DEVICES
num_gpus=$(python3 -c "s=\"$CUDA_VISIBLE_DEVICES\"; print(len(s.split(',')))")

echo "CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES"
|
||||
echo "num_gpus: $num_gpus"
|
||||
echo "master_addr: $master_addr"
|
||||
|
||||
export MASTER_ADDR=$master_addr
|
||||
export MASTER_PORT=$master_port
|
||||
|
||||
set -x
|
||||
|
||||
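# torch.distributed.launch spawns one training process per GPU on this node
# and exports RANK, LOCAL_RANK, WORLD_SIZE, MASTER_ADDR, and MASTER_PORT to
# each of them. With --use_env it does not append a --local_rank argument,
# so train.py is expected to read these environment variables instead.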
python -m torch.distributed.launch \
  --use_env \
  --nproc_per_node $num_gpus \
  --nnodes $num_nodes \
  --node_rank $node_rank \
  --master_addr $master_addr \
  --master_port $master_port \
  \
  $cur_dir/train.py \
  --use-multi-node true \
  --master-port $master_port \
  --max-duration $max_duration \
  --bucketing-sampler $bucketing_sampler \
  --full-libri $full_libri \
  --start-epoch $start_epoch \
  --num-epochs $num_epochs \
  --exp-dir $exp_dir \
  --lang-dir $lang_dir