From 53111d0e4670fc51d22d42b184595ecf84940bea Mon Sep 17 00:00:00 2001
From: yfyeung
Date: Wed, 18 Jun 2025 07:33:15 +0000
Subject: [PATCH] fix for multigpu

---
 egs/speech_llm/ASR_LLM/zipformer_llm_zh/train.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/egs/speech_llm/ASR_LLM/zipformer_llm_zh/train.py b/egs/speech_llm/ASR_LLM/zipformer_llm_zh/train.py
index c668e7f64..7565dd98b 100755
--- a/egs/speech_llm/ASR_LLM/zipformer_llm_zh/train.py
+++ b/egs/speech_llm/ASR_LLM/zipformer_llm_zh/train.py
@@ -693,6 +693,9 @@ def train_one_epoch(
                 exclude_frozen_parameters=True,
             )
 
+            if world_size > 1:
+                torch.distributed.barrier()
+
             if rank == 0:
                 convert_zero_checkpoint_to_fp32_state_dict(
                     params.exp_dir,
@@ -710,6 +713,9 @@ def train_one_epoch(
                     f"rm -rf {params.exp_dir}/epoch-{params.cur_epoch}-checkpoint-{batch_idx}"
                 )
 
+            if world_size > 1:
+                torch.distributed.barrier()
+
         shave_rate = params.shave_rate
         while True:
             try:
@@ -991,6 +997,10 @@ def run(rank, world_size, args):
         client_state={},
         exclude_frozen_parameters=True,
     )
+
+    if world_size > 1:
+        torch.distributed.barrier()
+
     if rank == 0:
         convert_zero_checkpoint_to_fp32_state_dict(
             params.exp_dir,
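
For reference, the synchronization pattern this patch applies can be read in
isolation as follows. This is a minimal sketch, assuming torch.distributed has
already been initialized via init_process_group by the launcher; the helper
save_and_consolidate and its toy shard files are hypothetical stand-ins for
DeepSpeed's model.save_checkpoint and convert_zero_checkpoint_to_fp32_state_dict,
not code taken from train.py:

    import os

    import torch
    import torch.distributed as dist


    def save_and_consolidate(rank: int, world_size: int, exp_dir: str) -> None:
        # Illustrative only: each rank writes its own shard, mirroring the
        # per-rank files of a sharded (ZeRO-style) checkpoint.
        torch.save({"rank": rank}, os.path.join(exp_dir, f"shard-{rank}.pt"))

        # Barrier 1: rank 0 must not start consolidating until every rank has
        # finished writing its shard; otherwise it may read a partial checkpoint.
        if world_size > 1:
            dist.barrier()

        if rank == 0:
            shards = [
                torch.load(os.path.join(exp_dir, f"shard-{r}.pt"))
                for r in range(world_size)
            ]
            torch.save(shards, os.path.join(exp_dir, "consolidated.pt"))
            for r in range(world_size):
                os.remove(os.path.join(exp_dir, f"shard-{r}.pt"))

        # Barrier 2: the other ranks must not move on (e.g. into the next save)
        # while rank 0 is still reading and deleting the shard files.
        if world_size > 1:
            dist.barrier()

Without the first barrier, rank 0 can race ahead of slower ranks and convert a
checkpoint whose shards are still being written; without the second, the other
ranks can proceed while rank 0 is still consolidating and removing the previous
checkpoint's files.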