Fix bug: when resuming training from a checkpoint, `model_avg` was never assigned, so it was still `None` when the checkpoint was loaded, resulting in an error.

This commit is contained in:
yinfeng 2025-04-09 11:21:00 +08:00
parent 86bd16d496
commit 389dc1c310

View File

@@ -593,6 +593,9 @@ def run(rank, world_size, args):
if params.continue_finetune: if params.continue_finetune:
assert params.start_epoch > 0, params.start_epoch assert params.start_epoch > 0, params.start_epoch
if rank == 0:
# model_avg is only used with rank 0
model_avg = copy.deepcopy(model).to(torch.float64)
checkpoints = load_checkpoint_if_available( checkpoints = load_checkpoint_if_available(
params=params, model=model, model_avg=model_avg params=params, model=model, model_avg=model_avg
) )