Fix bug: When resuming training from a checkpoint, model_avg was not assigned, resulting in a None error. (#1914)

This commit is contained in:
math345 2025-04-10 11:37:28 +08:00 committed by GitHub
parent 300a821f58
commit 64c5364085
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -593,6 +593,9 @@ def run(rank, world_size, args):
if params.continue_finetune:
assert params.start_epoch > 0, params.start_epoch
if rank == 0:
# model_avg is only used with rank 0
model_avg = copy.deepcopy(model).to(torch.float64)
checkpoints = load_checkpoint_if_available(
params=params, model=model, model_avg=model_avg
)