Changes to more accurately estimate OOM conditions
This commit is contained in:
parent
1e8984174b
commit
9da5526659
@ -450,7 +450,13 @@ class ConformerEncoder(nn.Module):
|
||||
initial_layerdrop_prob = 0.5
|
||||
final_layerdrop_prob = 0.05
|
||||
|
||||
- if warmup_count < layer_warmup_begin:
|
||||
+ if warmup_count < 100.0:
|
||||
+ # As a special case, if warmup_count < 100.0 return 0 (drop no
|
||||
+ # layers). This is rather ugly, I'm afraid; it is intended to
|
||||
+ # enable our scan_pessimistic_batches_for_oom() code to work correctly
|
||||
+ # so if we are going to get OOM it will happen early.
|
||||
+ return 0.0
|
||||
+ elif warmup_count < layer_warmup_begin:
|
||||
return initial_layerdrop_prob
|
||||
elif warmup_count > layer_warmup_end:
|
||||
return final_layerdrop_prob
|
||||
|
||||
@ -1172,6 +1172,7 @@ def scan_pessimistic_batches_for_oom(
|
||||
)
|
||||
display_and_save_batch(batch, params=params, sp=sp)
|
||||
raise
|
||||
+ logging.info(f"Maximum memory allocated so far is {torch.cuda.max_memory_allocated()//1000000}MB")
|
||||
|
||||
|
||||
def main():
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user