Fix OOM handling when using DDP.

Batch norm layers have to be disabled: with them enabled, the process hangs indefinitely under DDP when we try to recover from a CUDA OOM.
2025-12-10 22:45:27 +00:00 · 2021-08-15 18:49:12 +08:00 · 2021-08-15 18:49:12 +08:00 · 21292066ec
commit 21292066ec
parent 14e0886559
3 changed files with 59 additions and 76 deletions
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,6 +1,6 @@
 repos:
  - repo: https://github.com/psf/black
-    rev: 21.6b0
+    rev: 21.7b0
    hooks:
      - id: black
        args: [--line-length=80]
--- a/egs/librispeech/ASR/conformer_ctc_madam_no_warmup/conformer.py
+++ b/egs/librispeech/ASR/conformer_ctc_madam_no_warmup/conformer.py
@@ -869,7 +869,10 @@ class ConvolutionModule(nn.Module):
            groups=channels,
            bias=bias,
        )
-        self.norm = nn.BatchNorm1d(channels)
+        # NOTE(fangjun): The process hangs when using DDP
+        # if we try to recover from CUDA OOM, so we disable
+        # batchnorm layer here.
+        #  self.norm = nn.BatchNorm1d(channels)
        self.pointwise_conv2 = nn.Conv1d(
            channels,
            channels,
@@ -899,7 +902,8 @@ class ConvolutionModule(nn.Module):

        # 1D Depthwise Conv
        x = self.depthwise_conv(x)
-        x = self.activation(self.norm(x))
+        #  x = self.activation(self.norm(x))
+        x = self.activation(x)

        x = self.pointwise_conv2(x)  # (batch, channel, time)

--- a/egs/librispeech/ASR/conformer_ctc_madam_no_warmup/train.py
+++ b/egs/librispeech/ASR/conformer_ctc_madam_no_warmup/train.py
@@ -153,7 +153,7 @@ def get_params() -> AttributeDict:
            "num_decoder_layers": 6,
            "is_espnet_structure": True,
            "mmi_loss": False,
-            "use_feat_batchnorm": True,
+            "use_feat_batchnorm": False,
            "lr_factor": 2.0,
            "warm_step": 30000,
        }
@@ -282,13 +282,10 @@ def compute_loss_impl(
    assert feature.ndim == 3
    feature = feature.to(device)

-    try:
-
    supervisions = batch["supervisions"]
+
    with torch.set_grad_enabled(is_training):
-            nnet_output, encoder_memory, memory_mask = model(
-                feature, supervisions
-            )
+        nnet_output, encoder_memory, memory_mask = model(feature, supervisions)
        # nnet_output is [N, T, C]

    # NOTE: We need `encode_supervisions` to sort sequences with
@@ -334,23 +331,10 @@ def compute_loss_impl(
                    sos_id=graph_compiler.sos_id,
                    eos_id=graph_compiler.eos_id,
                )
-            loss = (
-                1.0 - params.att_rate
-            ) * ctc_loss + params.att_rate * att_loss
+        loss = (1.0 - params.att_rate) * ctc_loss + params.att_rate * att_loss
    else:
        loss = ctc_loss
        att_loss = torch.tensor([0])
-    except RuntimeError as ex:
-        try:
-            del nnet_output
-            del encoder_memory
-            del dense_fsa_vec
-            del ctc_loss
-            del att_loss
-            del loss
-        except NameError as ne:
-            pass
-        raise ex

    # train_frames and valid_frames are used for printing.
    if is_training:
@@ -394,11 +378,6 @@ def compute_loss(
        s += f" max duration: {max_cut_duration:.3f} s \n"
        logging.info(s)

-    # see https://github.com/pytorch/fairseq/blob/50a671f78d0c8de0392f924180db72ac9b41b801/fairseq/trainer.py#L283
-    for p in model.parameters():
-        if p.grad is not None:
-            del p.grad  # free some memory
-
    torch.cuda.empty_cache()

    gc.collect()