Fix OOM handling when using DDP.

We have to disable batch norm layers. Otherwise, the process will hang indefinitely.
2025-12-10 22:45:27 +00:00 · 2021-08-15 18:49:12 +08:00 · 2021-08-15 18:49:12 +08:00 · 21292066ec
commit 21292066ec
parent 14e0886559
3 changed files with 59 additions and 76 deletions
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -1,6 +1,6 @@
 repos:
  - repo: https://github.com/psf/black
-    rev: 21.6b0
+    rev: 21.7b0
    hooks:
      - id: black
        args: [--line-length=80]
--- a/egs/librispeech/ASR/conformer_ctc_madam_no_warmup/conformer.py
+++ b/egs/librispeech/ASR/conformer_ctc_madam_no_warmup/conformer.py
@ -869,7 +869,10 @@ class ConvolutionModule(nn.Module):
            groups=channels,
            bias=bias,
        )
-        self.norm = nn.BatchNorm1d(channels)
+        # NOTE(fangjun): The process hangs when using DDP
        # if we try to recover from CUDA OOM, so we disable
        # batchnorm layer here.
        #  self.norm = nn.BatchNorm1d(channels)
        self.pointwise_conv2 = nn.Conv1d(
            channels,
            channels,
@ -899,7 +902,8 @@ class ConvolutionModule(nn.Module):
        # 1D Depthwise Conv
        x = self.depthwise_conv(x)
-        x = self.activation(self.norm(x))
+        #  x = self.activation(self.norm(x))
        x = self.activation(x)
        x = self.pointwise_conv2(x)  # (batch, channel, time)
--- a/egs/librispeech/ASR/conformer_ctc_madam_no_warmup/train.py
+++ b/egs/librispeech/ASR/conformer_ctc_madam_no_warmup/train.py
@ -153,7 +153,7 @@ def get_params() -> AttributeDict:
            "num_decoder_layers": 6,
            "is_espnet_structure": True,
            "mmi_loss": False,
-            "use_feat_batchnorm": True,
+            "use_feat_batchnorm": False,
            "lr_factor": 2.0,
            "warm_step": 30000,
        }
@ -282,75 +282,59 @@ def compute_loss_impl(
    assert feature.ndim == 3
    feature = feature.to(device)
-    try:
+    supervisions = batch["supervisions"]
-        supervisions = batch["supervisions"]
+    with torch.set_grad_enabled(is_training):
        nnet_output, encoder_memory, memory_mask = model(feature, supervisions)
        # nnet_output is [N, T, C]
    # NOTE: We need `encode_supervisions` to sort sequences with
    # different duration in decreasing order, required by
    # `k2.intersect_dense` called in `k2.ctc_loss`
    supervision_segments, texts = encode_supervisions(
        supervisions, subsampling_factor=params.subsampling_factor
    )
    token_ids = graph_compiler.texts_to_ids(texts)
    decoding_graph = graph_compiler.compile(token_ids)
    dense_fsa_vec = k2.DenseFsaVec(
        nnet_output,
        supervision_segments,
        allow_truncate=params.subsampling_factor - 1,
    )
    ctc_loss = k2.ctc_loss(
        decoding_graph=decoding_graph,
        dense_fsa_vec=dense_fsa_vec,
        output_beam=params.beam_size,
        reduction=params.reduction,
        use_double_scores=params.use_double_scores,
    )
    if params.att_rate != 0.0:
        with torch.set_grad_enabled(is_training):
-            nnet_output, encoder_memory, memory_mask = model(
+            if hasattr(model, "module"):
-                feature, supervisions
+                att_loss = model.module.decoder_forward(
-            )
+                    encoder_memory,
-            # nnet_output is [N, T, C]
+                    memory_mask,
-
+                    token_ids=token_ids,
-        # NOTE: We need `encode_supervisions` to sort sequences with
+                    sos_id=graph_compiler.sos_id,
-        # different duration in decreasing order, required by
+                    eos_id=graph_compiler.eos_id,
-        # `k2.intersect_dense` called in `k2.ctc_loss`
+                )
-        supervision_segments, texts = encode_supervisions(
+            else:
-            supervisions, subsampling_factor=params.subsampling_factor
+                att_loss = model.decoder_forward(
-        )
+                    encoder_memory,
-
+                    memory_mask,
-        token_ids = graph_compiler.texts_to_ids(texts)
+                    token_ids=token_ids,
-
+                    sos_id=graph_compiler.sos_id,
-        decoding_graph = graph_compiler.compile(token_ids)
+                    eos_id=graph_compiler.eos_id,
-
+                )
-        dense_fsa_vec = k2.DenseFsaVec(
+        loss = (1.0 - params.att_rate) * ctc_loss + params.att_rate * att_loss
-            nnet_output,
+    else:
-            supervision_segments,
+        loss = ctc_loss
-            allow_truncate=params.subsampling_factor - 1,
+        att_loss = torch.tensor([0])
        )
        ctc_loss = k2.ctc_loss(
            decoding_graph=decoding_graph,
            dense_fsa_vec=dense_fsa_vec,
            output_beam=params.beam_size,
            reduction=params.reduction,
            use_double_scores=params.use_double_scores,
        )
        if params.att_rate != 0.0:
            with torch.set_grad_enabled(is_training):
                if hasattr(model, "module"):
                    att_loss = model.module.decoder_forward(
                        encoder_memory,
                        memory_mask,
                        token_ids=token_ids,
                        sos_id=graph_compiler.sos_id,
                        eos_id=graph_compiler.eos_id,
                    )
                else:
                    att_loss = model.decoder_forward(
                        encoder_memory,
                        memory_mask,
                        token_ids=token_ids,
                        sos_id=graph_compiler.sos_id,
                        eos_id=graph_compiler.eos_id,
                    )
            loss = (
                1.0 - params.att_rate
            ) * ctc_loss + params.att_rate * att_loss
        else:
            loss = ctc_loss
            att_loss = torch.tensor([0])
    except RuntimeError as ex:
        try:
            del nnet_output
            del encoder_memory
            del dense_fsa_vec
            del ctc_loss
            del att_loss
            del loss
        except NameError as ne:
            pass
        raise ex
    # train_frames and valid_frames are used for printing.
    if is_training:
@ -394,11 +378,6 @@ def compute_loss(
        s += f" max duration: {max_cut_duration:.3f} s \n"
        logging.info(s)
    # see https://github.com/pytorch/fairseq/blob/50a671f78d0c8de0392f924180db72ac9b41b801/fairseq/trainer.py#L283
    for p in model.parameters():
        if p.grad is not None:
            del p.grad  # free some memory
    torch.cuda.empty_cache()
    gc.collect()