Adding diagnostics code...

2025-12-10 06:25:27 +00:00 · 2022-02-27 13:44:43 +08:00 · 2022-02-27 13:44:43 +08:00 · 581786a6d3
commit 581786a6d3
parent 2af1b3af98
2 changed files with 313 additions and 11 deletions
--- a/egs/librispeech/ASR/transducer_stateless/diagnostics.py
+++ b/egs/librispeech/ASR/transducer_stateless/diagnostics.py
@ -0,0 +1,284 @@
+import torch
+from torch import Tensor
+from torch import nn
+import math
+import random
+from typing import Tuple, List
+
+
+class TensorDiagnosticOptions(object):
+    """
+    Options object for tensor diagnostics:
+
+     Args:
+        memory_limit: the maximum number of bytes per tensor (limits how many copies
+                of the tensor we cache).
+
+    """
+    def __init__(self, memory_limit: int,
+                 print_pos_ratio: bool = True):
+        self.memory_limit = memory_limit
+        self.print_pos_ratio = print_pos_ratio
+
+    def dim_is_summarized(self, size: int):
+        return size > 10 and size != 31
+
+    def stats_types(self):
+        if self.print_pos_ratio:
+            return ["mean-abs", "pos-ratio"]
+        else:
+            return ["mean-abs"]
+
+
+
+def get_sum_abs_stats(x: Tensor, dim: int,
+                      stats_type: str) -> Tuple[Tensor, int]:
+    """
+    Returns the sum-of-absolute-value of this Tensor, for each
+    index into the specified axis/dim of the tensor.
+    Args:
+       x: Tensor, tensor to be analyzed
+      dim: dimension with 0 <= dim < x.ndim
+      stats_type: either "mean-abs" in which case the stats represent the
+           mean absolute value, or "pos-ratio" in which case the
+           stats represent the proportion of positive values (actually:
+           the tensor is count of positive values, count is the count of
+           all values).
+    Returns (sum_abs, count)
+       where sum_abs is a Tensor of shape (x.shape[dim],), and the count
+       is an integer saying how many items were counted in each element
+       of sum_abs.
+    """
+    if stats_type == "mean-abs":
+        x = x.abs()
+    else:
+        assert stats_type == "pos-ratio"
+        x = (x > 0).to(dtype=torch.float)
+    orig_numel = x.numel()
+    sum_dims = [ d for d in range(x.ndim) if d != dim ]
+    x = torch.sum(x, dim=sum_dims)
+    count = orig_numel // x.numel()
+    x = x.flatten()
+    return x, count
+
+def get_diagnostics_for_dim(dim: int, tensors: List[Tensor],
+                            options: TensorDiagnosticOptions,
+                            sizes_same: bool,
+                            stats_type: str):
+    """
+    This function gets diagnostics for a dimension of a module.
+    Args:
+           dim: the dimension to analyze, with 0 <= dim < tensors[0].ndim
+       options: options object
+    sizes_same: true if all the tensor sizes are the same on this dimension
+    stats_type: either "mean-abs" or "pos-ratio", dictates the type of stats
+               we accumulate, mean-abs is mean absolute value, "pos-ratio"
+               is proportion of positive to nonnegative values.
+    Returns:
+     Diagnostic as a string, either percentiles or the actual values,
+     see the code.
+    """
+    # stats_and_counts is a list of pair (Tensor, int)
+    stats_and_counts = [ get_sum_abs_stats(x, dim, stats_type) for x in tensors ]
+    stats = [ x[0] for x in stats_and_counts ]
+    counts = [ x[1] for x in stats_and_counts ]
+    if sizes_same:
+        stats = torch.stack(stats).sum(dim=0)
+        count = sum(counts)
+        stats = stats / count
+    else:
+        stats = [ x[0] / x[1] for x in stats_and_counts ]
+        stats = torch.cat(stats, dim=0)
+    # if `summarize` we print percentiles of the stats; else,
+    # we print out individual elements.
+    summarize = (not sizes_same) or options.dim_is_summarized(stats.numel())
+    if summarize:
+        # print out percentiles.
+        stats = stats.sort()[0]
+        num_percentiles = 10
+        size = stats.numel()
+        percentiles = []
+        for i in range(num_percentiles + 1):
+            index = (i * (size - 1)) // num_percentiles
+            percentiles.append(stats[index].item())
+        percentiles = [ '%.2g' % x for x in percentiles ]
+        percentiles = ' '.join(percentiles)
+        return f'percentiles: [{percentiles}]'
+    else:
+        stats = stats.tolist()
+        stats = [ '%.2g' % x for x in stats ]
+        stats = '[' + ' '.join(stats) + ']'
+        return stats
+
+
+
+def print_diagnostics_for_dim(name: str, dim: int, tensors: List[Tensor],
+                              options: TensorDiagnosticOptions):
+
+    for stats_type in options.stats_types():
+        # stats_type will be "mean-abs" or "pos-ratio".
+        sizes = [ x.shape[dim] for x in tensors ]
+        sizes_same = all([ x == sizes[0] for x in sizes ])
+        s = get_diagnostics_for_dim(dim, tensors,
+                                    options, sizes_same,
+                                    stats_type)
+
+        min_size = min(sizes)
+        max_size = max(sizes)
+        size_str = f"{min_size}" if sizes_same else f"{min_size}..{max_size}"
+        # stats_type will be "mean-abs" or "pos-ratio".
+        print(f"module={name}, dim={dim}, size={size_str}, {stats_type} {s}")
+
+
+class TensorDiagnostic(object):
+    """
+    This class is not directly used by the user, it is responsible for collecting
+    diagnostics for a single parameter tensor of a torch.Module.
+    """
+    def __init__(self,
+                 opts: TensorDiagnosticOptions,
+                 name: str):
+        self.name = name
+        self.opts = opts
+        self.saved_tensors = []
+
+    def accumulate(self, x):
+        if isinstance(x, Tuple):
+            x = x[0]
+        if not isinstance(x, Tensor):
+            return
+        if x.device == torch.device('cpu'):
+            x = x.detach().clone()
+        else:
+            x = x.detach().to('cpu', non_blocking=True)
+        self.saved_tensors.append(x)
+        l = len(self.saved_tensors)
+        if l & (l - 1) == 0: # power of 2..
+            self._limit_memory()
+
+    def _limit_memory(self):
+        if len(self.saved_tensors) > 1024:
+            self.saved_tensors = self.saved_tensors[-1024:]
+            return
+
+        tot_mem = 0.0
+        for i in reversed(range(len(self.saved_tensors))):
+            tot_mem += self.saved_tensors[i].numel() * self.saved_tensors[i].element_size()
+            if tot_mem > self.opts.memory_limit:
+                self.saved_tensors = self.saved_tensors[i:]
+                return
+
+    def print_diagnostics(self):
+        if len(self.saved_tensors) == 0:
+            print("{name}: no stats".format(name=self.name))
+            return
+        if self.saved_tensors[0].ndim == 0:
+            # ensure there is at least one dim.
+            self.saved_tensors = [ x.unsqueeze(0) for x in self.saved_tensors ]
+
+        ndim = self.saved_tensors[0].ndim
+        for dim in range(ndim):
+            print_diagnostics_for_dim(self.name, dim,
+                                      self.saved_tensors,
+                                      self.opts)
+
+
+class ModelDiagnostic(object):
+    def __init__(self, opts: TensorDiagnosticOptions):
+        self.diagnostics = dict()
+        self.opts = opts
+
+    def __getitem__(self, name: str):
+        if name not in self.diagnostics:
+            self.diagnostics[name] = TensorDiagnostic(self.opts, name)
+        return self.diagnostics[name]
+
+    def print_diagnostics(self):
+        for k in sorted(self.diagnostics.keys()):
+            self.diagnostics[k].print_diagnostics()
+
+
+
+def attach_diagnostics(model: nn.Module,
+                       opts: TensorDiagnosticOptions) -> ModelDiagnostic:
+    ans = ModelDiagnostic(opts)
+    for name, module in model.named_modules():
+        if name == '':
+            name = "<top-level>"
+        forward_diagnostic = TensorDiagnostic(opts, name + ".output")
+        backward_diagnostic = TensorDiagnostic(opts, name + ".grad")
+
+
+        # setting model_diagnostic=ans and n=name below, instead of trying to capture the variables,
+        # ensures that we use the current values.  (matters for name, since
+        # the variable gets overwritten).  these closures don't really capture
+        # by value, only by "the final value the variable got in the function" :-(
+        def forward_hook(_module, _input, _output,
+                         _model_diagnostic=ans, _name=name):
+            if isinstance(_output, Tensor):
+                _model_diagnostic[f"{_name}.output"].accumulate(_output)
+            elif isinstance(_output, tuple):
+                for i, o in enumerate(_output):
+                    _model_diagnostic[f"{_name}.output[{i}]"].accumulate(o)
+
+        def backward_hook(_module, _input, _output,
+                          _model_diagnostic=ans, _name=name):
+            if isinstance(_output, Tensor):
+                _model_diagnostic[f"{_name}.grad"].accumulate(_output)
+            elif isinstance(_output, tuple):
+                for i, o in enumerate(_output):
+                    _model_diagnostic[f"{_name}.grad[{i}]"].accumulate(o)
+
+        module.register_forward_hook(forward_hook)
+        module.register_backward_hook(backward_hook)
+
+    for name, parameter in model.named_parameters():
+
+        def param_backward_hook(grad,
+                                _parameter=parameter,
+                                _model_diagnostic=ans,
+                                _name=name):
+            _model_diagnostic[f"{_name}.param_value"].accumulate(_parameter)
+            _model_diagnostic[f"{_name}.param_grad"].accumulate(grad)
+
+        parameter.register_hook(param_backward_hook)
+    return ans
+
+
+
+def _test_tensor_diagnostic():
+    opts = TensorDiagnosticOptions(2**20, True)
+
+    diagnostic = TensorDiagnostic(opts, "foo")
+
+    for _ in range(10):
+        diagnostic.accumulate(torch.randn(50, 100) * 10.0)
+
+    diagnostic.print_diagnostics()
+
+    model = nn.Sequential(nn.Linear(100, 50), nn.Linear(50, 80))
+
+    diagnostic = attach_diagnostics(model, opts)
+    for _ in range(10):
+        T = random.randint(200, 300)
+        x = torch.randn(T, 100)
+        y = model(x)
+        y.sum().backward()
+
+    diagnostic.print_diagnostics()
+
+
+
+# Run the smoke test when this file is executed as a script.
+if __name__ == '__main__':
+    _test_tensor_diagnostic()
+
+
+def _test_func():
+    """
+    Illustrates late binding in closures: returns a list of 10 functions
+    that all close over the same local variable `x`.
+
+    `x` is rebound to a fresh list on every iteration, and each `func`
+    looks `x` up only when it is called, so after the loop every returned
+    function yields the list from the final iteration, i.e. [9].
+    """
+    ans = []
+    for i in range(10):
+        x = list()
+        x.append(i)
+        # func does not copy x; it refers to the variable itself.
+        def func():
+            return x
+        ans.append(func)
+    return ans
--- a/egs/librispeech/ASR/transducer_stateless/train.py
+++ b/egs/librispeech/ASR/transducer_stateless/train.py
@ -34,6 +34,7 @@ export CUDA_VISIBLE_DEVICES="0,1,2,3"

 import argparse
 import logging
+import diagnostics # ./diagnostics.py
 from pathlib import Path
 from shutil import copyfile
 from typing import Optional, Tuple
@ -109,7 +110,7 @@ def get_parser():
    parser.add_argument(
        "--exp-dir",
        type=str,
-        default="transducer_stateless/exp-100h-specaugmod_p0.9_0.15_fix",
+        default="transducer_stateless/specaugmod_baseline",
        help="""The experiment dir.
        It specifies the directory where all training related
        files, e.g., checkpoints, log, etc, are saved
@ -138,6 +139,13 @@ def get_parser():
        "2 means tri-gram",
    )

+    parser.add_argument(
+        "--print-diagnostics",
+        type=str2bool,
+        default=False,
+        help="Accumulate stats on activations, print them and exit.",
+    )
+
    return parser


@ -487,6 +495,9 @@ def train_one_epoch(
        loss.backward()
        clip_grad_norm_(model.parameters(), 5.0, 2.0)
        optimizer.step()
+        if params.print_diagnostics and batch_idx == 5:
+            return
+

        if batch_idx % params.log_interval == 0:
            logging.info(
@ -494,9 +505,6 @@ def train_one_epoch(
                f"batch {batch_idx}, loss[{loss_info}], "
                f"tot_loss[{tot_loss}], batch size: {batch_size}"
            )
-
-        if batch_idx % params.log_interval == 0:
-
            if tb_writer is not None:
                loss_info.write_summary(
                    tb_writer, "train/current_", params.batch_idx_train
@ -599,6 +607,11 @@ def run(rank, world_size, args):

    librispeech = LibriSpeechAsrDataModule(args)

+    if params.print_diagnostics:
+        opts = diagnostics.TensorDiagnosticOptions(2**22)  # allow 4 megabytes per sub-module
+        diagnostic = diagnostics.attach_diagnostics(model, opts)
+
+
    train_cuts = librispeech.train_clean_100_cuts()
    if params.full_libri:
        train_cuts += librispeech.train_clean_360_cuts()
@ -626,13 +639,14 @@ def run(rank, world_size, args):
    valid_cuts += librispeech.dev_other_cuts()
    valid_dl = librispeech.valid_dataloaders(valid_cuts)

-    scan_pessimistic_batches_for_oom(
-        model=model,
-        train_dl=train_dl,
-        optimizer=optimizer,
-        sp=sp,
-        params=params,
-    )
+    if not params.print_diagnostics:
+        scan_pessimistic_batches_for_oom(
+            model=model,
+            train_dl=train_dl,
+            optimizer=optimizer,
+            sp=sp,
+            params=params,
+        )

    for epoch in range(params.start_epoch, params.num_epochs):
        train_dl.sampler.set_epoch(epoch)
@ -660,6 +674,10 @@ def run(rank, world_size, args):
            world_size=world_size,
        )

+        if params.print_diagnostics:
+            diagnostic.print_diagnostics()
+            break
+
        save_checkpoint(
            params=params,
            model=model,