From b36f3b5c52ee54d708166662df7b1d13300ada16 Mon Sep 17 00:00:00 2001
From: jinzr <60612200+JinZr@users.noreply.github.com>
Date: Sat, 2 Sep 2023 12:42:18 +0800
Subject: [PATCH] Fix formatting issues in egs/multi_zh-hans/ASR

---
 egs/multi_zh-hans/ASR/local/prepare_char.py   |   1 -
 egs/multi_zh-hans/ASR/zipformer/decoder.py    |  30 +-
 egs/multi_zh-hans/ASR/zipformer/joiner.py     |   9 +-
 egs/multi_zh-hans/ASR/zipformer/optim.py      |  10 +-
 egs/multi_zh-hans/ASR/zipformer/profile.py    |  12 +-
 egs/multi_zh-hans/ASR/zipformer/scaling.py    | 715 +++++++-------
 .../ASR/zipformer/streaming_decode.py         |   6 +-
 .../ASR/zipformer/subsampling.py              |  14 +-
 egs/multi_zh-hans/ASR/zipformer/zipformer.py  | 910 +++++++++++-------
 9 files changed, 972 insertions(+), 735 deletions(-)

diff --git a/egs/multi_zh-hans/ASR/local/prepare_char.py b/egs/multi_zh-hans/ASR/local/prepare_char.py
index 4eed4f596..d8622842f 100755
--- a/egs/multi_zh-hans/ASR/local/prepare_char.py
+++ b/egs/multi_zh-hans/ASR/local/prepare_char.py
@@ -240,4 +240,3 @@ def main():
 
 if __name__ == "__main__":
     main()
-
diff --git a/egs/multi_zh-hans/ASR/zipformer/decoder.py b/egs/multi_zh-hans/ASR/zipformer/decoder.py
index e8db988f6..e77e54118 100644
--- a/egs/multi_zh-hans/ASR/zipformer/decoder.py
+++ b/egs/multi_zh-hans/ASR/zipformer/decoder.py
@@ -61,10 +61,15 @@ class Decoder(nn.Module):
         )
         # the balancers are to avoid any drift in the magnitude of the
         # embeddings, which would interact badly with parameter averaging.
-        self.balancer = Balancer(decoder_dim, channel_dim=-1,
-                                 min_positive=0.0, max_positive=1.0,
-                                 min_abs=0.5, max_abs=1.0,
-                                 prob=0.05)
+        self.balancer = Balancer(
+            decoder_dim,
+            channel_dim=-1,
+            min_positive=0.0,
+            max_positive=1.0,
+            min_abs=0.5,
+            max_abs=1.0,
+            prob=0.05,
+        )
 
         self.blank_id = blank_id
 
@@ -81,10 +86,15 @@ class Decoder(nn.Module):
                 groups=decoder_dim // 4,  # group size == 4
                 bias=False,
             )
-            self.balancer2 = Balancer(decoder_dim, channel_dim=-1,
-                                      min_positive=0.0, max_positive=1.0,
-                                      min_abs=0.5, max_abs=1.0,
-                                      prob=0.05)
+            self.balancer2 = Balancer(
+                decoder_dim,
+                channel_dim=-1,
+                min_positive=0.0,
+                max_positive=1.0,
+                min_abs=0.5,
+                max_abs=1.0,
+                prob=0.05,
+            )
 
     def forward(self, y: torch.Tensor, need_pad: bool = True) -> torch.Tensor:
         """
@@ -107,9 +117,7 @@ class Decoder(nn.Module):
         if self.context_size > 1:
             embedding_out = embedding_out.permute(0, 2, 1)
             if need_pad is True:
-                embedding_out = F.pad(
-                    embedding_out, pad=(self.context_size - 1, 0)
-                )
+                embedding_out = F.pad(embedding_out, pad=(self.context_size - 1, 0))
             else:
                 # During inference time, there is no need to do extra padding
                 # as we only need one output
diff --git a/egs/multi_zh-hans/ASR/zipformer/joiner.py b/egs/multi_zh-hans/ASR/zipformer/joiner.py
index f03cc930e..dfb0a0057 100644
--- a/egs/multi_zh-hans/ASR/zipformer/joiner.py
+++ b/egs/multi_zh-hans/ASR/zipformer/joiner.py
@@ -52,12 +52,13 @@ class Joiner(nn.Module):
         Returns:
           Return a tensor of shape (N, T, s_range, C).
         """
-        assert encoder_out.ndim == decoder_out.ndim, (encoder_out.shape, decoder_out.shape)
+        assert encoder_out.ndim == decoder_out.ndim, (
+            encoder_out.shape,
+            decoder_out.shape,
+        )
 
         if project_input:
-            logit = self.encoder_proj(encoder_out) + self.decoder_proj(
-                decoder_out
-            )
+            logit = self.encoder_proj(encoder_out) + self.decoder_proj(decoder_out)
         else:
             logit = encoder_out + decoder_out
 
diff --git a/egs/multi_zh-hans/ASR/zipformer/optim.py b/egs/multi_zh-hans/ASR/zipformer/optim.py
index abfb2092c..3c32d407e 100644
--- a/egs/multi_zh-hans/ASR/zipformer/optim.py
+++ b/egs/multi_zh-hans/ASR/zipformer/optim.py
@@ -299,8 +299,8 @@ class ScaledAdam(BatchedOptimizer):
             # the input is groups of parameter or named parameter.
             for cur_group in iterable_or_groups:
                 assert "named_params" in cur_group
-                name_list = [ x[0] for x in cur_group["named_params"] ]
-                p_list = [ x[1] for x in cur_group["named_params"] ]
+                name_list = [x[0] for x in cur_group["named_params"]]
+                p_list = [x[1] for x in cur_group["named_params"]]
                 del cur_group["named_params"]
                 cur_group["params"] = p_list
                 param_groups.append(cur_group)
@@ -667,8 +667,7 @@ class ScaledAdam(BatchedOptimizer):
         # We have to look at the trained model for parameters at or around the
         # param_max_rms, because sometimes they can indicate a problem with the
         # topology or settings.
-        scale_step = torch.minimum(scale_step,
-                                   (param_max_rms - param_rms) / param_rms)
+        scale_step = torch.minimum(scale_step, (param_max_rms - param_rms) / param_rms)
 
         delta = state["delta"]
         # the factor of (1-beta1) relates to momentum.
@@ -879,7 +878,8 @@ class Eden(LRScheduler):
         warmup_factor = (
             1.0
             if self.batch >= self.warmup_batches
-            else self.warmup_start + (1.0 - self.warmup_start) * (self.batch / self.warmup_batches)
+            else self.warmup_start
+            + (1.0 - self.warmup_start) * (self.batch / self.warmup_batches)
             # else 0.5 + 0.5 * (self.batch / self.warmup_batches)
         )
 
diff --git a/egs/multi_zh-hans/ASR/zipformer/profile.py b/egs/multi_zh-hans/ASR/zipformer/profile.py
index b460b5338..57f44a90a 100755
--- a/egs/multi_zh-hans/ASR/zipformer/profile.py
+++ b/egs/multi_zh-hans/ASR/zipformer/profile.py
@@ -100,17 +100,13 @@ class Model(nn.Module):
         self.encoder_embed = encoder_embed
         self.encoder_proj = encoder_proj
 
-    def forward(
-        self, feature: Tensor, feature_lens: Tensor
-    ) -> Tuple[Tensor, Tensor]:
+    def forward(self, feature: Tensor, feature_lens: Tensor) -> Tuple[Tensor, Tensor]:
         x, x_lens = self.encoder_embed(feature, feature_lens)
 
         src_key_padding_mask = make_pad_mask(x_lens)
         x = x.permute(1, 0, 2)  # (N, T, C) -> (T, N, C)
 
-        encoder_out, encoder_out_lens = self.encoder(
-            x, x_lens, src_key_padding_mask
-        )
+        encoder_out, encoder_out_lens = self.encoder(x, x_lens, src_key_padding_mask)
 
         encoder_out = encoder_out.permute(1, 0, 2)  # (N, T, C) -> (T, N, C)
         logits = self.encoder_proj(encoder_out)
@@ -168,9 +164,7 @@ def main():
 
 
 if __name__ == "__main__":
-    formatter = (
-        "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
-    )
+    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
     logging.basicConfig(format=formatter, level=logging.INFO)
 
     main()
diff --git a/egs/multi_zh-hans/ASR/zipformer/scaling.py b/egs/multi_zh-hans/ASR/zipformer/scaling.py
index 4ee7b7826..d5013be08 100644
--- a/egs/multi_zh-hans/ASR/zipformer/scaling.py
+++ b/egs/multi_zh-hans/ASR/zipformer/scaling.py
@@ -25,6 +25,7 @@ import math
 import torch.nn as nn
 from torch import Tensor
 
+
 def logaddexp_onnx(x: Tensor, y: Tensor) -> Tensor:
     max_value = torch.max(x, y)
     diff = torch.abs(x - y)
@@ -55,28 +56,34 @@ def logaddexp(x: Tensor, y: Tensor) -> Tensor:
         # for torch.jit.trace()
         return torch.logaddexp(x, y)
 
+
 class PiecewiseLinear(object):
     """
     Piecewise linear function, from float to float, specified as nonempty list of (x,y) pairs with
     the x values in order.  x values <[initial x] or >[final x] are map to [initial y], [final y]
     respectively.
     """
+
     def __init__(self, *args):
         assert len(args) >= 1, len(args)
         if len(args) == 1 and isinstance(args[0], PiecewiseLinear):
             self.pairs = list(args[0].pairs)
         else:
-            self.pairs = [ (float(x), float(y)) for x,y in args ]
-        for (x,y) in self.pairs:
+            self.pairs = [(float(x), float(y)) for x, y in args]
+        for (x, y) in self.pairs:
             assert isinstance(x, (float, int)), type(x)
             assert isinstance(y, (float, int)), type(y)
 
         for i in range(len(self.pairs) - 1):
-            assert self.pairs[i + 1][0] > self.pairs[i][0], (i, self.pairs[i], self.pairs[i + 1])
+            assert self.pairs[i + 1][0] > self.pairs[i][0], (
+                i,
+                self.pairs[i],
+                self.pairs[i + 1],
+            )
 
     def __str__(self):
         # e.g. 'PiecewiseLinear((0., 10.), (100., 0.))'
-        return f'PiecewiseLinear({str(self.pairs)[1:-1]})'
+        return f"PiecewiseLinear({str(self.pairs)[1:-1]})"
 
     def __call__(self, x):
         if x <= self.pairs[0][0]:
@@ -93,37 +100,36 @@ class PiecewiseLinear(object):
             assert False
 
     def __mul__(self, alpha):
-        return PiecewiseLinear(
-            * [(x, y * alpha) for x, y in self.pairs])
+        return PiecewiseLinear(*[(x, y * alpha) for x, y in self.pairs])
 
     def __add__(self, x):
         if isinstance(x, (float, int)):
-            return PiecewiseLinear(
-                * [(p[0], p[1] + x) for p in self.pairs])
+            return PiecewiseLinear(*[(p[0], p[1] + x) for p in self.pairs])
         s, x = self.get_common_basis(x)
         return PiecewiseLinear(
-            * [(sp[0], sp[1] + xp[1]) for sp, xp in zip(s.pairs, x.pairs)])
+            *[(sp[0], sp[1] + xp[1]) for sp, xp in zip(s.pairs, x.pairs)]
+        )
 
     def max(self, x):
         if isinstance(x, (float, int)):
-            x = PiecewiseLinear( (0, x) )
+            x = PiecewiseLinear((0, x))
         s, x = self.get_common_basis(x, include_crossings=True)
         return PiecewiseLinear(
-            * [(sp[0], max(sp[1], xp[1])) for sp, xp in zip(s.pairs, x.pairs)])
+            *[(sp[0], max(sp[1], xp[1])) for sp, xp in zip(s.pairs, x.pairs)]
+        )
 
     def min(self, x):
         if isinstance(x, float) or isinstance(x, int):
-            x = PiecewiseLinear( (0, x) )
+            x = PiecewiseLinear((0, x))
         s, x = self.get_common_basis(x, include_crossings=True)
         return PiecewiseLinear(
-            * [ (sp[0], min(sp[1], xp[1])) for sp, xp in zip(s.pairs, x.pairs)])
+            *[(sp[0], min(sp[1], xp[1])) for sp, xp in zip(s.pairs, x.pairs)]
+        )
 
     def __eq__(self, other):
         return self.pairs == other.pairs
 
-    def get_common_basis(self,
-                         p: 'PiecewiseLinear',
-                         include_crossings: bool = False):
+    def get_common_basis(self, p: "PiecewiseLinear", include_crossings: bool = False):
         """
         Returns (self_mod, p_mod) which are equivalent piecewise lienar
         functions to self and p, but with the same x values.
@@ -135,28 +141,30 @@ class PiecewiseLinear(object):
         assert isinstance(p, PiecewiseLinear), type(p)
 
         # get sorted x-values without repetition.
-        x_vals = sorted(set([ x for x, _ in self.pairs ] + [ x for x, _ in p.pairs ]))
-        y_vals1 = [ self(x) for x in x_vals ]
-        y_vals2 = [ p(x) for x in x_vals ]
+        x_vals = sorted(set([x for x, _ in self.pairs] + [x for x, _ in p.pairs]))
+        y_vals1 = [self(x) for x in x_vals]
+        y_vals2 = [p(x) for x in x_vals]
 
         if include_crossings:
             extra_x_vals = []
             for i in range(len(x_vals) - 1):
-                if (y_vals1[i] > y_vals2[i]) != (y_vals1[i+1] > y_vals2[i+1]):
+                if (y_vals1[i] > y_vals2[i]) != (y_vals1[i + 1] > y_vals2[i + 1]):
                     # if the two lines in this subsegment potentially cross each other..
                     diff_cur = abs(y_vals1[i] - y_vals2[i])
-                    diff_next = abs(y_vals1[i+1] - y_vals2[i+1])
+                    diff_next = abs(y_vals1[i + 1] - y_vals2[i + 1])
                     # `pos`, between 0 and 1, gives the relative x position,
                     # with 0 being x_vals[i] and 1 being x_vals[i+1].
                     pos = diff_cur / (diff_cur + diff_next)
-                    extra_x_val = x_vals[i] + pos * (x_vals[i+1] - x_vals[i])
+                    extra_x_val = x_vals[i] + pos * (x_vals[i + 1] - x_vals[i])
                     extra_x_vals.append(extra_x_val)
             if len(extra_x_vals) > 0:
                 x_vals = sorted(set(x_vals + extra_x_vals))
-        y_vals1 = [ self(x) for x in x_vals ]
-        y_vals2 = [ p(x) for x in x_vals ]
-        return ( PiecewiseLinear(* zip(x_vals, y_vals1)),
-                 PiecewiseLinear(* zip(x_vals, y_vals2)) )
+        y_vals1 = [self(x) for x in x_vals]
+        y_vals2 = [p(x) for x in x_vals]
+        return (
+            PiecewiseLinear(*zip(x_vals, y_vals1)),
+            PiecewiseLinear(*zip(x_vals, y_vals2)),
+        )
 
 
 class ScheduledFloat(torch.nn.Module):
@@ -176,9 +184,8 @@ class ScheduledFloat(torch.nn.Module):
     `default` is used when self.batch_count is not set or not in training mode or in
      torch.jit scripting mode.
     """
-    def __init__(self,
-                 *args,
-                 default: float = 0.0):
+
+    def __init__(self, *args, default: float = 0.0):
         super().__init__()
         # self.batch_count and self.name will be written to in the training loop.
         self.batch_count = None
@@ -187,47 +194,55 @@ class ScheduledFloat(torch.nn.Module):
         self.schedule = PiecewiseLinear(*args)
 
     def extra_repr(self) -> str:
-        return f'batch_count={self.batch_count}, schedule={str(self.schedule.pairs[1:-1])}'
+        return (
+            f"batch_count={self.batch_count}, schedule={str(self.schedule.pairs[1:-1])}"
+        )
 
     def __float__(self):
         batch_count = self.batch_count
-        if batch_count is None or not self.training or torch.jit.is_scripting() or torch.jit.is_tracing():
+        if (
+            batch_count is None
+            or not self.training
+            or torch.jit.is_scripting()
+            or torch.jit.is_tracing()
+        ):
             return float(self.default)
         else:
             ans = self.schedule(self.batch_count)
             if random.random() < 0.0002:
-                logging.info(f"ScheduledFloat: name={self.name}, batch_count={self.batch_count}, ans={ans}")
+                logging.info(
+                    f"ScheduledFloat: name={self.name}, batch_count={self.batch_count}, ans={ans}"
+                )
             return ans
 
     def __add__(self, x):
         if isinstance(x, float) or isinstance(x, int):
-            return ScheduledFloat(self.schedule + x,
-                                  default=self.default)
+            return ScheduledFloat(self.schedule + x, default=self.default)
         else:
-            return ScheduledFloat(self.schedule + x.schedule,
-                                  default=self.default+x.default)
+            return ScheduledFloat(
+                self.schedule + x.schedule, default=self.default + x.default
+            )
 
     def max(self, x):
         if isinstance(x, float) or isinstance(x, int):
-            return ScheduledFloat(self.schedule.max(x),
-                                  default=self.default)
+            return ScheduledFloat(self.schedule.max(x), default=self.default)
         else:
-            return ScheduledFloat(self.schedule.max(x.schedule),
-                                  default=max(self.default, x.default))
+            return ScheduledFloat(
+                self.schedule.max(x.schedule), default=max(self.default, x.default)
+            )
 
 
 FloatLike = Union[float, ScheduledFloat]
 
 
-def random_cast_to_half(x: Tensor,
-                        min_abs: float = 5.0e-06) -> Tensor:
+def random_cast_to_half(x: Tensor, min_abs: float = 5.0e-06) -> Tensor:
     """
     A randomized way of casting a floating point value to half precision.
     """
     if x.dtype == torch.float16:
         return x
     x_abs = x.abs()
-    is_too_small = (x_abs < min_abs)
+    is_too_small = x_abs < min_abs
     # for elements where is_too_small is true, random_val will contain +-min_abs with
     # probability (x.abs() / min_abs), and 0.0 otherwise.  [so this preserves expectations,
     # for those elements].
@@ -242,6 +257,7 @@ class CutoffEstimator:
 
       p is the proportion of items that should be above the cutoff.
     """
+
     def __init__(self, p: float):
         self.p = p
         # total count of items
@@ -255,7 +271,7 @@ class CutoffEstimator:
         """
         Returns true if x is above the cutoff.
         """
-        ans = (x > self.cutoff)
+        ans = x > self.cutoff
         self.count += 1
         if ans:
             self.count_above += 1
@@ -263,7 +279,7 @@ class CutoffEstimator:
         delta_p = cur_p - self.p
         if (delta_p > 0) == ans:
             q = abs(delta_p)
-            self.cutoff = x * q + self.cutoff * (1-q)
+            self.cutoff = x * q + self.cutoff * (1 - q)
         return ans
 
 
@@ -272,6 +288,7 @@ class SoftmaxFunction(torch.autograd.Function):
     Tries to handle half-precision derivatives in a randomized way that should
     be more accurate for training than the default behavior.
     """
+
     @staticmethod
     def forward(ctx, x: Tensor, dim: int):
         ans = x.softmax(dim=dim)
@@ -287,7 +304,7 @@ class SoftmaxFunction(torch.autograd.Function):
 
     @staticmethod
     def backward(ctx, ans_grad: Tensor):
-        ans, = ctx.saved_tensors
+        (ans,) = ctx.saved_tensors
         with torch.cuda.amp.autocast(enabled=False):
             ans_grad = ans_grad.to(torch.float32)
             ans = ans.to(torch.float32)
@@ -306,17 +323,16 @@ def softmax(x: Tensor, dim: int):
 class MaxEigLimiterFunction(torch.autograd.Function):
     @staticmethod
     def forward(
-            ctx,
-            x: Tensor,
-            coeffs: Tensor,
-            direction: Tensor,
-            channel_dim: int,
-            grad_scale: float) -> Tensor:
+        ctx,
+        x: Tensor,
+        coeffs: Tensor,
+        direction: Tensor,
+        channel_dim: int,
+        grad_scale: float,
+    ) -> Tensor:
         ctx.channel_dim = channel_dim
         ctx.grad_scale = grad_scale
-        ctx.save_for_backward(x.detach(),
-                              coeffs.detach(),
-                              direction.detach())
+        ctx.save_for_backward(x.detach(), coeffs.detach(), direction.detach())
         return x
 
     @staticmethod
@@ -328,15 +344,20 @@ class MaxEigLimiterFunction(torch.autograd.Function):
             x = x_orig.transpose(ctx.channel_dim, -1).reshape(-1, num_channels)
             new_direction.requires_grad = False
             x = x - x.mean(dim=0)
-            x_var = (x ** 2).mean()
+            x_var = (x**2).mean()
             x_residual = x - coeffs * new_direction
-            x_residual_var = (x_residual ** 2).mean()
+            x_residual_var = (x_residual**2).mean()
             # `variance_proportion` is the proportion of the variance accounted for
             # by the top eigen-direction.  This is to be minimized.
             variance_proportion = (x_var - x_residual_var) / (x_var + 1.0e-20)
             variance_proportion.backward()
         x_orig_grad = x_orig.grad
-        x_extra_grad = x_orig.grad * ctx.grad_scale * x_grad.norm() / (x_orig_grad.norm() + 1.0e-20)
+        x_extra_grad = (
+            x_orig.grad
+            * ctx.grad_scale
+            * x_grad.norm()
+            / (x_orig_grad.norm() + 1.0e-20)
+        )
         return x_grad + x_extra_grad.detach(), None, None, None, None
 
 
@@ -348,8 +369,14 @@ class BiasNormFunction(torch.autograd.Function):
     # it can just store the returned value (chances are, this will also be needed for
     # some other reason, related to the next operation, so we can save memory).
     @staticmethod
-    def forward(ctx, x: Tensor, bias: Tensor, log_scale: Tensor, channel_dim: int,
-                store_output_for_backprop: bool) -> Tensor:
+    def forward(
+        ctx,
+        x: Tensor,
+        bias: Tensor,
+        log_scale: Tensor,
+        channel_dim: int,
+        store_output_for_backprop: bool,
+    ) -> Tensor:
         assert bias.ndim == 1
         if channel_dim < 0:
             channel_dim = channel_dim + x.ndim
@@ -357,10 +384,16 @@ class BiasNormFunction(torch.autograd.Function):
         ctx.channel_dim = channel_dim
         for _ in range(channel_dim + 1, x.ndim):
             bias = bias.unsqueeze(-1)
-        scales = (torch.mean((x - bias) ** 2, dim=channel_dim, keepdim=True) ** -0.5) * log_scale.exp()
+        scales = (
+            torch.mean((x - bias) ** 2, dim=channel_dim, keepdim=True) ** -0.5
+        ) * log_scale.exp()
         ans = x * scales
-        ctx.save_for_backward(ans.detach() if store_output_for_backprop else x,
-                              scales.detach(), bias.detach(), log_scale.detach())
+        ctx.save_for_backward(
+            ans.detach() if store_output_for_backprop else x,
+            scales.detach(),
+            bias.detach(),
+            log_scale.detach(),
+        )
         return ans
 
     @staticmethod
@@ -376,7 +409,9 @@ class BiasNormFunction(torch.autograd.Function):
         log_scale.requires_grad = True
         with torch.enable_grad():
             # recompute scales from x, bias and log_scale.
-            scales = (torch.mean((x - bias) ** 2, dim=ctx.channel_dim, keepdim=True) ** -0.5) * log_scale.exp()
+            scales = (
+                torch.mean((x - bias) ** 2, dim=ctx.channel_dim, keepdim=True) ** -0.5
+            ) * log_scale.exp()
             ans = x * scales
             ans.backward(gradient=ans_grad)
         return x.grad, bias.grad.flatten(), log_scale.grad, None, None
@@ -412,14 +447,15 @@ class BiasNorm(torch.nn.Module):
          than the input of this module to be required to be stored for the
          backprop.
     """
+
     def __init__(
-            self,
-            num_channels: int,
-            channel_dim: int = -1,  # CAUTION: see documentation.
-            log_scale: float = 1.0,
-            log_scale_min: float = -1.5,
-            log_scale_max: float = 1.5,
-            store_output_for_backprop: bool = False
+        self,
+        num_channels: int,
+        channel_dim: int = -1,  # CAUTION: see documentation.
+        log_scale: float = 1.0,
+        log_scale_min: float = -1.5,
+        log_scale_max: float = 1.5,
+        store_output_for_backprop: bool = False,
     ) -> None:
         super(BiasNorm, self).__init__()
         self.num_channels = num_channels
@@ -442,23 +478,24 @@ class BiasNorm(torch.nn.Module):
             bias = self.bias
             for _ in range(channel_dim + 1, x.ndim):
                 bias = bias.unsqueeze(-1)
-            scales = ((torch.mean((x - bias) ** 2, dim=channel_dim, keepdim=True) ** -0.5) *
-                      self.log_scale.exp())
+            scales = (
+                torch.mean((x - bias) ** 2, dim=channel_dim, keepdim=True) ** -0.5
+            ) * self.log_scale.exp()
             return x * scales
 
-        log_scale = limit_param_value(self.log_scale,
-                                      min=float(self.log_scale_min),
-                                      max=float(self.log_scale_max),
-                                      training=self.training)
+        log_scale = limit_param_value(
+            self.log_scale,
+            min=float(self.log_scale_min),
+            max=float(self.log_scale_max),
+            training=self.training,
+        )
 
-        return BiasNormFunction.apply(x, self.bias, log_scale,
-                                      self.channel_dim,
-                                      self.store_output_for_backprop)
+        return BiasNormFunction.apply(
+            x, self.bias, log_scale, self.channel_dim, self.store_output_for_backprop
+        )
 
 
-def ScaledLinear(*args,
-                 initial_scale: float = 1.0,
-                 **kwargs) -> nn.Linear:
+def ScaledLinear(*args, initial_scale: float = 1.0, **kwargs) -> nn.Linear:
     """
     Behaves like a constructor of a modified version of nn.Linear
     that gives an easy way to set the default initial parameter scale.
@@ -477,15 +514,11 @@ def ScaledLinear(*args,
     with torch.no_grad():
         ans.weight[:] *= initial_scale
         if ans.bias is not None:
-            torch.nn.init.uniform_(ans.bias,
-                                   -0.1 * initial_scale,
-                                   0.1 * initial_scale)
+            torch.nn.init.uniform_(ans.bias, -0.1 * initial_scale, 0.1 * initial_scale)
     return ans
 
 
-def ScaledConv1d(*args,
-                 initial_scale: float = 1.0,
-                 **kwargs) -> nn.Conv1d:
+def ScaledConv1d(*args, initial_scale: float = 1.0, **kwargs) -> nn.Conv1d:
     """
     Behaves like a constructor of a modified version of nn.Conv1d
     that gives an easy way to set the default initial parameter scale.
@@ -504,15 +537,11 @@ def ScaledConv1d(*args,
     with torch.no_grad():
         ans.weight[:] *= initial_scale
         if ans.bias is not None:
-            torch.nn.init.uniform_(ans.bias,
-                                   -0.1 * initial_scale,
-                                   0.1 * initial_scale)
+            torch.nn.init.uniform_(ans.bias, -0.1 * initial_scale, 0.1 * initial_scale)
     return ans
 
 
-def ScaledConv2d(*args,
-                 initial_scale: float = 1.0,
-                 **kwargs) -> nn.Conv2d:
+def ScaledConv2d(*args, initial_scale: float = 1.0, **kwargs) -> nn.Conv2d:
     """
     Behaves like a constructor of a modified version of nn.Conv2d
     that gives an easy way to set the default initial parameter scale.
@@ -532,9 +561,7 @@ def ScaledConv2d(*args,
     with torch.no_grad():
         ans.weight[:] *= initial_scale
         if ans.bias is not None:
-            torch.nn.init.uniform_(ans.bias,
-                                   -0.1 * initial_scale,
-                                   0.1 * initial_scale)
+            torch.nn.init.uniform_(ans.bias, -0.1 * initial_scale, 0.1 * initial_scale)
     return ans
 
 
@@ -562,29 +589,36 @@ class ChunkCausalDepthwiseConv1d(torch.nn.Module):
            Another option, if you want to do something like this, is
            to re-initialize the parameters.
     """
-    def __init__(self,
-                 channels: int,
-                 kernel_size: int,
-                 initial_scale: float = 1.0,
-                 bias: bool = True):
+
+    def __init__(
+        self,
+        channels: int,
+        kernel_size: int,
+        initial_scale: float = 1.0,
+        bias: bool = True,
+    ):
         super().__init__()
         assert kernel_size % 2 == 1
 
         half_kernel_size = (kernel_size + 1) // 2
         # will pad manually, on one side.
-        self.causal_conv = nn.Conv1d(in_channels=channels,
-                                     out_channels=channels,
-                                     groups=channels,
-                                     kernel_size=half_kernel_size,
-                                     padding=0,
-                                     bias=True)
+        self.causal_conv = nn.Conv1d(
+            in_channels=channels,
+            out_channels=channels,
+            groups=channels,
+            kernel_size=half_kernel_size,
+            padding=0,
+            bias=True,
+        )
 
-        self.chunkwise_conv = nn.Conv1d(in_channels=channels,
-                                        out_channels=channels,
-                                        groups=channels,
-                                        kernel_size=kernel_size,
-                                        padding=kernel_size // 2,
-                                        bias=bias)
+        self.chunkwise_conv = nn.Conv1d(
+            in_channels=channels,
+            out_channels=channels,
+            groups=channels,
+            kernel_size=kernel_size,
+            padding=kernel_size // 2,
+            bias=bias,
+        )
 
         # first row is correction factors added to the scale near the left edge of the chunk,
         # second row is correction factors added to the scale near the right edge of the chunk,
@@ -596,17 +630,15 @@ class ChunkCausalDepthwiseConv1d(torch.nn.Module):
             self.causal_conv.weight[:] *= initial_scale
             self.chunkwise_conv.weight[:] *= initial_scale
             if bias:
-                torch.nn.init.uniform_(self.causal_conv.bias,
-                                       -0.1 * initial_scale,
-                                       0.1 * initial_scale)
+                torch.nn.init.uniform_(
+                    self.causal_conv.bias, -0.1 * initial_scale, 0.1 * initial_scale
+                )
 
-    def forward(self,
-                x: Tensor,
-                chunk_size: int = -1) -> Tensor:
+    def forward(self, x: Tensor, chunk_size: int = -1) -> Tensor:
         """
-        Forward function.  Args:
-          x: a Tensor of shape (batch_size, channels, seq_len)
-   chunk_size: the chunk size, in frames; does not have to divide seq_len exactly.
+        Forward function.  Args:
+          x: a Tensor of shape (batch_size, channels, seq_len)
+          chunk_size: the chunk size in frames; does not have to divide seq_len exactly.
         """
         (batch_size, num_channels, seq_len) = x.shape
 
@@ -622,28 +654,32 @@ class ChunkCausalDepthwiseConv1d(torch.nn.Module):
 
         x = torch.nn.functional.pad(x, (left_pad, right_pad))
 
-        x_causal = self.causal_conv(x[..., :left_pad + seq_len])
+        x_causal = self.causal_conv(x[..., : left_pad + seq_len])
         assert x_causal.shape == (batch_size, num_channels, seq_len)
 
         x_chunk = x[..., left_pad:]
         num_chunks = x_chunk.shape[2] // chunk_size
         x_chunk = x_chunk.reshape(batch_size, num_channels, num_chunks, chunk_size)
-        x_chunk = x_chunk.permute(0, 2, 1, 3).reshape(batch_size * num_chunks,
-                                                      num_channels, chunk_size)
+        x_chunk = x_chunk.permute(0, 2, 1, 3).reshape(
+            batch_size * num_chunks, num_channels, chunk_size
+        )
         x_chunk = self.chunkwise_conv(x_chunk)  # does not change shape
 
         chunk_scale = self._get_chunk_scale(chunk_size)
 
         x_chunk = x_chunk * chunk_scale
-        x_chunk = x_chunk.reshape(batch_size, num_chunks,
-                                  num_channels, chunk_size).permute(0, 2, 1, 3)
-        x_chunk = x_chunk.reshape(batch_size, num_channels, num_chunks * chunk_size)[..., :seq_len]
+        x_chunk = x_chunk.reshape(
+            batch_size, num_chunks, num_channels, chunk_size
+        ).permute(0, 2, 1, 3)
+        x_chunk = x_chunk.reshape(batch_size, num_channels, num_chunks * chunk_size)[
+            ..., :seq_len
+        ]
 
         return x_chunk + x_causal
 
     def _get_chunk_scale(self, chunk_size: int):
         """Returns tensor of shape (num_channels, chunk_size) that will be used to
-           scale the output of self.chunkwise_conv."""
+        scale the output of self.chunkwise_conv."""
         left_edge = self.chunkwise_conv_scale[0]
         right_edge = self.chunkwise_conv_scale[1]
         if chunk_size < self.kernel_size:
@@ -652,9 +688,9 @@ class ChunkCausalDepthwiseConv1d(torch.nn.Module):
         else:
             t = chunk_size - self.kernel_size
             channels = left_edge.shape[0]
-            pad = torch.zeros(channels, t,
-                              device=left_edge.device,
-                              dtype=left_edge.dtype)
+            pad = torch.zeros(
+                channels, t, device=left_edge.device, dtype=left_edge.dtype
+            )
             left_edge = torch.cat((left_edge, pad), dim=-1)
             right_edge = torch.cat((pad, right_edge), dim=-1)
         return 1.0 + (left_edge + right_edge)
@@ -698,14 +734,14 @@ class ChunkCausalDepthwiseConv1d(torch.nn.Module):
 class BalancerFunction(torch.autograd.Function):
     @staticmethod
     def forward(
-            ctx,
-            x: Tensor,
-            min_mean: float,
-            max_mean: float,
-            min_rms: float,
-            max_rms: float,
-            grad_scale: float,
-            channel_dim: int,
+        ctx,
+        x: Tensor,
+        min_mean: float,
+        max_mean: float,
+        min_rms: float,
+        max_rms: float,
+        grad_scale: float,
+        channel_dim: int,
     ) -> Tensor:
         if channel_dim < 0:
             channel_dim += x.ndim
@@ -715,10 +751,8 @@ class BalancerFunction(torch.autograd.Function):
         return x
 
     @staticmethod
-    def backward(
-        ctx, x_grad: Tensor
-    ) -> Tuple[Tensor, None, None, None, None, None]:
-        x, = ctx.saved_tensors
+    def backward(ctx, x_grad: Tensor) -> Tuple[Tensor, None, None, None, None, None]:
+        (x,) = ctx.saved_tensors
         (min_mean, max_mean, min_rms, max_rms, grad_scale, channel_dim) = ctx.config
 
         try:
@@ -727,8 +761,8 @@ class BalancerFunction(torch.autograd.Function):
                     x = x.to(torch.float32)
                     x = x.detach()
                     x.requires_grad = True
-                    mean_dims = [ i for i in range(x.ndim) if i != channel_dim ]
-                    uncentered_var = (x ** 2).mean(dim=mean_dims, keepdim=True)
+                    mean_dims = [i for i in range(x.ndim) if i != channel_dim]
+                    uncentered_var = (x**2).mean(dim=mean_dims, keepdim=True)
                     mean = x.mean(dim=mean_dims, keepdim=True)
                     stddev = (uncentered_var - (mean * mean)).clamp(min=1.0e-20).sqrt()
                     rms = uncentered_var.clamp(min=1.0e-20).sqrt()
@@ -742,11 +776,16 @@ class BalancerFunction(torch.autograd.Function):
                     rms_clamped = rms.clamp(min=min_rms, max=max_rms)
                     r_loss = (rms_clamped / rms).log().abs()
 
-                    loss = (m_loss + r_loss)
+                    loss = m_loss + r_loss
 
                     loss.backward(gradient=torch.ones_like(loss))
                     loss_grad = x.grad
-                    loss_grad_rms = (loss_grad ** 2).mean(dim=mean_dims, keepdim=True).sqrt().clamp(min=1.0e-20)
+                    loss_grad_rms = (
+                        (loss_grad**2)
+                        .mean(dim=mean_dims, keepdim=True)
+                        .sqrt()
+                        .clamp(min=1.0e-20)
+                    )
 
                     loss_grad = loss_grad * (grad_scale / loss_grad_rms)
 
@@ -757,7 +796,9 @@ class BalancerFunction(torch.autograd.Function):
                     x_grad_mod = x_grad_float + (x_grad_float.abs() * loss_grad)
                     x_grad = x_grad_mod.to(x_grad.dtype)
         except Exception as e:
-            logging.info(f"Caught exception in Balancer backward: {e}, size={list(x_grad.shape)}, will continue.")
+            logging.info(
+                f"Caught exception in Balancer backward: {e}, size={list(x_grad.shape)}, will continue."
+            )
 
         return x_grad, None, None, None, None, None, None
 
@@ -793,16 +834,17 @@ class Balancer(torch.nn.Module):
              on each forward().  This is done randomly to prevent all layers
              from doing it at the same time.
     """
+
     def __init__(
-            self,
-            num_channels: int,
-            channel_dim: int,
-            min_positive: FloatLike = 0.05,
-            max_positive: FloatLike = 0.95,
-            min_abs: FloatLike = 0.2,
-            max_abs: FloatLike = 100.0,
-            grad_scale: FloatLike = 0.04,
-            prob: Optional[FloatLike] = None,
+        self,
+        num_channels: int,
+        channel_dim: int,
+        min_positive: FloatLike = 0.05,
+        max_positive: FloatLike = 0.95,
+        min_abs: FloatLike = 0.2,
+        max_abs: FloatLike = 100.0,
+        grad_scale: FloatLike = 0.04,
+        prob: Optional[FloatLike] = None,
     ):
         super().__init__()
 
@@ -823,8 +865,11 @@ class Balancer(torch.nn.Module):
         self.grad_scale = grad_scale
 
     def forward(self, x: Tensor) -> Tensor:
-        if (torch.jit.is_scripting() or not x.requires_grad or
-           (x.is_cuda and self.mem_cutoff(torch.cuda.memory_allocated()))):
+        if (
+            torch.jit.is_scripting()
+            or not x.requires_grad
+            or (x.is_cuda and self.mem_cutoff(torch.cuda.memory_allocated()))
+        ):
             return _no_op(x)
 
         prob = float(self.prob)
@@ -842,7 +887,7 @@ class Balancer(torch.nn.Module):
                     eps = 1.0e-10
                     # eps is to prevent crashes if x is exactly 0 or 1.
                     # we'll just end up returning a fairly large value.
-                    return (math.log (1+x+eps) - math.log (1-x+eps)) / 2.
+                    return (math.log(1 + x + eps) - math.log(1 - x + eps)) / 2.0
 
                 def _approx_inverse_erf(x):
                     # 1 / (sqrt(pi) * ln(2)),
@@ -853,6 +898,7 @@ class Balancer(torch.nn.Module):
                     # and math.erf(0.0407316414078772) = 0.045935330944660666,
                     # which is pretty close to 0.05.
                     return 0.8139535143 * _atanh(x)
+
                 # first convert x from the range 0..1 to the range -1..1 which the error
                 # function returns
                 x = -1 + (2 * x)
@@ -873,8 +919,9 @@ class Balancer(torch.nn.Module):
             return _no_op(x)
 
 
-def penalize_abs_values_gt(x: Tensor, limit: float, penalty: float,
-                           name: str = None) -> Tensor:
+def penalize_abs_values_gt(
+    x: Tensor, limit: float, penalty: float, name: str = None
+) -> Tensor:
     """
     Returns x unmodified, but in backprop will put a penalty for the excess of
     the absolute values of elements of x over the limit "limit".  E.g. if
@@ -910,13 +957,12 @@ def _diag(x: Tensor):  # like .diag(), but works for tensors with 3 dims.
     else:
         (batch, dim, dim) = x.shape
         x = x.reshape(batch, dim * dim)
-        x = x[:, ::dim+1]
+        x = x[:, :: dim + 1]
         assert x.shape == (batch, dim)
         return x
 
 
-def _whitening_metric(x: Tensor,
-                      num_groups: int):
+def _whitening_metric(x: Tensor, num_groups: int):
     """
     Computes the "whitening metric", a value which will be 1.0 if all the eigenvalues of
     of the centered feature covariance are the same within each group's covariance matrix
@@ -946,25 +992,22 @@ def _whitening_metric(x: Tensor,
     # the following expression is what we'd get if we took the matrix product
     # of each covariance and measured the mean of its trace, i.e.
     # the same as _diag(torch.matmul(x_covar, x_covar)).mean().
-    x_covarsq_mean_diag = (x_covar ** 2).sum() / (num_groups * channels_per_group)
+    x_covarsq_mean_diag = (x_covar**2).sum() / (num_groups * channels_per_group)
     # this metric will be >= 1.0; the larger it is, the less 'white' the data was.
-    metric = x_covarsq_mean_diag / (x_covar_mean_diag ** 2 + 1.0e-20)
+    metric = x_covarsq_mean_diag / (x_covar_mean_diag**2 + 1.0e-20)
     return metric
 
 
 class WhiteningPenaltyFunction(torch.autograd.Function):
     @staticmethod
-    def forward(ctx,
-                x: Tensor,
-                module: nn.Module) -> Tensor:
+    def forward(ctx, x: Tensor, module: nn.Module) -> Tensor:
         ctx.save_for_backward(x)
         ctx.module = module
         return x
 
     @staticmethod
-    def backward(ctx,
-                 x_grad: Tensor):
-        x_orig, = ctx.saved_tensors
+    def backward(ctx, x_grad: Tensor):
+        (x_orig,) = ctx.saved_tensors
         w = ctx.module
 
         try:
@@ -976,8 +1019,10 @@ class WhiteningPenaltyFunction(torch.autograd.Function):
                     metric = _whitening_metric(x_detached, w.num_groups)
 
                     if random.random() < 0.005 or __name__ == "__main__":
-                        logging.info(f"Whitening: name={w.name}, num_groups={w.num_groups}, num_channels={x_orig.shape[-1]}, "
-                                     f"metric={metric.item():.2f} vs. limit={float(w.whitening_limit)}")
+                        logging.info(
+                            f"Whitening: name={w.name}, num_groups={w.num_groups}, num_channels={x_orig.shape[-1]}, "
+                            f"metric={metric.item():.2f} vs. limit={float(w.whitening_limit)}"
+                        )
 
                     if metric < float(w.whitening_limit):
                         w.prob = w.min_prob
@@ -986,22 +1031,27 @@ class WhiteningPenaltyFunction(torch.autograd.Function):
                         w.prob = w.max_prob
                         metric.backward()
                         penalty_grad = x_detached.grad
-                        scale = w.grad_scale * (x_grad.to(torch.float32).norm() /
-                                                (penalty_grad.norm() + 1.0e-20))
+                        scale = w.grad_scale * (
+                            x_grad.to(torch.float32).norm()
+                            / (penalty_grad.norm() + 1.0e-20)
+                        )
                         penalty_grad = penalty_grad * scale
                         return x_grad + penalty_grad.to(x_grad.dtype), None
         except Exception as e:
-            logging.info(f"Caught exception in Whiten backward: {e}, size={list(x_grad.shape)}, will continue.")
+            logging.info(
+                f"Caught exception in Whiten backward: {e}, size={list(x_grad.shape)}, will continue."
+            )
         return x_grad, None
 
 
 class Whiten(nn.Module):
     def __init__(
-            self,
-            num_groups: int,
-            whitening_limit: FloatLike,
-            prob: Union[float, Tuple[float,float]],
-            grad_scale: FloatLike):
+        self,
+        num_groups: int,
+        whitening_limit: FloatLike,
+        prob: Union[float, Tuple[float, float]],
+        grad_scale: FloatLike,
+    ):
         """
         Args:
           num_groups: the number of groups to divide the channel dim into before
@@ -1033,10 +1083,9 @@ class Whiten(nn.Module):
         (self.min_prob, self.max_prob) = prob
         assert 0 < self.min_prob <= self.max_prob <= 1
         self.prob = self.max_prob
-        self.name = None # will be set in training loop
+        self.name = None  # will be set in training loop
 
-    def forward(self,
-                x: Tensor) -> Tensor:
+    def forward(self, x: Tensor) -> Tensor:
         """
         In the forward pass, this function just returns the input unmodified.
         In the backward pass, it will modify the gradients to ensure that the
@@ -1071,9 +1120,11 @@ class WithLoss(torch.autograd.Function):
 
     @staticmethod
     def backward(ctx, ans_grad: Tensor):
-        return ans_grad, torch.ones(ctx.y_shape,
-                                    dtype=ans_grad.dtype,
-                                    device=ans_grad.device), None
+        return (
+            ans_grad,
+            torch.ones(ctx.y_shape, dtype=ans_grad.dtype, device=ans_grad.device),
+            None,
+        )
 
 
 def with_loss(x, y, name):
@@ -1118,20 +1169,21 @@ class LimitParamValue(torch.autograd.Function):
 
     @staticmethod
     def backward(ctx, x_grad: Tensor):
-        x, = ctx.saved_tensors
+        (x,) = ctx.saved_tensors
         # where x < ctx.min, ensure all grads are negative (this will tend to make
         # x more positive).
-        x_grad = x_grad * torch.where(torch.logical_and(x_grad > 0, x < ctx.min), -1.0, 1.0)
+        x_grad = x_grad * torch.where(
+            torch.logical_and(x_grad > 0, x < ctx.min), -1.0, 1.0
+        )
         # where x > ctx.max, ensure all grads are positive (this will tend to make
         # x more negative).
         x_grad *= torch.where(torch.logical_and(x_grad < 0, x > ctx.max), -1.0, 1.0)
         return x_grad, None, None
 
 
-def limit_param_value(x: Tensor,
-                      min: float, max: float,
-                      prob: float = 0.6,
-                      training: bool = True):
+def limit_param_value(
+    x: Tensor, min: float, max: float, prob: float = 0.6, training: bool = True
+):
     # You apply this to (typically) an nn.Parameter during training to ensure that its
     # (elements mostly) stays within a supplied range.  This is done by modifying the
     # gradients in backprop.
@@ -1187,7 +1239,7 @@ class DoubleSwishFunction(torch.autograd.Function):
         y = x * s
 
         if requires_grad:
-            deriv = (y * (1 - s) + s)
+            deriv = y * (1 - s) + s
 
             # notes on derivative of x * sigmoid(x - 1):
             # https://www.wolframalpha.com/input?i=d%2Fdx+%28x+*+sigmoid%28x-1%29%29
@@ -1197,7 +1249,9 @@ class DoubleSwishFunction(torch.autograd.Function):
             # floors), should be expectation-preserving.
             floor = -0.044
             ceil = 1.2
-            d_scaled = ((deriv - floor) * (255.0 / (ceil - floor)) + torch.rand_like(deriv))
+            d_scaled = (deriv - floor) * (255.0 / (ceil - floor)) + torch.rand_like(
+                deriv
+            )
             if __name__ == "__main__":
                 # for self-testing only.
                 assert d_scaled.min() >= 0.0
@@ -1210,12 +1264,12 @@ class DoubleSwishFunction(torch.autograd.Function):
 
     @staticmethod
     def backward(ctx, y_grad: Tensor) -> Tensor:
-        d, = ctx.saved_tensors
+        (d,) = ctx.saved_tensors
         # the same constants as used in forward pass.
         floor = -0.043637
         ceil = 1.2
 
-        d = (d * ((ceil - floor) / 255.0) + floor)
+        d = d * ((ceil - floor) / 255.0) + floor
         return y_grad * d
 
 
@@ -1239,9 +1293,7 @@ class Dropout2(nn.Module):
         self.p = p
 
     def forward(self, x: Tensor) -> Tensor:
-        return torch.nn.functional.dropout(x,
-                                           p=float(self.p),
-                                           training=self.training)
+        return torch.nn.functional.dropout(x, p=float(self.p), training=self.training)
 
 
 class MulForDropout3(torch.autograd.Function):
@@ -1259,7 +1311,7 @@ class MulForDropout3(torch.autograd.Function):
     @staticmethod
     @custom_bwd
     def backward(ctx, ans_grad):
-        ans, = ctx.saved_tensors
+        (ans,) = ctx.saved_tensors
         x_grad = ctx.alpha * ans_grad * (ans != 0)
         return x_grad, None, None
 
@@ -1286,7 +1338,7 @@ class Dropout3(nn.Module):
 
 class SwooshLFunction(torch.autograd.Function):
     """
-      swoosh(x) =  log(1 + exp(x-4)) - 0.08*x - 0.035
+    swoosh(x) =  log(1 + exp(x-4)) - 0.08*x - 0.035
     """
 
     @staticmethod
@@ -1308,13 +1360,15 @@ class SwooshLFunction(torch.autograd.Function):
                 if not requires_grad:
                     return y
 
-                y.backward(gradient = torch.ones_like(y))
+                y.backward(gradient=torch.ones_like(y))
 
                 grad = x.grad
                 floor = coeff
                 ceil = 1.0 + coeff + 0.005
 
-                d_scaled = ((grad - floor) * (255.0 / (ceil - floor)) + torch.rand_like(grad))
+                d_scaled = (grad - floor) * (255.0 / (ceil - floor)) + torch.rand_like(
+                    grad
+                )
                 if __name__ == "__main__":
                     # for self-testing only.
                     assert d_scaled.min() >= 0.0
@@ -1328,20 +1382,19 @@ class SwooshLFunction(torch.autograd.Function):
 
     @staticmethod
     def backward(ctx, y_grad: Tensor) -> Tensor:
-        d, = ctx.saved_tensors
+        (d,) = ctx.saved_tensors
         # the same constants as used in forward pass.
 
         coeff = -0.08
         floor = coeff
         ceil = 1.0 + coeff + 0.005
-        d = (d * ((ceil - floor) / 255.0) + floor)
-        return (y_grad * d)
+        d = d * ((ceil - floor) / 255.0) + floor
+        return y_grad * d
 
 
 class SwooshL(torch.nn.Module):
     def forward(self, x: Tensor) -> Tensor:
-        """Return Swoosh-L activation.
-        """
+        """Return Swoosh-L activation."""
         if torch.jit.is_scripting() or torch.jit.is_tracing():
             zero = torch.tensor(0.0, dtype=x.dtype, device=x.device)
             return logaddexp(zero, x - 4.0) - 0.08 * x - 0.035
@@ -1351,19 +1404,19 @@ class SwooshL(torch.nn.Module):
             return k2.swoosh_l(x)
         # return SwooshLFunction.apply(x)
 
+
 class SwooshLOnnx(torch.nn.Module):
     def forward(self, x: Tensor) -> Tensor:
-        """Return Swoosh-L activation.
-        """
+        """Return Swoosh-L activation."""
         zero = torch.tensor(0.0, dtype=x.dtype, device=x.device)
         return logaddexp_onnx(zero, x - 4.0) - 0.08 * x - 0.035
 
 
 class SwooshRFunction(torch.autograd.Function):
     """
-      swoosh(x) =  log(1 + exp(x-1)) - 0.08*x - 0.313261687
+     swoosh(x) =  log(1 + exp(x-1)) - 0.08*x - 0.313261687
 
-     derivatives are between -0.08 and 0.92.
+    derivatives are between -0.08 and 0.92.
     """
 
     @staticmethod
@@ -1379,17 +1432,19 @@ class SwooshRFunction(torch.autograd.Function):
             with torch.enable_grad():
                 x = x.detach()
                 x.requires_grad = True
-                y = torch.logaddexp(zero, x - 1.) - 0.08 * x - 0.313261687
+                y = torch.logaddexp(zero, x - 1.0) - 0.08 * x - 0.313261687
 
                 if not requires_grad:
                     return y
-                y.backward(gradient = torch.ones_like(y))
+                y.backward(gradient=torch.ones_like(y))
 
                 grad = x.grad
                 floor = -0.08
                 ceil = 0.925
 
-                d_scaled = ((grad - floor) * (255.0 / (ceil - floor)) + torch.rand_like(grad))
+                d_scaled = (grad - floor) * (255.0 / (ceil - floor)) + torch.rand_like(
+                    grad
+                )
                 if __name__ == "__main__":
                     # for self-testing only.
                     assert d_scaled.min() >= 0.0
@@ -1403,33 +1458,32 @@ class SwooshRFunction(torch.autograd.Function):
 
     @staticmethod
     def backward(ctx, y_grad: Tensor) -> Tensor:
-        d, = ctx.saved_tensors
+        (d,) = ctx.saved_tensors
         # the same constants as used in forward pass.
         floor = -0.08
         ceil = 0.925
-        d = (d * ((ceil - floor) / 255.0) + floor)
-        return (y_grad * d)
+        d = d * ((ceil - floor) / 255.0) + floor
+        return y_grad * d
 
 
 class SwooshR(torch.nn.Module):
     def forward(self, x: Tensor) -> Tensor:
-        """Return Swoosh-R activation.
-        """
+        """Return Swoosh-R activation."""
         if torch.jit.is_scripting() or torch.jit.is_tracing():
             zero = torch.tensor(0.0, dtype=x.dtype, device=x.device)
-            return logaddexp(zero, x - 1.) - 0.08 * x - 0.313261687
+            return logaddexp(zero, x - 1.0) - 0.08 * x - 0.313261687
         if not x.requires_grad:
             return k2.swoosh_r_forward(x)
         else:
             return k2.swoosh_r(x)
         # return SwooshRFunction.apply(x)
 
+
 class SwooshROnnx(torch.nn.Module):
     def forward(self, x: Tensor) -> Tensor:
-        """Return Swoosh-R activation.
-        """
+        """Return Swoosh-R activation."""
         zero = torch.tensor(0.0, dtype=x.dtype, device=x.device)
-        return logaddexp_onnx(zero, x - 1.) - 0.08 * x - 0.313261687
+        return logaddexp_onnx(zero, x - 1.0) - 0.08 * x - 0.313261687
 
 
 # simple version of SwooshL that does not redefine the backprop, used in
@@ -1437,7 +1491,7 @@ class SwooshROnnx(torch.nn.Module):
 def SwooshLForward(x: Tensor):
     x_offset = x - 4.0
     log_sum = (1.0 + x_offset.exp()).log().to(x.dtype)
-    log_sum = torch.where(log_sum == float('inf'), x_offset, log_sum)
+    log_sum = torch.where(log_sum == float("inf"), x_offset, log_sum)
     return log_sum - 0.08 * x - 0.035
 
 
@@ -1446,28 +1500,30 @@ def SwooshLForward(x: Tensor):
 def SwooshRForward(x: Tensor):
     x_offset = x - 1.0
     log_sum = (1.0 + x_offset.exp()).log().to(x.dtype)
-    log_sum = torch.where(log_sum == float('inf'), x_offset, log_sum)
+    log_sum = torch.where(log_sum == float("inf"), x_offset, log_sum)
     return log_sum - 0.08 * x - 0.313261687
 
 
 class ActivationDropoutAndLinearFunction(torch.autograd.Function):
     @staticmethod
     @custom_fwd
-    def forward(ctx,
-                x: Tensor,
-                weight: Tensor,
-                bias: Optional[Tensor],
-                activation: str,
-                dropout_p: float,
-                dropout_shared_dim: Optional[int]):
+    def forward(
+        ctx,
+        x: Tensor,
+        weight: Tensor,
+        bias: Optional[Tensor],
+        activation: str,
+        dropout_p: float,
+        dropout_shared_dim: Optional[int],
+    ):
         if dropout_p != 0.0:
             dropout_shape = list(x.shape)
             if dropout_shared_dim is not None:
                 dropout_shape[dropout_shared_dim] = 1
             # else it won't be very memory efficient.
-            dropout_mask = ((1.0 / (1.0 - dropout_p)) *
-                            (torch.rand(*dropout_shape,
-                                        device=x.device, dtype=x.dtype) > dropout_p))
+            dropout_mask = (1.0 / (1.0 - dropout_p)) * (
+                torch.rand(*dropout_shape, device=x.device, dtype=x.dtype) > dropout_p
+            )
         else:
             dropout_mask = None
 
@@ -1476,8 +1532,8 @@ class ActivationDropoutAndLinearFunction(torch.autograd.Function):
         ctx.activation = activation
 
         forward_activation_dict = {
-            'SwooshL': k2.swoosh_l_forward,
-            'SwooshR': k2.swoosh_r_forward
+            "SwooshL": k2.swoosh_l_forward,
+            "SwooshR": k2.swoosh_r_forward,
         }
         # it will raise a KeyError if this fails.  This will be an error.  We let it
         # propagate to the user.
@@ -1495,8 +1551,8 @@ class ActivationDropoutAndLinearFunction(torch.autograd.Function):
         (x, weight, bias, dropout_mask) = saved
 
         forward_and_deriv_activation_dict = {
-            'SwooshL': k2.swoosh_l_forward_and_deriv,
-            'SwooshR': k2.swoosh_r_forward_and_deriv
+            "SwooshL": k2.swoosh_l_forward_and_deriv,
+            "SwooshR": k2.swoosh_r_forward_and_deriv,
         }
         # the following lines a KeyError if the activation is unrecognized.
         # This will be an error.  We let it propagate to the user.
@@ -1511,8 +1567,7 @@ class ActivationDropoutAndLinearFunction(torch.autograd.Function):
 
         in_channels = y.shape[-1]
         g = ans_grad.reshape(-1, out_channels)
-        weight_deriv = torch.matmul(g.t(),
-                                    y.reshape(-1, in_channels))
+        weight_deriv = torch.matmul(g.t(), y.reshape(-1, in_channels))
         y_deriv = torch.matmul(ans_grad, weight)
         bias_deriv = None if bias is None else g.sum(dim=0)
         x_deriv = y_deriv * func_deriv
@@ -1525,71 +1580,76 @@ class ActivationDropoutAndLinearFunction(torch.autograd.Function):
 
 class ActivationDropoutAndLinear(torch.nn.Module):
     """
-    This merges an activation function followed by dropout and then a nn.Linear module;
-    it does so in a memory efficient way so that it only stores the input to the whole
-    module.  If activation == SwooshL and dropout_shared_dim != None, this will be
-    equivalent to:
-      nn.Sequential(SwooshL(),
-                    Dropout3(dropout_p, shared_dim=dropout_shared_dim),
-                    ScaledLinear(in_channels, out_channels, bias=bias,
-                                 initial_scale=initial_scale))
-   If dropout_shared_dim is None, the dropout would be equivalent to
-   Dropout2(dropout_p).  Note: Dropout3 will be more memory efficient as the dropout
-   mask is smaller.
+     This merges an activation function followed by dropout and then a nn.Linear module;
+     it does so in a memory efficient way so that it only stores the input to the whole
+     module.  If activation == SwooshL and dropout_shared_dim != None, this will be
+     equivalent to:
+       nn.Sequential(SwooshL(),
+                     Dropout3(dropout_p, shared_dim=dropout_shared_dim),
+                     ScaledLinear(in_channels, out_channels, bias=bias,
+                                  initial_scale=initial_scale))
+    If dropout_shared_dim is None, the dropout would be equivalent to
+    Dropout2(dropout_p).  Note: Dropout3 will be more memory efficient as the dropout
+    mask is smaller.
 
-    Args:
-       in_channels: number of input channels, e.g. 256
-       out_channels: number of output channels, e.g. 256
-       bias: if true, have a bias
-       activation: the activation function, for now just support SwooshL.
-       dropout_p: the dropout probability or schedule (happens after nonlinearity).
-       dropout_shared_dim: the dimension, if any, across which the dropout mask is
-            shared (e.g. the time dimension).  If None, this may be less memory
-            efficient if there are modules before this one that cache the input
-            for their backprop (e.g. Balancer or Whiten).
+     Args:
+        in_channels: number of input channels, e.g. 256
+        out_channels: number of output channels, e.g. 256
+        bias: if true, have a bias
+        activation: the activation function; "SwooshL" and "SwooshR" are supported.
+        dropout_p: the dropout probability or schedule (happens after nonlinearity).
+        dropout_shared_dim: the dimension, if any, across which the dropout mask is
+             shared (e.g. the time dimension).  If None, this may be less memory
+             efficient if there are modules before this one that cache the input
+             for their backprop (e.g. Balancer or Whiten).
     """
-    def __init__(self,
-                 in_channels: int,
-                 out_channels: int,
-                 bias: bool = True,
-                 activation: str = 'SwooshL',
-                 dropout_p: FloatLike = 0.0,
-                 dropout_shared_dim: Optional[int] = -1,
-                 initial_scale: float = 1.0):
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        bias: bool = True,
+        activation: str = "SwooshL",
+        dropout_p: FloatLike = 0.0,
+        dropout_shared_dim: Optional[int] = -1,
+        initial_scale: float = 1.0,
+    ):
         super().__init__()
         # create a temporary module of nn.Linear that we'll steal the
         # weights and bias from
-        l = ScaledLinear(in_channels, out_channels,
-                         bias=bias,
-                         initial_scale=initial_scale)
+        l = ScaledLinear(
+            in_channels, out_channels, bias=bias, initial_scale=initial_scale
+        )
 
         self.weight = l.weight
         # register_parameter properly handles making it a parameter when l.bias
         # is None. I think there is some reason for doing it this way rather
         # than just setting it to None but I don't know what it is, maybe
         # something to do with exporting the module..
-        self.register_parameter('bias', l.bias)
+        self.register_parameter("bias", l.bias)
 
         self.activation = activation
         self.dropout_p = dropout_p
         self.dropout_shared_dim = dropout_shared_dim
 
-    def forward(self,
-                x: Tensor):
+    def forward(self, x: Tensor):
         if torch.jit.is_scripting() or torch.jit.is_tracing():
-            if self.activation == 'SwooshL':
+            if self.activation == "SwooshL":
                 x = SwooshLForward(x)
             elif self.activation == "SwooshR":
                 x = SwooshRForward(x)
             else:
                 assert False, self.activation
-            return torch.nn.functional.linear(x,
-                                              self.weight,
-                                              self.bias)
+            return torch.nn.functional.linear(x, self.weight, self.bias)
 
         return ActivationDropoutAndLinearFunction.apply(
-            x, self.weight, self.bias, self.activation,
-            float(self.dropout_p), self.dropout_shared_dim)
+            x,
+            self.weight,
+            self.bias,
+            self.activation,
+            float(self.dropout_p),
+            self.dropout_shared_dim,
+        )
 
 
 def convert_num_channels(x: Tensor, num_channels: int) -> Tensor:
@@ -1612,10 +1672,9 @@ def _test_whiten():
 
         x.requires_grad = True
 
-        m = Whiten(1,  # num_groups
-                   5.0,  # whitening_limit,
-                   prob=1.0,
-                   grad_scale=0.1)  # grad_scale
+        m = Whiten(
+            1, 5.0, prob=1.0, grad_scale=0.1  # num_groups, whitening_limit
+        )
 
         for _ in range(4):
             y = m(x)
@@ -1656,9 +1715,7 @@ def _test_balancer_sign():
 def _test_balancer_magnitude():
     magnitudes = torch.arange(0, 1, 0.01)
     N = 1000
-    x = torch.sign(torch.randn(magnitudes.numel(), N)) * magnitudes.unsqueeze(
-        -1
-    )
+    x = torch.sign(torch.randn(magnitudes.numel(), N)) * magnitudes.unsqueeze(-1)
     x = x.detach()
     x.requires_grad = True
     m = Balancer(
@@ -1685,7 +1742,7 @@ def _test_double_swish_deriv():
     x.requires_grad = True
     m = DoubleSwish()
 
-    tol = ((1.2-(-0.043637))/255.0)
+    tol = (1.2 - (-0.043637)) / 255.0
     torch.autograd.gradcheck(m, x, atol=tol)
 
     # for self-test.
@@ -1699,7 +1756,7 @@ def _test_swooshl_deriv():
     x.requires_grad = True
     m = SwooshL()
 
-    tol = (1.0 / 255.0)
+    tol = 1.0 / 255.0
     torch.autograd.gradcheck(m, x, atol=tol, eps=0.01)
 
     # for self-test.
@@ -1713,7 +1770,7 @@ def _test_swooshr_deriv():
     x.requires_grad = True
     m = SwooshR()
 
-    tol = (1.0 / 255.0)
+    tol = 1.0 / 255.0
     torch.autograd.gradcheck(m, x, atol=tol, eps=0.01)
 
     # for self-test.
@@ -1727,24 +1784,24 @@ def _test_softmax():
     b = a.clone()
     a.requires_grad = True
     b.requires_grad = True
-    a.softmax(dim=1)[:,0].sum().backward()
+    a.softmax(dim=1)[:, 0].sum().backward()
     print("a grad = ", a.grad)
-    softmax(b, dim=1)[:,0].sum().backward()
+    softmax(b, dim=1)[:, 0].sum().backward()
     print("b grad = ", b.grad)
     assert torch.allclose(a.grad, b.grad)
 
 
 def _test_piecewise_linear():
-    p = PiecewiseLinear( (0, 10.0) )
+    p = PiecewiseLinear((0, 10.0))
     for x in [-100, 0, 100]:
         assert p(x) == 10.0
-    p = PiecewiseLinear( (0, 10.0), (1, 0.0) )
-    for x, y in [ (-100, 10.0), (0, 10.0), (0.5, 5.0), (1, 0.0), (2, 0.0) ]:
+    p = PiecewiseLinear((0, 10.0), (1, 0.0))
+    for x, y in [(-100, 10.0), (0, 10.0), (0.5, 5.0), (1, 0.0), (2, 0.0)]:
         print("x, y = ", x, y)
         assert p(x) == y, (x, p(x), y)
 
     q = PiecewiseLinear((0.5, 15.0), (0.6, 1.0))
-    x_vals = [ -1.0, 0.0, 0.1, 0.2, 0.5, 0.6, 0.7, 0.9, 1.0, 2.0 ]
+    x_vals = [-1.0, 0.0, 0.1, 0.2, 0.5, 0.6, 0.7, 0.9, 1.0, 2.0]
     pq = p.max(q)
     for x in x_vals:
         y1 = max(p(x), q(x))
@@ -1757,7 +1814,7 @@ def _test_piecewise_linear():
         assert abs(y1 - y2) < 0.001
     pq = p + q
     for x in x_vals:
-        y1 = p(x) +  q(x)
+        y1 = p(x) + q(x)
         y2 = pq(x)
         assert abs(y1 - y2) < 0.001
 
@@ -1772,15 +1829,22 @@ def _test_activation_dropout_and_linear():
         # swoosh_l an swoosh_r inside SwooshL() and SwooshR(), and they call randn()
         # internally, messing up the random state.
         for dropout_p in [0.0]:
-            for activation in ['SwooshL', 'SwooshR']:
-                m1 =  nn.Sequential(SwooshL() if activation == 'SwooshL' else SwooshR(),
-                                    Dropout3(p=dropout_p, shared_dim=-1),
-                                    ScaledLinear(in_channels, out_channels, bias=bias,
-                                                 initial_scale=0.5))
-                m2 = ActivationDropoutAndLinear(in_channels, out_channels,
-                                                bias=bias, initial_scale=0.5,
-                                                activation=activation,
-                                                dropout_p=dropout_p)
+            for activation in ["SwooshL", "SwooshR"]:
+                m1 = nn.Sequential(
+                    SwooshL() if activation == "SwooshL" else SwooshR(),
+                    Dropout3(p=dropout_p, shared_dim=-1),
+                    ScaledLinear(
+                        in_channels, out_channels, bias=bias, initial_scale=0.5
+                    ),
+                )
+                m2 = ActivationDropoutAndLinear(
+                    in_channels,
+                    out_channels,
+                    bias=bias,
+                    initial_scale=0.5,
+                    activation=activation,
+                    dropout_p=dropout_p,
+                )
                 with torch.no_grad():
                     m2.weight[:] = m1[2].weight
                     if bias:
@@ -1790,9 +1854,9 @@ def _test_activation_dropout_and_linear():
                 x1.requires_grad = True
 
                 # TEMP.
-                assert torch.allclose(SwooshRFunction.apply(x1),
-                                      SwooshRForward(x1),
-                                      atol=1.0e-03)
+                assert torch.allclose(
+                    SwooshRFunction.apply(x1), SwooshRForward(x1), atol=1.0e-03
+                )
 
                 x2 = x1.clone().detach()
                 x2.requires_grad = True
@@ -1805,21 +1869,24 @@ def _test_activation_dropout_and_linear():
                 y2 = m2(x2)
                 y2.backward(gradient=y_grad)
 
-                print(f"bias = {bias}, dropout_p = {dropout_p}, activation = {activation}")
+                print(
+                    f"bias = {bias}, dropout_p = {dropout_p}, activation = {activation}"
+                )
                 print("y1 = ", y1)
                 print("y2 = ", y2)
                 assert torch.allclose(y1, y2, atol=0.02)
-                assert torch.allclose(m1[2].weight.grad, m2.weight.grad,
-                                      atol=1.0e-05)
+                assert torch.allclose(m1[2].weight.grad, m2.weight.grad, atol=1.0e-05)
                 if bias:
-                    assert torch.allclose(m1[2].bias.grad, m2.bias.grad,
-                                          atol=1.0e-05)
+                    assert torch.allclose(m1[2].bias.grad, m2.bias.grad, atol=1.0e-05)
                 print("x1.grad = ", x1.grad)
                 print("x2.grad = ", x2.grad)
 
                 def isclose(a, b):
                     # return true if cosine similarity is > 0.9.
-                    return (a * b).sum() > 0.9 * ((a**2).sum() * (b**2).sum()).sqrt()
+                    return (a * b).sum() > 0.9 * (
+                        (a**2).sum() * (b**2).sum()
+                    ).sqrt()
+
                 # the SwooshL() implementation has a noisy gradient due to 1-byte
                 # storage of it.
                 assert isclose(x1.grad, x2.grad)
diff --git a/egs/multi_zh-hans/ASR/zipformer/streaming_decode.py b/egs/multi_zh-hans/ASR/zipformer/streaming_decode.py
index 1dcd74cb2..d00aaae92 100755
--- a/egs/multi_zh-hans/ASR/zipformer/streaming_decode.py
+++ b/egs/multi_zh-hans/ASR/zipformer/streaming_decode.py
@@ -374,11 +374,7 @@ def streaming_forward(
     Returns encoder outputs, output lengths, and updated states.
     """
     cached_embed_left_pad = states[-2]
-    (
-        x,
-        x_lens,
-        new_cached_embed_left_pad,
-    ) = model.encoder_embed.streaming_forward(
+    (x, x_lens, new_cached_embed_left_pad,) = model.encoder_embed.streaming_forward(
         x=features,
         x_lens=feature_lens,
         cached_left_pad=cached_embed_left_pad,
diff --git a/egs/multi_zh-hans/ASR/zipformer/subsampling.py b/egs/multi_zh-hans/ASR/zipformer/subsampling.py
index d6bf57db4..39446ed35 100644
--- a/egs/multi_zh-hans/ASR/zipformer/subsampling.py
+++ b/egs/multi_zh-hans/ASR/zipformer/subsampling.py
@@ -107,9 +107,7 @@ class ConvNeXt(nn.Module):
         if layerdrop_rate != 0.0:
             batch_size = x.shape[0]
             mask = (
-                torch.rand(
-                    (batch_size, 1, 1, 1), dtype=x.dtype, device=x.device
-                )
+                torch.rand((batch_size, 1, 1, 1), dtype=x.dtype, device=x.device)
                 > layerdrop_rate
             )
         else:
@@ -275,9 +273,7 @@ class Conv2dSubsampling(nn.Module):
         # many copies of this extra gradient term.
         self.out_whiten = Whiten(
             num_groups=1,
-            whitening_limit=ScheduledFloat(
-                (0.0, 4.0), (20000.0, 8.0), default=4.0
-            ),
+            whitening_limit=ScheduledFloat((0.0, 4.0), (20000.0, 8.0), default=4.0),
             prob=(0.025, 0.25),
             grad_scale=0.02,
         )
@@ -400,8 +396,8 @@ class Conv2dSubsampling(nn.Module):
         left_pad = self.convnext.padding[0]
         freq = self.out_width
         channels = self.layer3_channels
-        cached_embed_left_pad = torch.zeros(
-            batch_size, channels, left_pad, freq
-        ).to(device)
+        cached_embed_left_pad = torch.zeros(batch_size, channels, left_pad, freq).to(
+            device
+        )
 
         return cached_embed_left_pad
diff --git a/egs/multi_zh-hans/ASR/zipformer/zipformer.py b/egs/multi_zh-hans/ASR/zipformer/zipformer.py
index 7d98dbeb1..5a83f9a2b 100644
--- a/egs/multi_zh-hans/ASR/zipformer/zipformer.py
+++ b/egs/multi_zh-hans/ASR/zipformer/zipformer.py
@@ -91,34 +91,34 @@ class Zipformer2(EncoderInterface):
            chunks.  Must not be less than cnn_module_kernel (after factoring in
            rounding and downsampling); an error will be thrown if this is violated.
     """
+
     def __init__(
-            self,
-            output_downsampling_factor: int = 2,
-            downsampling_factor: Tuple[int] = (2, 4),
-            encoder_dim: Union[int, Tuple[int]] = 384,
-            num_encoder_layers: Union[int, Tuple[int]] = 4,
-            encoder_unmasked_dim: Union[int, Tuple[int]] = 256,
-            query_head_dim: Union[int, Tuple[int]]  = 24,
-            pos_head_dim: Union[int, Tuple[int]]  = 4,
-            value_head_dim: Union[int, Tuple[int]] = 12,
-            num_heads: Union[int, Tuple[int]] = 8,
-            feedforward_dim: Union[int, Tuple[int]] = 1536,
-            cnn_module_kernel: Union[int, Tuple[int]] = 31,
-            pos_dim: int = 192,
-            dropout: FloatLike = None,  # see code below for default
-            warmup_batches: float = 4000.0,
-            causal: bool = False,
-            chunk_size: Tuple[int] = [-1],
-            left_context_frames: Tuple[int] = [-1],
+        self,
+        output_downsampling_factor: int = 2,
+        downsampling_factor: Tuple[int] = (2, 4),
+        encoder_dim: Union[int, Tuple[int]] = 384,
+        num_encoder_layers: Union[int, Tuple[int]] = 4,
+        encoder_unmasked_dim: Union[int, Tuple[int]] = 256,
+        query_head_dim: Union[int, Tuple[int]] = 24,
+        pos_head_dim: Union[int, Tuple[int]] = 4,
+        value_head_dim: Union[int, Tuple[int]] = 12,
+        num_heads: Union[int, Tuple[int]] = 8,
+        feedforward_dim: Union[int, Tuple[int]] = 1536,
+        cnn_module_kernel: Union[int, Tuple[int]] = 31,
+        pos_dim: int = 192,
+        dropout: FloatLike = None,  # see code below for default
+        warmup_batches: float = 4000.0,
+        causal: bool = False,
+        chunk_size: Tuple[int] = [-1],
+        left_context_frames: Tuple[int] = [-1],
     ) -> None:
         super(Zipformer2, self).__init__()
 
         if dropout is None:
-            dropout = ScheduledFloat((0.0, 0.3),
-                                     (20000.0, 0.1))
+            dropout = ScheduledFloat((0.0, 0.3), (20000.0, 0.1))
 
         def _to_tuple(x):
-            """ Converts a single int or a 1-tuple of an int to a tuple with the same length
+            """Converts a single int or a 1-tuple of an int to a tuple with the same length
             as downsampling_factor"""
             if isinstance(x, int):
                 x = (x,)
@@ -128,10 +128,12 @@ class Zipformer2(EncoderInterface):
                 assert len(x) == len(downsampling_factor) and isinstance(x[0], int)
             return x
 
-        self.output_downsampling_factor = output_downsampling_factor # int
-        self.downsampling_factor = downsampling_factor # tuple
-        self.encoder_dim = encoder_dim = _to_tuple(encoder_dim) # tuple
-        self.encoder_unmasked_dim = encoder_unmasked_dim = _to_tuple(encoder_unmasked_dim) # tuple
+        self.output_downsampling_factor = output_downsampling_factor  # int
+        self.downsampling_factor = downsampling_factor  # tuple
+        self.encoder_dim = encoder_dim = _to_tuple(encoder_dim)  # tuple
+        self.encoder_unmasked_dim = encoder_unmasked_dim = _to_tuple(
+            encoder_unmasked_dim
+        )  # tuple
         num_encoder_layers = _to_tuple(num_encoder_layers)
         self.num_encoder_layers = num_encoder_layers
         self.query_head_dim = query_head_dim = _to_tuple(query_head_dim)
@@ -145,7 +147,7 @@ class Zipformer2(EncoderInterface):
         self.chunk_size = chunk_size
         self.left_context_frames = left_context_frames
 
-        for u,d in zip(encoder_unmasked_dim, encoder_dim):
+        for u, d in zip(encoder_unmasked_dim, encoder_dim):
             assert u <= d
 
         # each one will be Zipformer2Encoder or DownsampledZipformer2Encoder
@@ -191,13 +193,11 @@ class Zipformer2(EncoderInterface):
 
         self.encoders = nn.ModuleList(encoders)
 
-        self.downsample_output = SimpleDownsample(max(encoder_dim),
-                                                  downsample=output_downsampling_factor,
-                                                  dropout=dropout)
+        self.downsample_output = SimpleDownsample(
+            max(encoder_dim), downsample=output_downsampling_factor, dropout=dropout
+        )
 
-    def get_feature_masks(
-            self,
-            x: Tensor) -> Union[List[float], List[Tensor]]:
+    def get_feature_masks(self, x: Tensor) -> Union[List[float], List[Tensor]]:
         """
         In eval mode, returns [1.0] * num_encoders; in training mode, returns a number of
         randomized feature masks, one per encoder.
@@ -215,7 +215,7 @@ class Zipformer2(EncoderInterface):
         """
         num_encoders = len(self.encoder_dim)
         if not self.training:
-            return [ 1.0 ] * num_encoders
+            return [1.0] * num_encoders
 
         (num_frames0, batch_size, _encoder_dims0) = x.shape
 
@@ -224,15 +224,18 @@ class Zipformer2(EncoderInterface):
         feature_mask_dropout_prob = 0.125
 
         # mask1 shape: (1, batch_size, 1)
-        mask1 = (torch.rand(1, batch_size, 1,
-                            device=x.device) >
-                 feature_mask_dropout_prob).to(x.dtype)
+        mask1 = (
+            torch.rand(1, batch_size, 1, device=x.device) > feature_mask_dropout_prob
+        ).to(x.dtype)
 
         # mask2 has additional sequences masked, about twice the number.
-        mask2 = torch.logical_and(mask1,
-                                  (torch.rand(1, batch_size, 1,
-                                              device=x.device) >
-                                   feature_mask_dropout_prob).to(x.dtype))
+        mask2 = torch.logical_and(
+            mask1,
+            (
+                torch.rand(1, batch_size, 1, device=x.device)
+                > feature_mask_dropout_prob
+            ).to(x.dtype),
+        )
 
         # dim: (1, batch_size, 2)
         mask = torch.cat((mask1, mask2), dim=-1)
@@ -240,8 +243,9 @@ class Zipformer2(EncoderInterface):
         feature_masks = []
         for i in range(num_encoders):
             channels = self.encoder_dim[i]
-            feature_mask = torch.ones(1, batch_size, channels,
-                                      dtype=x.dtype, device=x.device)
+            feature_mask = torch.ones(
+                1, batch_size, channels, dtype=x.dtype, device=x.device
+            )
             u1 = self.encoder_unmasked_dim[i]
             u2 = u1 + (channels - u1) // 2
 
@@ -281,7 +285,8 @@ class Zipformer2(EncoderInterface):
         return chunk_size, left_context_chunks
 
     def forward(
-        self, x: Tensor,
+        self,
+        x: Tensor,
         x_lens: Tensor,
         src_key_padding_mask: Optional[Tensor] = None,
     ) -> Tuple[Tensor, Tensor]:
@@ -319,12 +324,17 @@ class Zipformer2(EncoderInterface):
             ds = self.downsampling_factor[i]
             x = convert_num_channels(x, self.encoder_dim[i])
 
-            x = module(x,
-                       chunk_size=chunk_size,
-                       feature_mask=feature_masks[i],
-                       src_key_padding_mask=(None if src_key_padding_mask is None
-                                             else src_key_padding_mask[...,::ds]),
-                       attn_mask=attn_mask)
+            x = module(
+                x,
+                chunk_size=chunk_size,
+                feature_mask=feature_masks[i],
+                src_key_padding_mask=(
+                    None
+                    if src_key_padding_mask is None
+                    else src_key_padding_mask[..., ::ds]
+                ),
+                attn_mask=attn_mask,
+            )
             outputs.append(x)
 
         # if the last output has the largest dimension, x will be unchanged,
@@ -345,9 +355,7 @@ class Zipformer2(EncoderInterface):
         return x, lengths
 
     def _get_attn_mask(
-        self, x: Tensor,
-        chunk_size: int,
-        left_context_chunks: int
+        self, x: Tensor, chunk_size: int, left_context_chunks: int
     ) -> Optional[Tensor]:
         """
         Return None if chunk_size == -1, else return attention mask of shape
@@ -362,9 +370,11 @@ class Zipformer2(EncoderInterface):
         assert all(chunk_size % d == 0 for d in self.downsampling_factor)
         if left_context_chunks >= 0:
             num_encoders = len(self.encoder_dim)
-            assert all (chunk_size * left_context_chunks >=
-                        (self.cnn_module_kernel[i] // 2) * self.downsampling_factor[i]
-                        for i in range(num_encoders))
+            assert all(
+                chunk_size * left_context_chunks
+                >= (self.cnn_module_kernel[i] // 2) * self.downsampling_factor[i]
+                for i in range(num_encoders)
+            )
         else:
             left_context_chunks = 1000000
 
@@ -382,8 +392,7 @@ class Zipformer2(EncoderInterface):
         src_c = c
         tgt_c = c.unsqueeze(-1)
 
-        attn_mask = torch.logical_or(src_c > tgt_c,
-                                     src_c < tgt_c - left_context_chunks)
+        attn_mask = torch.logical_or(src_c > tgt_c, src_c < tgt_c - left_context_chunks)
         if __name__ == "__main__":
             logging.info(f"attn_mask = {attn_mask}")
         return attn_mask
@@ -392,7 +401,7 @@ class Zipformer2(EncoderInterface):
         num_encoders = len(self.encoder_dim)
         assert len(outputs) == num_encoders
         output_dim = max(self.encoder_dim)
-        output_pieces = [ outputs[-1] ]
+        output_pieces = [outputs[-1]]
         cur_dim = self.encoder_dim[-1]
         for i in range(num_encoders - 2, -1, -1):
             d = self.encoder_dim[i]
@@ -489,21 +498,38 @@ class Zipformer2(EncoderInterface):
             nonlin_attn_head_dim = 3 * embed_dim // 4
             conv_left_pad = self.cnn_module_kernel[i] // 2
             for layer in range(num_layers):
-                cached_key = torch.zeros(downsample_left, batch_size, key_dim).to(device)
-                cached_nonlin_attn = torch.zeros(1, batch_size, downsample_left, nonlin_attn_head_dim).to(device)
-                cached_val1 = torch.zeros(downsample_left, batch_size, value_dim).to(device)
-                cached_val2 = torch.zeros(downsample_left, batch_size, value_dim).to(device)
-                cached_conv1 = torch.zeros(batch_size, embed_dim, conv_left_pad).to(device)
-                cached_conv2 = torch.zeros(batch_size, embed_dim, conv_left_pad).to(device)
-                states += [cached_key, cached_nonlin_attn, cached_val1, cached_val2, cached_conv1, cached_conv2]
+                cached_key = torch.zeros(downsample_left, batch_size, key_dim).to(
+                    device
+                )
+                cached_nonlin_attn = torch.zeros(
+                    1, batch_size, downsample_left, nonlin_attn_head_dim
+                ).to(device)
+                cached_val1 = torch.zeros(downsample_left, batch_size, value_dim).to(
+                    device
+                )
+                cached_val2 = torch.zeros(downsample_left, batch_size, value_dim).to(
+                    device
+                )
+                cached_conv1 = torch.zeros(batch_size, embed_dim, conv_left_pad).to(
+                    device
+                )
+                cached_conv2 = torch.zeros(batch_size, embed_dim, conv_left_pad).to(
+                    device
+                )
+                states += [
+                    cached_key,
+                    cached_nonlin_attn,
+                    cached_val1,
+                    cached_val2,
+                    cached_conv1,
+                    cached_conv2,
+                ]
 
         return states
 
 
 def _whitening_schedule(x: float, ratio: float = 2.0) -> ScheduledFloat:
-    return ScheduledFloat((0.0, x),
-                          (20000.0, ratio * x),
-                          default=x)
+    return ScheduledFloat((0.0, x), (20000.0, ratio * x), default=x)
 
 
 def _balancer_schedule(min_prob: float):
@@ -525,31 +551,45 @@ class Zipformer2EncoderLayer(nn.Module):
         >>> pos_emb = torch.rand(32, 19, 512)
         >>> out = encoder_layer(src, pos_emb)
     """
+
     def __init__(
-            self,
-            embed_dim: int,
-            pos_dim: int,
-            num_heads: int,
-            query_head_dim: int,
-            pos_head_dim: int,
-            value_head_dim: int,
-            feedforward_dim: int,
-            dropout: FloatLike = 0.1,
-            cnn_module_kernel: int = 31,
-            causal: bool = False,
-            attention_skip_rate: FloatLike = ScheduledFloat((0.0, 0.2), (4000.0, 0.05), (16000, 0.0), default=0),
-            conv_skip_rate: FloatLike = ScheduledFloat((0.0, 0.2), (4000.0, 0.05), (16000, 0.0), default=0),
-            const_attention_rate: FloatLike = ScheduledFloat((0.0, 0.25), (4000.0, 0.025), default=0),
-            ff2_skip_rate: FloatLike = ScheduledFloat((0.0, 0.1), (4000.0, 0.01), (50000.0, 0.0)),
-            ff3_skip_rate: FloatLike = ScheduledFloat((0.0, 0.1), (4000.0, 0.01), (50000.0, 0.0)),
-            bypass_skip_rate: FloatLike = ScheduledFloat((0.0, 0.5), (4000.0, 0.02), default=0),
+        self,
+        embed_dim: int,
+        pos_dim: int,
+        num_heads: int,
+        query_head_dim: int,
+        pos_head_dim: int,
+        value_head_dim: int,
+        feedforward_dim: int,
+        dropout: FloatLike = 0.1,
+        cnn_module_kernel: int = 31,
+        causal: bool = False,
+        attention_skip_rate: FloatLike = ScheduledFloat(
+            (0.0, 0.2), (4000.0, 0.05), (16000, 0.0), default=0
+        ),
+        conv_skip_rate: FloatLike = ScheduledFloat(
+            (0.0, 0.2), (4000.0, 0.05), (16000, 0.0), default=0
+        ),
+        const_attention_rate: FloatLike = ScheduledFloat(
+            (0.0, 0.25), (4000.0, 0.025), default=0
+        ),
+        ff2_skip_rate: FloatLike = ScheduledFloat(
+            (0.0, 0.1), (4000.0, 0.01), (50000.0, 0.0)
+        ),
+        ff3_skip_rate: FloatLike = ScheduledFloat(
+            (0.0, 0.1), (4000.0, 0.01), (50000.0, 0.0)
+        ),
+        bypass_skip_rate: FloatLike = ScheduledFloat(
+            (0.0, 0.5), (4000.0, 0.02), default=0
+        ),
     ) -> None:
         super(Zipformer2EncoderLayer, self).__init__()
         self.embed_dim = embed_dim
 
         # self.bypass implements layer skipping as well as bypass; see its default values.
-        self.bypass = BypassModule(embed_dim, skip_rate=bypass_skip_rate,
-                                   straight_through_rate=0)
+        self.bypass = BypassModule(
+            embed_dim, skip_rate=bypass_skip_rate, straight_through_rate=0
+        )
         # bypass_mid is bypass used in the middle of the layer.
         self.bypass_mid = BypassModule(embed_dim, straight_through_rate=0)
 
@@ -567,39 +607,39 @@ class Zipformer2EncoderLayer(nn.Module):
         self.const_attention_rate = copy.deepcopy(const_attention_rate)
 
         self.self_attn_weights = RelPositionMultiheadAttentionWeights(
-            embed_dim, pos_dim=pos_dim, num_heads=num_heads,
-            query_head_dim=query_head_dim, pos_head_dim=pos_head_dim,
+            embed_dim,
+            pos_dim=pos_dim,
+            num_heads=num_heads,
+            query_head_dim=query_head_dim,
+            pos_head_dim=pos_head_dim,
             dropout=0.0,
         )
 
-        self.self_attn1 = SelfAttention(embed_dim, num_heads,
-                                        value_head_dim)
+        self.self_attn1 = SelfAttention(embed_dim, num_heads, value_head_dim)
 
-        self.self_attn2 = SelfAttention(embed_dim, num_heads,
-                                        value_head_dim)
+        self.self_attn2 = SelfAttention(embed_dim, num_heads, value_head_dim)
 
-        self.feed_forward1 = FeedforwardModule(embed_dim,
-                                               (feedforward_dim * 3) // 4,
-                                               dropout)
+        self.feed_forward1 = FeedforwardModule(
+            embed_dim, (feedforward_dim * 3) // 4, dropout
+        )
 
-        self.feed_forward2 = FeedforwardModule(embed_dim,
-                                               feedforward_dim,
-                                               dropout)
+        self.feed_forward2 = FeedforwardModule(embed_dim, feedforward_dim, dropout)
 
-        self.feed_forward3 = FeedforwardModule(embed_dim,
-                                               (feedforward_dim * 5) // 4,
-                                               dropout)
+        self.feed_forward3 = FeedforwardModule(
+            embed_dim, (feedforward_dim * 5) // 4, dropout
+        )
 
-        self.nonlin_attention = NonlinAttention(embed_dim,
-                                                hidden_channels=3 * embed_dim // 4)
+        self.nonlin_attention = NonlinAttention(
+            embed_dim, hidden_channels=3 * embed_dim // 4
+        )
 
-        self.conv_module1 = ConvolutionModule(embed_dim,
-                                              cnn_module_kernel,
-                                              causal=causal)
+        self.conv_module1 = ConvolutionModule(
+            embed_dim, cnn_module_kernel, causal=causal
+        )
 
-        self.conv_module2 = ConvolutionModule(embed_dim,
-                                              cnn_module_kernel,
-                                              causal=causal)
+        self.conv_module2 = ConvolutionModule(
+            embed_dim, cnn_module_kernel, causal=causal
+        )
 
         # TODO: remove it
         self.bypass_scale = nn.Parameter(torch.full((embed_dim,), 0.5))
@@ -607,15 +647,20 @@ class Zipformer2EncoderLayer(nn.Module):
         self.norm = BiasNorm(embed_dim)
 
         self.balancer1 = Balancer(
-            embed_dim, channel_dim=-1,
-            min_positive=0.45, max_positive=0.55,
-            min_abs=0.2, max_abs=4.0,
+            embed_dim,
+            channel_dim=-1,
+            min_positive=0.45,
+            max_positive=0.55,
+            min_abs=0.2,
+            max_abs=4.0,
         )
 
         # balancer for output of NonlinAttentionModule
         self.balancer_na = Balancer(
-            embed_dim, channel_dim=-1,
-            min_positive=0.3, max_positive=0.7,
+            embed_dim,
+            channel_dim=-1,
+            min_positive=0.3,
+            max_positive=0.7,
             min_abs=ScheduledFloat((0.0, 0.004), (4000.0, 0.02)),
             prob=0.05,  # out of concern for memory usage
         )
@@ -624,34 +669,50 @@ class Zipformer2EncoderLayer(nn.Module):
         # small.  give this a very small probability, even at the start of
         # training, it's to fix a rare problem and it's OK to fix it slowly.
         self.balancer_ff2 = Balancer(
-            embed_dim, channel_dim=-1,
-            min_positive=0.3, max_positive=0.7,
+            embed_dim,
+            channel_dim=-1,
+            min_positive=0.3,
+            max_positive=0.7,
             min_abs=ScheduledFloat((0.0, 0.0), (4000.0, 0.1), default=0.0),
             max_abs=2.0,
             prob=0.05,
         )
 
         self.balancer_ff3 = Balancer(
-            embed_dim, channel_dim=-1,
-            min_positive=0.3, max_positive=0.7,
+            embed_dim,
+            channel_dim=-1,
+            min_positive=0.3,
+            max_positive=0.7,
             min_abs=ScheduledFloat((0.0, 0.0), (4000.0, 0.2), default=0.0),
             max_abs=4.0,
             prob=0.05,
         )
 
-        self.whiten = Whiten(num_groups=1,
-                             whitening_limit=_whitening_schedule(4.0, ratio=3.0),
-                             prob=(0.025, 0.25),
-                             grad_scale=0.01)
-
-        self.balancer2 = Balancer(
-            embed_dim, channel_dim=-1,
-            min_positive=0.45, max_positive=0.55,
-            min_abs=0.1, max_abs=4.0,
+        self.whiten = Whiten(
+            num_groups=1,
+            whitening_limit=_whitening_schedule(4.0, ratio=3.0),
+            prob=(0.025, 0.25),
+            grad_scale=0.01,
         )
 
-    def get_sequence_dropout_mask(self, x: Tensor, dropout_rate: float) -> Optional[Tensor]:
-        if dropout_rate == 0.0 or not self.training or torch.jit.is_scripting() or torch.jit.is_tracing():
+        self.balancer2 = Balancer(
+            embed_dim,
+            channel_dim=-1,
+            min_positive=0.45,
+            max_positive=0.55,
+            min_abs=0.1,
+            max_abs=4.0,
+        )
+
+    def get_sequence_dropout_mask(
+        self, x: Tensor, dropout_rate: float
+    ) -> Optional[Tensor]:
+        if (
+            dropout_rate == 0.0
+            or not self.training
+            or torch.jit.is_scripting()
+            or torch.jit.is_tracing()
+        ):
             return None
         batch_size = x.shape[1]
         mask = (torch.rand(batch_size, 1, device=x.device) > dropout_rate).to(x.dtype)
@@ -677,21 +738,21 @@ class Zipformer2EncoderLayer(nn.Module):
         src_key_padding_mask: Optional[Tensor] = None,
     ) -> Tensor:
         """
-        Pass the input through the encoder layer.
-        Args:
-            src: the sequence to the encoder (required): shape (seq_len, batch_size, embedding_dim).
-         pos_emb: (1, 2*seq_len-1, pos_emb_dim) or (batch_size, 2*seq_len-1, pos_emb_dim)
-         chunk_size: the number of frames per chunk, of >= 0; if -1, no chunking.
-       feature_mask: something that broadcasts with src, that we'll multiply `src`
-              by at every layer: if a Tensor, likely of shape (seq_len, batch_size, embedding_dim)
-         attn_mask: the attention mask, of shape (batch_size, seq_len, seq_len) or (seq_len, seq_len),
-                interpreted as (batch_size, tgt_seq_len, src_seq_len) or (tgt_seq_len, src_seq_len).
-               True means masked position. May be None.
-    src_key_padding_mask:  the mask for padding, of shape (batch_size, seq_len); True means
-             masked position.  May be None.
+        Pass the input through the encoder layer.
+        Args:
+            src: the sequence to the encoder (required): shape (seq_len, batch_size, embedding_dim).
+            pos_emb: (1, 2*seq_len-1, pos_emb_dim) or (batch_size, 2*seq_len-1, pos_emb_dim)
+            chunk_size: the number of frames per chunk, if >= 0; if -1, no chunking.
+            feature_mask: something that broadcasts with src, that we'll multiply `src`
+                by at every layer: if a Tensor, likely of shape (seq_len, batch_size, embedding_dim)
+            attn_mask: the attention mask, of shape (batch_size, seq_len, seq_len) or (seq_len, seq_len),
+                interpreted as (batch_size, tgt_seq_len, src_seq_len) or (tgt_seq_len, src_seq_len).
+                True means masked position. May be None.
+            src_key_padding_mask: the mask for padding, of shape (batch_size, seq_len); True means
+                masked position. May be None.
 
-        Returns:
-           A tensor which has the same shape as src
+        Returns:
+            A tensor which has the same shape as src
         """
         src_orig = src
 
@@ -699,7 +760,9 @@ class Zipformer2EncoderLayer(nn.Module):
         if torch.jit.is_scripting() or torch.jit.is_tracing():
             attention_skip_rate = 0.0
         else:
-            attention_skip_rate = float(self.attention_skip_rate) if self.training else 0.0
+            attention_skip_rate = (
+                float(self.attention_skip_rate) if self.training else 0.0
+            )
 
         # attn_weights: (num_heads, batch_size, seq_len, seq_len)
         attn_weights = self.self_attn_weights(
@@ -711,7 +774,9 @@ class Zipformer2EncoderLayer(nn.Module):
 
         src = src + self.feed_forward1(src)
 
-        self_attn_dropout_mask = self.get_sequence_dropout_mask(src, attention_skip_rate)
+        self_attn_dropout_mask = self.get_sequence_dropout_mask(
+            src, attention_skip_rate
+        )
 
         selected_attn_weights = attn_weights[0:1]
         if torch.jit.is_scripting() or torch.jit.is_tracing():
@@ -722,53 +787,75 @@ class Zipformer2EncoderLayer(nn.Module):
             # averaging-over-time operation.
             # only need the mask, can just use the 1st one and expand later
             selected_attn_weights = selected_attn_weights[0:1]
-            selected_attn_weights = (selected_attn_weights > 0.0).to(selected_attn_weights.dtype)
-            selected_attn_weights = selected_attn_weights * (1.0 / selected_attn_weights.sum(dim=-1, keepdim=True))
+            selected_attn_weights = (selected_attn_weights > 0.0).to(
+                selected_attn_weights.dtype
+            )
+            selected_attn_weights = selected_attn_weights * (
+                1.0 / selected_attn_weights.sum(dim=-1, keepdim=True)
+            )
 
         na = self.balancer_na(self.nonlin_attention(src, selected_attn_weights))
 
-        src = src + (na if self_attn_dropout_mask is None else na * self_attn_dropout_mask)
+        src = src + (
+            na if self_attn_dropout_mask is None else na * self_attn_dropout_mask
+        )
 
         self_attn = self.self_attn1(src, attn_weights)
 
-        src = src + (self_attn if self_attn_dropout_mask is None else self_attn * self_attn_dropout_mask)
+        src = src + (
+            self_attn
+            if self_attn_dropout_mask is None
+            else self_attn * self_attn_dropout_mask
+        )
 
         if torch.jit.is_scripting() or torch.jit.is_tracing():
             conv_skip_rate = 0.0
         else:
             conv_skip_rate = float(self.conv_skip_rate) if self.training else 0.0
-        src = src + self.sequence_dropout(self.conv_module1(src, chunk_size=chunk_size,
-                                                            src_key_padding_mask=src_key_padding_mask),
-                                          conv_skip_rate)
+        src = src + self.sequence_dropout(
+            self.conv_module1(
+                src, chunk_size=chunk_size, src_key_padding_mask=src_key_padding_mask
+            ),
+            conv_skip_rate,
+        )
 
         if torch.jit.is_scripting() or torch.jit.is_tracing():
             ff2_skip_rate = 0.0
         else:
             ff2_skip_rate = float(self.ff2_skip_rate) if self.training else 0.0
-        src = src + self.sequence_dropout(self.balancer_ff2(self.feed_forward2(src)),
-                                          ff2_skip_rate)
+        src = src + self.sequence_dropout(
+            self.balancer_ff2(self.feed_forward2(src)), ff2_skip_rate
+        )
 
         # bypass in the middle of the layer.
         src = self.bypass_mid(src_orig, src)
 
         self_attn = self.self_attn2(src, attn_weights)
 
-        src = src + (self_attn if self_attn_dropout_mask is None else self_attn * self_attn_dropout_mask)
+        src = src + (
+            self_attn
+            if self_attn_dropout_mask is None
+            else self_attn * self_attn_dropout_mask
+        )
 
         if torch.jit.is_scripting() or torch.jit.is_tracing():
             conv_skip_rate = 0.0
         else:
             conv_skip_rate = float(self.conv_skip_rate) if self.training else 0.0
-        src = src + self.sequence_dropout(self.conv_module2(src, chunk_size=chunk_size,
-                                                            src_key_padding_mask=src_key_padding_mask),
-                                          conv_skip_rate)
+        src = src + self.sequence_dropout(
+            self.conv_module2(
+                src, chunk_size=chunk_size, src_key_padding_mask=src_key_padding_mask
+            ),
+            conv_skip_rate,
+        )
 
         if torch.jit.is_scripting() or torch.jit.is_tracing():
             ff3_skip_rate = 0.0
         else:
             ff3_skip_rate = float(self.ff3_skip_rate) if self.training else 0.0
-        src = src + self.sequence_dropout(self.balancer_ff3(self.feed_forward3(src)),
-                                          ff3_skip_rate)
+        src = src + self.sequence_dropout(
+            self.balancer_ff3(self.feed_forward3(src)), ff3_skip_rate
+        )
 
         src = self.balancer1(src)
         src = self.norm(src)
@@ -912,20 +999,22 @@ class Zipformer2Encoder(nn.Module):
         >>> src = torch.rand(10, 32, 512)
         >>> out = zipformer_encoder(src)
     """
+
     def __init__(
-            self,
-            encoder_layer: nn.Module,
-            num_layers: int,
-            pos_dim: int,
-            dropout: float,
-            warmup_begin: float,
-            warmup_end: float,
-            initial_layerdrop_rate: float = 0.5,
-            final_layerdrop_rate: float = 0.05,
+        self,
+        encoder_layer: nn.Module,
+        num_layers: int,
+        pos_dim: int,
+        dropout: float,
+        warmup_begin: float,
+        warmup_end: float,
+        initial_layerdrop_rate: float = 0.5,
+        final_layerdrop_rate: float = 0.05,
     ) -> None:
         super().__init__()
-        self.encoder_pos = CompactRelPositionalEncoding(pos_dim, dropout_rate=0.15,
-                                                        length_factor=1.0)
+        self.encoder_pos = CompactRelPositionalEncoding(
+            pos_dim, dropout_rate=0.15, length_factor=1.0
+        )
 
         self.layers = nn.ModuleList(
             [copy.deepcopy(encoder_layer) for i in range(num_layers)]
@@ -934,13 +1023,15 @@ class Zipformer2Encoder(nn.Module):
 
         assert 0 <= warmup_begin <= warmup_end
 
-        delta = (1. / num_layers) * (warmup_end - warmup_begin)
+        delta = (1.0 / num_layers) * (warmup_end - warmup_begin)
         cur_begin = warmup_begin  # interpreted as a training batch index
         for i in range(num_layers):
             cur_end = cur_begin + delta
-            self.layers[i].bypass.skip_rate = ScheduledFloat((cur_begin, initial_layerdrop_rate),
-                                                             (cur_end, final_layerdrop_rate),
-                                                             default=0.0)
+            self.layers[i].bypass.skip_rate = ScheduledFloat(
+                (cur_begin, initial_layerdrop_rate),
+                (cur_end, final_layerdrop_rate),
+                default=0.0,
+            )
             cur_begin = cur_end
 
     def forward(
@@ -1014,8 +1105,13 @@ class Zipformer2Encoder(nn.Module):
         new_states = []
         for i, mod in enumerate(self.layers):
             (
-                cached_key, cached_nonlin_attn, cached_val1, cached_val2, cached_conv1, cached_conv2
-            ) = states[i * 6: (i + 1) * 6]
+                cached_key,
+                cached_nonlin_attn,
+                cached_val1,
+                cached_val2,
+                cached_conv1,
+                cached_conv2,
+            ) = states[i * 6 : (i + 1) * 6]
             (
                 output,
                 new_cached_key,
@@ -1023,7 +1119,7 @@ class Zipformer2Encoder(nn.Module):
                 new_cached_val1,
                 new_cached_val2,
                 new_cached_conv1,
-                new_cached_conv2
+                new_cached_conv2,
             ) = mod.streaming_forward(
                 output,
                 pos_emb,
@@ -1055,13 +1151,15 @@ class BypassModule(nn.Module):
     "straight-through", i.e. to not do the bypass operation much initially, in order to
     force all the modules to learn something.
     """
+
     def __init__(
-            self,
-            embed_dim: int,
-            skip_rate: FloatLike = 0.0,
-            straight_through_rate: FloatLike = 0.0,
-            scale_min: FloatLike = ScheduledFloat((0.0, 0.9), (20000.0, 0.2), default=0),
-            scale_max: FloatLike = 1.0):
+        self,
+        embed_dim: int,
+        skip_rate: FloatLike = 0.0,
+        straight_through_rate: FloatLike = 0.0,
+        scale_min: FloatLike = ScheduledFloat((0.0, 0.9), (20000.0, 0.2), default=0),
+        scale_max: FloatLike = 1.0,
+    ):
         super().__init__()
         self.bypass_scale = nn.Parameter(torch.full((embed_dim,), 0.5))
         self.skip_rate = copy.deepcopy(skip_rate)
@@ -1077,9 +1175,9 @@ class BypassModule(nn.Module):
         if torch.jit.is_scripting() or torch.jit.is_tracing() or not self.training:
             return self.bypass_scale
         else:
-            ans = limit_param_value(self.bypass_scale,
-                                    min=float(self.scale_min),
-                                    max=float(self.scale_max))
+            ans = limit_param_value(
+                self.bypass_scale, min=float(self.scale_min), max=float(self.scale_max)
+            )
             skip_rate = float(self.skip_rate)
             if skip_rate != 0.0:
                 mask = torch.rand((batch_size, 1), device=ans.device) > skip_rate
@@ -1088,13 +1186,14 @@ class BypassModule(nn.Module):
                 # on which we have randomly chosen to do layer-skipping.
             straight_through_rate = float(self.straight_through_rate)
             if straight_through_rate != 0.0:
-                mask = torch.rand((batch_size, 1), device=ans.device) < straight_through_rate
+                mask = (
+                    torch.rand((batch_size, 1), device=ans.device)
+                    < straight_through_rate
+                )
                 ans = torch.maximum(ans, mask.to(ans.dtype))
             return ans
 
-    def forward(self,
-                src_orig: Tensor,
-                src: Tensor):
+    def forward(self, src_orig: Tensor, src: Tensor):
         """
         Args: src_orig and src are both of shape (seq_len, batch_size, num_channels)
         Returns: something with the same shape as src and src_orig
@@ -1109,15 +1208,13 @@ class DownsampledZipformer2Encoder(nn.Module):
     after convolutional downsampling, and then upsampled again at the output, and combined
     with the origin input, so that the output has the same shape as the input.
     """
-    def __init__(self,
-                 encoder: nn.Module,
-                 dim: int,
-                 downsample: int,
-                 dropout: FloatLike):
+
+    def __init__(
+        self, encoder: nn.Module, dim: int, downsample: int, dropout: FloatLike
+    ):
         super(DownsampledZipformer2Encoder, self).__init__()
         self.downsample_factor = downsample
-        self.downsample = SimpleDownsample(dim,
-                                           downsample, dropout)
+        self.downsample = SimpleDownsample(dim, downsample, dropout)
         self.num_layers = encoder.num_layers
         self.encoder = encoder
         self.upsample = SimpleUpsample(dim, downsample)
@@ -1149,7 +1246,7 @@ class DownsampledZipformer2Encoder(nn.Module):
         src = self.downsample(src)
         ds = self.downsample_factor
         if attn_mask is not None:
-            attn_mask = attn_mask[::ds,::ds]
+            attn_mask = attn_mask[::ds, ::ds]
 
         src = self.encoder(
             src,
@@ -1160,7 +1257,7 @@ class DownsampledZipformer2Encoder(nn.Module):
         )
         src = self.upsample(src)
         # remove any extra frames that are not a multiple of downsample_factor
-        src = src[:src_orig.shape[0]]
+        src = src[: src_orig.shape[0]]
 
         return self.out_combiner(src_orig, src)
 
@@ -1196,7 +1293,7 @@ class DownsampledZipformer2Encoder(nn.Module):
         )
         src = self.upsample(src)
         # remove any extra frames that are not a multiple of downsample_factor
-        src = src[:src_orig.shape[0]]
+        src = src[: src_orig.shape[0]]
 
         return self.out_combiner(src_orig, src), new_states
 
@@ -1205,10 +1302,8 @@ class SimpleDownsample(torch.nn.Module):
     """
     Does downsampling with attention, by weighted sum, and a projection..
     """
-    def __init__(self,
-                 channels: int,
-                 downsample: int,
-                 dropout: FloatLike):
+
+    def __init__(self, channels: int, downsample: int, dropout: FloatLike):
         super(SimpleDownsample, self).__init__()
 
         self.bias = nn.Parameter(torch.zeros(downsample))
@@ -1218,8 +1313,7 @@ class SimpleDownsample(torch.nn.Module):
 
         self.downsample = downsample
 
-    def forward(self,
-                src: Tensor) -> Tensor:
+    def forward(self, src: Tensor) -> Tensor:
         """
         x: (seq_len, batch_size, in_channels)
         Returns a tensor of shape
@@ -1232,7 +1326,7 @@ class SimpleDownsample(torch.nn.Module):
         # Pad to an exact multiple of self.downsample
         # right-pad src, repeating the last element.
         pad = d_seq_len * ds - seq_len
-        src_extra = src[src.shape[0]-1:].expand(pad, src.shape[1], src.shape[2])
+        src_extra = src[src.shape[0] - 1 :].expand(pad, src.shape[1], src.shape[2])
         src = torch.cat((src, src_extra), dim=0)
         assert src.shape[0] == d_seq_len * ds
 
@@ -1253,14 +1347,12 @@ class SimpleUpsample(torch.nn.Module):
     A very simple form of upsampling that mostly just repeats the input, but
     also adds a position-specific bias.
     """
-    def __init__(self,
-                 num_channels: int,
-                 upsample: int):
+
+    def __init__(self, num_channels: int, upsample: int):
         super(SimpleUpsample, self).__init__()
         self.upsample = upsample
 
-    def forward(self,
-                src: Tensor) -> Tensor:
+    def forward(self, src: Tensor) -> Tensor:
         """
         x: (seq_len, batch_size, num_channels)
         Returns a tensor of shape
@@ -1298,11 +1390,13 @@ class CompactRelPositionalEncoding(torch.nn.Module):
         length_factor: a heuristic scale (should be >= 1.0) which, if larger, gives
            less weight to small differences of offset near the origin.
     """
+
     def __init__(
-        self, embed_dim: int,
-            dropout_rate: FloatLike,
-            max_len: int = 1000,
-            length_factor: float = 1.0,
+        self,
+        embed_dim: int,
+        dropout_rate: FloatLike,
+        max_len: int = 1000,
+        length_factor: float = 1.0,
     ) -> None:
         """Construct a CompactRelPositionalEncoding object."""
         super(CompactRelPositionalEncoding, self).__init__()
@@ -1326,19 +1420,22 @@ class CompactRelPositionalEncoding(torch.nn.Module):
                 return
 
         # if T == 4, x would contain [ -3, -2, 1, 0, 1, 2, 3 ]
-        x = torch.arange(-(T-1), T,
-                         device=x.device).to(torch.float32).unsqueeze(1)
+        x = torch.arange(-(T - 1), T, device=x.device).to(torch.float32).unsqueeze(1)
 
         freqs = 1 + torch.arange(self.embed_dim // 2, device=x.device)
 
         # `compression_length` this is arbitrary/heuristic, if it is larger we have more resolution
         # for small time offsets but less resolution for large time offsets.
-        compression_length = (self.embed_dim ** 0.5)
+        compression_length = self.embed_dim**0.5
         # x_compressed, like X, goes from -infinity to infinity as T goes from -infinity to infinity;
         # but it does so more slowly than T for large absolute values of T.
         # The formula is chosen so that d(x_compressed )/dx is 1 around x == 0, which
         # is important.
-        x_compressed = compression_length * x.sign() * ((x.abs() + compression_length).log() - math.log(compression_length))
+        x_compressed = (
+            compression_length
+            * x.sign()
+            * ((x.abs() + compression_length).log() - math.log(compression_length))
+        )
 
         # if self.length_factor == 1.0, then length_scale is chosen so that the
         # FFT can exactly separate points close to the origin (T == 0).  So this
@@ -1380,7 +1477,7 @@ class CompactRelPositionalEncoding(torch.nn.Module):
             - x_size_left
             + 1 : self.pe.size(0) // 2  # noqa E203
             + x.size(0),
-            :
+            :,
         ]
         pos_emb = pos_emb.unsqueeze(0)
         return self.dropout(pos_emb)
@@ -1407,15 +1504,14 @@ class RelPositionMultiheadAttentionWeights(nn.Module):
     """
 
     def __init__(
-            self,
-            embed_dim: int,
-            pos_dim: int,
-            num_heads: int,
-            query_head_dim: int,
-            pos_head_dim: int,
-            dropout: float = 0.0,
-            pos_emb_skip_rate: FloatLike = ScheduledFloat((0.0, 0.5),
-                                                          (4000.0, 0.0))
+        self,
+        embed_dim: int,
+        pos_dim: int,
+        num_heads: int,
+        query_head_dim: int,
+        pos_head_dim: int,
+        dropout: float = 0.0,
+        pos_emb_skip_rate: FloatLike = ScheduledFloat((0.0, 0.5), (4000.0, 0.0)),
     ) -> None:
         super().__init__()
         self.embed_dim = embed_dim
@@ -1434,13 +1530,16 @@ class RelPositionMultiheadAttentionWeights(nn.Module):
         # dividing it between the query and key.   Note: this module is intended
         # to be used with the ScaledAdam optimizer; with most other optimizers,
         # it would be necessary to apply the scaling factor in the forward function.
-        self.in_proj = ScaledLinear(embed_dim, in_proj_dim, bias=True,
-                                    initial_scale=query_head_dim**-0.25)
+        self.in_proj = ScaledLinear(
+            embed_dim, in_proj_dim, bias=True, initial_scale=query_head_dim**-0.25
+        )
 
-        self.whiten_keys = Whiten(num_groups=num_heads,
-                                  whitening_limit=_whitening_schedule(3.0),
-                                  prob=(0.025, 0.25),
-                                  grad_scale=0.025)
+        self.whiten_keys = Whiten(
+            num_groups=num_heads,
+            whitening_limit=_whitening_schedule(3.0),
+            prob=(0.025, 0.25),
+            grad_scale=0.025,
+        )
 
         # add a balancer for the keys that runs with very small probability, and
         # tries to enforce that all dimensions have mean around zero.  The
@@ -1450,19 +1549,20 @@ class RelPositionMultiheadAttentionWeights(nn.Module):
         # bias because the small numerical roundoff tends to have a non-random
         # sign.  This module is intended to prevent that.  Use a very small
         # probability; that should be suffixient to fix the problem.
-        self.balance_keys = Balancer(key_head_dim * num_heads,
-                                     channel_dim=-1,
-                                     min_positive=0.4,
-                                     max_positive=0.6,
-                                     min_abs=0.0,
-                                     max_abs=100.0,
-                                     prob=0.025)
+        self.balance_keys = Balancer(
+            key_head_dim * num_heads,
+            channel_dim=-1,
+            min_positive=0.4,
+            max_positive=0.6,
+            min_abs=0.0,
+            max_abs=100.0,
+            prob=0.025,
+        )
 
         # linear transformation for positional encoding.
-        self.linear_pos = ScaledLinear(pos_dim,
-                                       num_heads * pos_head_dim,
-                                       bias=False,
-                                       initial_scale=0.05)
+        self.linear_pos = ScaledLinear(
+            pos_dim, num_heads * pos_head_dim, bias=False, initial_scale=0.05
+        )
 
         # the following are for diagnosics only, see --print-diagnostics option
         self.copy_pos_query = Identity()
@@ -1498,10 +1598,10 @@ class RelPositionMultiheadAttentionWeights(nn.Module):
         query_dim = query_head_dim * num_heads
 
         # self-attention
-        q = x[...,0:query_dim]
-        k = x[...,query_dim:2*query_dim]
+        q = x[..., 0:query_dim]
+        k = x[..., query_dim : 2 * query_dim]
         # p is the position-encoding query
-        p = x[...,2*query_dim:]
+        p = x[..., 2 * query_dim :]
         assert p.shape[-1] == num_heads * pos_head_dim
 
         q = self.copy_query(q)  # for diagnostics only, does nothing.
@@ -1529,7 +1629,9 @@ class RelPositionMultiheadAttentionWeights(nn.Module):
         if use_pos_scores:
             pos_emb = self.linear_pos(pos_emb)
             seq_len2 = 2 * seq_len - 1
-            pos_emb = pos_emb.reshape(-1, seq_len2, num_heads, pos_head_dim).permute(2, 0, 3, 1)
+            pos_emb = pos_emb.reshape(-1, seq_len2, num_heads, pos_head_dim).permute(
+                2, 0, 3, 1
+            )
             # pos shape now: (head, {1 or batch_size}, pos_dim, seq_len2)
 
             # (head, batch, time1, pos_dim) x (head, 1, pos_dim, seq_len2) -> (head, batch, time1, seq_len2)
@@ -1548,12 +1650,16 @@ class RelPositionMultiheadAttentionWeights(nn.Module):
                 pos_scores = torch.gather(pos_scores, dim=1, index=indexes)
                 pos_scores = pos_scores.reshape(num_heads, batch_size, time1, seq_len)
             else:
-                pos_scores = pos_scores.as_strided((num_heads, batch_size, seq_len, seq_len),
-                                                   (pos_scores.stride(0),
-                                                    pos_scores.stride(1),
-                                                    pos_scores.stride(2)-pos_scores.stride(3),
-                                                    pos_scores.stride(3)),
-                                                   storage_offset=pos_scores.stride(3) * (seq_len - 1))
+                pos_scores = pos_scores.as_strided(
+                    (num_heads, batch_size, seq_len, seq_len),
+                    (
+                        pos_scores.stride(0),
+                        pos_scores.stride(1),
+                        pos_scores.stride(2) - pos_scores.stride(3),
+                        pos_scores.stride(3),
+                    ),
+                    storage_offset=pos_scores.stride(3) * (seq_len - 1),
+                )
 
             attn_scores = attn_scores + pos_scores
 
@@ -1572,10 +1678,9 @@ class RelPositionMultiheadAttentionWeights(nn.Module):
             # but we view this as a failsafe to avoid "implausible" parameter
             # values rather than a regularization method that should be active
             # under normal circumstances.
-            attn_scores = penalize_abs_values_gt(attn_scores,
-                                                 limit=25.0,
-                                                 penalty=1.0e-04,
-                                                 name=self.name)
+            attn_scores = penalize_abs_values_gt(
+                attn_scores, limit=25.0, penalty=1.0e-04, name=self.name
+            )
 
         assert attn_scores.shape == (num_heads, batch_size, seq_len, seq_len)
 
@@ -1588,7 +1693,10 @@ class RelPositionMultiheadAttentionWeights(nn.Module):
             attn_scores = attn_scores.masked_fill(attn_mask, -1000)
 
         if key_padding_mask is not None:
-            assert key_padding_mask.shape == (batch_size, seq_len), key_padding_mask.shape
+            assert key_padding_mask.shape == (
+                batch_size,
+                seq_len,
+            ), key_padding_mask.shape
             attn_scores = attn_scores.masked_fill(
                 key_padding_mask.unsqueeze(1),
                 -1000,
@@ -1644,14 +1752,17 @@ class RelPositionMultiheadAttentionWeights(nn.Module):
         query_dim = query_head_dim * num_heads
 
         # self-attention
-        q = x[...,0:query_dim]
-        k = x[...,query_dim:2*query_dim]
+        q = x[..., 0:query_dim]
+        k = x[..., query_dim : 2 * query_dim]
         # p is the position-encoding query
-        p = x[...,2*query_dim:]
+        p = x[..., 2 * query_dim :]
         assert p.shape[-1] == num_heads * pos_head_dim
 
         # Pad cached left contexts
-        assert cached_key.shape[0] == left_context_len, (cached_key.shape[0], left_context_len)
+        assert cached_key.shape[0] == left_context_len, (
+            cached_key.shape[0],
+            left_context_len,
+        )
         k = torch.cat([cached_key, k], dim=0)
         # Update cached left contexts
         cached_key = k[-left_context_len:, ...]
@@ -1672,13 +1783,15 @@ class RelPositionMultiheadAttentionWeights(nn.Module):
 
         pos_emb = self.linear_pos(pos_emb)
         seq_len2 = 2 * seq_len - 1 + left_context_len
-        pos_emb = pos_emb.reshape(-1, seq_len2, num_heads, pos_head_dim).permute(2, 0, 3, 1)
+        pos_emb = pos_emb.reshape(-1, seq_len2, num_heads, pos_head_dim).permute(
+            2, 0, 3, 1
+        )
         # pos shape now: (head, {1 or batch_size}, pos_dim, seq_len2)
 
         # (head, batch, time1, pos_dim) x (head, 1, pos_dim, seq_len2) -> (head, batch, time1, seq_len2)
         #  [where seq_len2 represents relative position.]
         pos_scores = torch.matmul(p, pos_emb)
-        
+
         if torch.jit.is_tracing():
             (num_heads, batch_size, time1, n) = pos_scores.shape
             rows = torch.arange(start=time1 - 1, end=-1, step=-1)
@@ -1692,16 +1805,25 @@ class RelPositionMultiheadAttentionWeights(nn.Module):
         # to absolute position.  I don't know whether I might have got the time-offsets backwards or
         # not, but let this code define which way round it is supposed to be.
         else:
-            pos_scores = pos_scores.as_strided((num_heads, batch_size, seq_len, k_len),
-                                            (pos_scores.stride(0),
-                                                pos_scores.stride(1),
-                                                pos_scores.stride(2)-pos_scores.stride(3),
-                                                pos_scores.stride(3)),
-                                            storage_offset=pos_scores.stride(3) * (seq_len - 1))
+            pos_scores = pos_scores.as_strided(
+                (num_heads, batch_size, seq_len, k_len),
+                (
+                    pos_scores.stride(0),
+                    pos_scores.stride(1),
+                    pos_scores.stride(2) - pos_scores.stride(3),
+                    pos_scores.stride(3),
+                ),
+                storage_offset=pos_scores.stride(3) * (seq_len - 1),
+            )
 
         attn_scores = attn_scores + pos_scores
 
-        assert attn_scores.shape == (num_heads, batch_size, seq_len, k_len), attn_scores.shape
+        assert attn_scores.shape == (
+            num_heads,
+            batch_size,
+            seq_len,
+            k_len,
+        ), attn_scores.shape
 
         if key_padding_mask is not None:
             assert key_padding_mask.shape == (batch_size, k_len), key_padding_mask.shape
@@ -1714,18 +1836,21 @@ class RelPositionMultiheadAttentionWeights(nn.Module):
 
         return attn_weights, cached_key
 
-    def _print_attn_entropy(
-            self,
-            attn_weights: Tensor):
+    def _print_attn_entropy(self, attn_weights: Tensor):
         # attn_weights: (num_heads, batch_size, seq_len, seq_len)
         (num_heads, batch_size, seq_len, seq_len) = attn_weights.shape
 
         with torch.no_grad():
             with torch.cuda.amp.autocast(enabled=False):
                 attn_weights = attn_weights.to(torch.float32)
-                attn_weights_entropy = -((attn_weights + 1.0e-20).log() * attn_weights).sum(
-                    dim=-1).mean(dim=(1,2))
-                logging.info(f"name={self.name}, attn_weights_entropy = {attn_weights_entropy}")
+                attn_weights_entropy = (
+                    -((attn_weights + 1.0e-20).log() * attn_weights)
+                    .sum(dim=-1)
+                    .mean(dim=(1, 2))
+                )
+                logging.info(
+                    f"name={self.name}, attn_weights_entropy = {attn_weights_entropy}"
+                )
 
 
 class SelfAttention(nn.Module):
@@ -1738,25 +1863,26 @@ class SelfAttention(nn.Module):
           num_heads: the number of attention heads
           value_head_dim: the value dimension per head
     """
+
     def __init__(
-            self,
-            embed_dim: int,
-            num_heads: int,
-            value_head_dim: int,
+        self,
+        embed_dim: int,
+        num_heads: int,
+        value_head_dim: int,
     ) -> None:
         super().__init__()
-        self.in_proj = nn.Linear(embed_dim,
-                                 num_heads * value_head_dim,
-                                 bias=True)
+        self.in_proj = nn.Linear(embed_dim, num_heads * value_head_dim, bias=True)
 
-        self.out_proj = ScaledLinear(num_heads * value_head_dim,
-                                     embed_dim, bias=True,
-                                     initial_scale=0.05)
+        self.out_proj = ScaledLinear(
+            num_heads * value_head_dim, embed_dim, bias=True, initial_scale=0.05
+        )
 
-        self.whiten = Whiten(num_groups=1,
-                             whitening_limit=_whitening_schedule(7.5, ratio=3.0),
-                             prob=(0.025, 0.25),
-                             grad_scale=0.01)
+        self.whiten = Whiten(
+            num_groups=1,
+            whitening_limit=_whitening_schedule(7.5, ratio=3.0),
+            prob=(0.025, 0.25),
+            grad_scale=0.01,
+        )
 
     def forward(
         self,
@@ -1785,8 +1911,11 @@ class SelfAttention(nn.Module):
         x = torch.matmul(attn_weights, x)
         # v: (num_heads, batch_size, seq_len, value_head_dim)
 
-        x = x.permute(2, 1, 0, 3).contiguous().view(
-            seq_len, batch_size, num_heads * value_head_dim)
+        x = (
+            x.permute(2, 1, 0, 3)
+            .contiguous()
+            .view(seq_len, batch_size, num_heads * value_head_dim)
+        )
 
         # returned value is of shape (seq_len, batch_size, embed_dim), like the input.
         x = self.out_proj(x)
@@ -1823,7 +1952,10 @@ class SelfAttention(nn.Module):
         x = self.in_proj(x)  # (seq_len, batch_size, num_heads * value_head_dim)
 
         # Pad cached left contexts
-        assert cached_val.shape[0] == left_context_len, (cached_val.shape[0], left_context_len)
+        assert cached_val.shape[0] == left_context_len, (
+            cached_val.shape[0],
+            left_context_len,
+        )
         x = torch.cat([cached_val, x], dim=0)
         # Update cached left contexts
         cached_val = x[-left_context_len:, ...]
@@ -1836,8 +1968,11 @@ class SelfAttention(nn.Module):
         x = torch.matmul(attn_weights, x)
         # v: (num_heads, batch_size, seq_len, value_head_dim)
 
-        x = x.permute(2, 1, 0, 3).contiguous().view(
-            seq_len, batch_size, num_heads * value_head_dim)
+        x = (
+            x.permute(2, 1, 0, 3)
+            .contiguous()
+            .view(seq_len, batch_size, num_heads * value_head_dim)
+        )
 
         # returned value is of shape (seq_len, batch_size, embed_dim), like the input.
         x = self.out_proj(x)
@@ -1846,33 +1981,38 @@ class SelfAttention(nn.Module):
 
 
 class FeedforwardModule(nn.Module):
-    """Feedforward module in Zipformer2 model.
-    """
-    def __init__(self,
-                 embed_dim: int,
-                 feedforward_dim: int,
-                 dropout: FloatLike):
+    """Feedforward module in Zipformer2 model."""
+
+    def __init__(self, embed_dim: int, feedforward_dim: int, dropout: FloatLike):
         super(FeedforwardModule, self).__init__()
         self.in_proj = nn.Linear(embed_dim, feedforward_dim)
 
-        self.hidden_balancer = Balancer(feedforward_dim,
-                                        channel_dim=-1,
-                                        min_positive=0.3,
-                                        max_positive=1.0,
-                                        min_abs=0.75,
-                                        max_abs=5.0)
+        self.hidden_balancer = Balancer(
+            feedforward_dim,
+            channel_dim=-1,
+            min_positive=0.3,
+            max_positive=1.0,
+            min_abs=0.75,
+            max_abs=5.0,
+        )
 
         # shared_dim=0 means we share the dropout mask along the time axis
-        self.out_proj = ActivationDropoutAndLinear(feedforward_dim, embed_dim,
-                                                   activation='SwooshL',
-                                                   dropout_p=dropout,
-                                                   dropout_shared_dim=0, bias=True,
-                                                   initial_scale=0.1)
+        self.out_proj = ActivationDropoutAndLinear(
+            feedforward_dim,
+            embed_dim,
+            activation="SwooshL",
+            dropout_p=dropout,
+            dropout_shared_dim=0,
+            bias=True,
+            initial_scale=0.1,
+        )
 
-        self.out_whiten = Whiten(num_groups=1,
-                                 whitening_limit=_whitening_schedule(7.5),
-                                 prob=(0.025, 0.25),
-                                 grad_scale=0.01)
+        self.out_whiten = Whiten(
+            num_groups=1,
+            whitening_limit=_whitening_schedule(7.5),
+            prob=(0.025, 0.25),
+            grad_scale=0.01,
+        )
 
     def forward(self, x: Tensor):
         x = self.in_proj(x)
@@ -1893,9 +2033,9 @@ class NonlinAttention(nn.Module):
     """
 
     def __init__(
-            self,
-            channels: int,
-            hidden_channels: int,
+        self,
+        channels: int,
+        hidden_channels: int,
     ) -> None:
         super().__init__()
 
@@ -1908,7 +2048,8 @@ class NonlinAttention(nn.Module):
         # starting from about 3, and poorly-trained instances of the module have smaller abs values
         # before the sigmoid.
         self.balancer = Balancer(
-            hidden_channels, channel_dim=-1,
+            hidden_channels,
+            channel_dim=-1,
             min_positive=ScheduledFloat((0.0, 0.25), (20000.0, 0.05)),
             max_positive=ScheduledFloat((0.0, 0.75), (20000.0, 0.95)),
             min_abs=0.5,
@@ -1920,19 +2061,23 @@ class NonlinAttention(nn.Module):
         self.identity2 = Identity()  # for diagnostics.
         self.identity3 = Identity()  # for diagnostics.
 
-        self.out_proj = ScaledLinear(hidden_channels, channels,
-                                     bias=True,
-                                     initial_scale=0.05)
+        self.out_proj = ScaledLinear(
+            hidden_channels, channels, bias=True, initial_scale=0.05
+        )
 
-        self.whiten1 = Whiten(num_groups=1,
-                              whitening_limit=_whitening_schedule(5.0),
-                              prob=(0.025, 0.25),
-                              grad_scale=0.01)
+        self.whiten1 = Whiten(
+            num_groups=1,
+            whitening_limit=_whitening_schedule(5.0),
+            prob=(0.025, 0.25),
+            grad_scale=0.01,
+        )
 
-        self.whiten2 = Whiten(num_groups=1,
-                              whitening_limit=_whitening_schedule(5.0, ratio=3.0),
-                              prob=(0.025, 0.25),
-                              grad_scale=0.01)
+        self.whiten2 = Whiten(
+            num_groups=1,
+            whitening_limit=_whitening_schedule(5.0, ratio=3.0),
+            prob=(0.025, 0.25),
+            grad_scale=0.01,
+        )
 
     def forward(
         self,
@@ -1940,11 +2085,11 @@ class NonlinAttention(nn.Module):
         attn_weights: Tensor,
     ) -> Tensor:
         """.
-        Args:
-           x: a Tensor of shape (seq_len, batch_size, num_channels)
-attn_weights: a Tensor of shape (num_heads, batch_size, seq_len, seq_len)
-        Returns:
-           a Tensor with the same shape as x
+        Args:
+           x: a Tensor of shape (seq_len, batch_size, num_channels)
+           attn_weights: a Tensor of shape (num_heads, batch_size, seq_len, seq_len)
+        Returns:
+           a Tensor with the same shape as x
         """
         x = self.in_proj(x)
 
@@ -2014,13 +2159,21 @@ attn_weights: a Tensor of shape (num_heads, batch_size, seq_len, seq_len)
 
         (seq_len, batch_size, embed_dim) = x.shape
         num_heads = attn_weights.shape[0]
-        assert attn_weights.shape == (num_heads, batch_size, seq_len, left_context_len + seq_len)
+        assert attn_weights.shape == (
+            num_heads,
+            batch_size,
+            seq_len,
+            left_context_len + seq_len,
+        )
 
         x = x.reshape(seq_len, batch_size, num_heads, -1).permute(2, 1, 0, 3)
         # now x: (num_heads, batch_size, seq_len, head_dim)
 
         # Pad cached tensor
-        assert cached_x.shape[2] == left_context_len, (cached_x.shape[2], left_context_len)
+        assert cached_x.shape[2] == left_context_len, (
+            cached_x.shape[2],
+            left_context_len,
+        )
         x_pad = torch.cat([cached_x, x], dim=2)
         # Update cached tensor
         cached_x = x_pad[:, :, -left_context_len:, :]
@@ -2045,8 +2198,12 @@ class ConvolutionModule(nn.Module):
         bias (bool): Whether to use bias in conv layers (default=True).
 
     """
+
     def __init__(
-            self, channels: int, kernel_size: int, causal: bool,
+        self,
+        channels: int,
+        kernel_size: int,
+        causal: bool,
     ) -> None:
         """Construct a ConvolutionModule object."""
         super(ConvolutionModule, self).__init__()
@@ -2057,7 +2214,8 @@ class ConvolutionModule(nn.Module):
         self.causal = causal
 
         self.in_proj = nn.Linear(
-            channels, 2 * bottleneck_dim,
+            channels,
+            2 * bottleneck_dim,
         )
         # the gradients on in_proj are a little noisy, likely to do with the
         # sigmoid in glu.
@@ -2076,7 +2234,8 @@ class ConvolutionModule(nn.Module):
         # it will be in a better position to start learning something, i.e. to latch onto
         # the correct range.
         self.balancer1 = Balancer(
-            bottleneck_dim, channel_dim=-1,
+            bottleneck_dim,
+            channel_dim=-1,
             min_positive=ScheduledFloat((0.0, 0.05), (8000.0, 0.025)),
             max_positive=1.0,
             min_abs=1.5,
@@ -2091,31 +2250,40 @@ class ConvolutionModule(nn.Module):
 
         assert kernel_size % 2 == 1
 
-        self.depthwise_conv = ChunkCausalDepthwiseConv1d(
-            channels=bottleneck_dim,
-            kernel_size=kernel_size) if causal else nn.Conv1d(
-            in_channels=bottleneck_dim,
-            out_channels=bottleneck_dim,
-            groups=bottleneck_dim,
-            kernel_size=kernel_size,
-            padding=kernel_size // 2)
+        self.depthwise_conv = (
+            ChunkCausalDepthwiseConv1d(channels=bottleneck_dim, kernel_size=kernel_size)
+            if causal
+            else nn.Conv1d(
+                in_channels=bottleneck_dim,
+                out_channels=bottleneck_dim,
+                groups=bottleneck_dim,
+                kernel_size=kernel_size,
+                padding=kernel_size // 2,
+            )
+        )
 
         self.balancer2 = Balancer(
-            bottleneck_dim, channel_dim=1,
+            bottleneck_dim,
+            channel_dim=1,
             min_positive=ScheduledFloat((0.0, 0.1), (8000.0, 0.05)),
             max_positive=1.0,
             min_abs=ScheduledFloat((0.0, 0.2), (20000.0, 0.5)),
             max_abs=10.0,
         )
 
-        self.whiten = Whiten(num_groups=1,
-                             whitening_limit=_whitening_schedule(7.5),
-                             prob=(0.025, 0.25),
-                             grad_scale=0.01)
+        self.whiten = Whiten(
+            num_groups=1,
+            whitening_limit=_whitening_schedule(7.5),
+            prob=(0.025, 0.25),
+            grad_scale=0.01,
+        )
 
         self.out_proj = ActivationDropoutAndLinear(
-            bottleneck_dim, channels, activation='SwooshR',
-            dropout_p=0.0, initial_scale=0.05,
+            bottleneck_dim,
+            channels,
+            activation="SwooshR",
+            dropout_p=0.0,
+            initial_scale=0.05,
         )
 
     def forward(
@@ -2153,9 +2321,15 @@ class ConvolutionModule(nn.Module):
         if src_key_padding_mask is not None:
             x = x.masked_fill(src_key_padding_mask.unsqueeze(1).expand_as(x), 0.0)
 
-        if not torch.jit.is_scripting() and not torch.jit.is_tracing() and chunk_size >= 0:
+        if (
+            not torch.jit.is_scripting()
+            and not torch.jit.is_tracing()
+            and chunk_size >= 0
+        ):
             # Not support exporting a model for simulated streaming decoding
-            assert self.causal, "Must initialize model with causal=True if you use chunk_size"
+            assert (
+                self.causal
+            ), "Must initialize model with causal=True if you use chunk_size"
             x = self.depthwise_conv(x, chunk_size=chunk_size)
         else:
             x = self.depthwise_conv(x)
@@ -2225,10 +2399,12 @@ def _test_zipformer_main(causal: bool = False):
     # Just make sure the forward pass runs.
 
     c = Zipformer2(
-        encoder_dim=(64, 96), encoder_unmasked_dim=(48, 64), num_heads=(4, 4),
+        encoder_dim=(64, 96),
+        encoder_unmasked_dim=(48, 64),
+        num_heads=(4, 4),
         causal=causal,
         chunk_size=(4,) if causal else (-1,),
-        left_context_frames=(64,)
+        left_context_frames=(64,),
     )
     batch_size = 5
     seq_len = 20