diff --git a/egs/libriheavy/ASR/zipformer_prompt_asr/context_fuser.py b/egs/libriheavy/ASR/zipformer_prompt_asr/context_fuser.py deleted file mode 100644 index aabae2bbf..000000000 --- a/egs/libriheavy/ASR/zipformer_prompt_asr/context_fuser.py +++ /dev/null @@ -1,141 +0,0 @@ -# Copyright 2023 Xiaomi Corp. (authors: Xiaoyu Yang) -# -# See ../../../../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch -import torch.nn as nn - -from scaling import ScaledLinear, softmax - -from icefall.utils import make_pad_mask - -class ContextFuser(nn.Module): - def __init__( - self, - embed_dim: int = 256, - ): - super().__init__() - - self.embed_dim = embed_dim - - def forward( - self, - context: torch.Tensor, - context_lens: torch.Tensor=None, - padding_mask: torch.Tensor=None - ) -> torch.Tensor: - """A module fusing the context embedding vectors - - Args: - context (torch.Tensor): The context embeddings, (B,W,C) - context_lens (torch.Tensor): The length of context embeddings, (B,) - - Returns: - torch.Tensor: The fused context embeddings, (B,C) - """ - batch_size = context.size(0) - if padding_mask is None: - assert context_lens is not None - padding_mask = make_pad_mask(context_lens).unsqueeze(-1) - else: - if padding_mask.ndim != 3: - padding_mask = padding_mask.unsqueeze(-1) - context.masked_fill_(padding_mask, 0) - - if context_lens is None: - max_len = padding_mask.size(1) - context_lens = max_len - padding_mask.sum(dim=1) + 1e-5 # to prevent 0 - - # by a small probability, dropout the context of a few samples - context_dropout_rate = 0.05 - m = torch.rand((batch_size, 1, 1), device=context.device) > context_dropout_rate - context = context * m - - # average the context - context = context.sum(dim=1)/context_lens - - return context - -class SelfAttContextFuser(nn.Module): - def __init__( - self, - embed_dim: int = 384, - query_head_dim: int = 128, - nhead: int=4, - context_dropout_rate: float=0.05, - ): - """ContextFuser with multi-head self-attention - - Args: - embed_dim (int, optional): The input embedding dim. Defaults to 256. - nhead (int, optional): The number of heads. Defaults to 4. 
- """ - - super().__init__() - - self.embed_dim = embed_dim - self.nhead = nhead - self.query_head_dim = query_head_dim - - self.in_proj = ScaledLinear(embed_dim, nhead * query_head_dim) - self.weight_proj = ScaledLinear(nhead * query_head_dim, nhead) - self.context_dropout_rate = context_dropout_rate - - def forward( - self, - context: torch.Tensor, - context_lens: torch.Tensor=None, - padding_mask: torch.Tensor=None, - ) -> torch.Tensor: - """A module fusing the context embedding vectors - - Args: - context (torch.Tensor): The context embeddings, (B,W,C) - context_lens (torch.Tensor): The length of context embeddings, (B,) - padding_mask (torch.Tensor): A padding mask (B,W) - - Returns: - torch.Tensor: The fused context embeddings, (B,C) - """ - batch_size = context.size(0) - if padding_mask is None: - assert context_lens is not None - padding_mask = make_pad_mask(context_lens).unsqueeze(-1) - else: - if padding_mask.ndim != 3: - padding_mask = padding_mask.unsqueeze(-1) - # context.masked_fill_(padding_mask, 0) - - if context_lens is None: - max_len = padding_mask.size(1) - context_lens = max_len - padding_mask.sum(dim=1) + 1e-5 # to prevent 0 - - k = self.in_proj(context) # (B,W,C) - w = self.weight_proj(torch.tanh(k)) # (B,W,num_heads) - - w.masked_fill_(padding_mask, -1000) - w = softmax(w, dim=1) # (B,W,num_heads) - w = w.permute(0,2,1) # (B,num_heads, W) - - # reweight and concat the context embeddings - context = torch.matmul(w, context).view(batch_size, -1) # (B, num_heads * C) - - # by a small probability, dropout the context of a few samples - if self.training: - m = torch.rand((batch_size, 1), device=context.device) > self.context_dropout_rate - context = context * m - #context = context * 0.0 - - return context \ No newline at end of file diff --git a/egs/libriheavy/ASR/zipformer_prompt_asr/train_baseline.py b/egs/libriheavy/ASR/zipformer_prompt_asr/train_baseline.py index 39662e806..7075c9154 100644 --- a/egs/libriheavy/ASR/zipformer_prompt_asr/train_baseline.py +++ b/egs/libriheavy/ASR/zipformer_prompt_asr/train_baseline.py @@ -20,26 +20,29 @@ """ Usage: -export CUDA_VISIBLE_DEVICES="0,1,2,3" - -./pruned_transducer_stateless7/train.py \ - --world-size 4 \ - --num-epochs 30 \ - --start-epoch 1 \ - --exp-dir pruned_transducer_stateless7/exp \ - --full-libri 1 \ - --max-duration 300 # For mix precision training: -./pruned_transducer_stateless7/train.py \ +export CUDA_VISIBLE_DEVICES="0,1,2,3" + +./zipformer/train.py \ --world-size 4 \ --num-epochs 30 \ --start-epoch 1 \ --use-fp16 1 \ - --exp-dir pruned_transducer_stateless7/exp \ - --full-libri 1 \ - --max-duration 550 + --exp-dir zipformer/exp \ + --max-duration 1000 + +# To train a streaming model + +./zipformer/train.py \ + --world-size 4 \ + --num-epochs 30 \ + --start-epoch 1 \ + --use-fp16 1 \ + --causal 1 + --exp-dir zipformer/exp \ + --max-duration 1000 """ @@ -54,7 +57,6 @@ from shutil import copyfile from typing import Any, Dict, List, Optional, Tuple, Union import k2 -import numpy import optim import sentencepiece as spm import torch @@ -70,11 +72,11 @@ from model_baseline import Transducer from optim import Eden, ScaledAdam from scaling import ScheduledFloat from subsampling import Conv2dSubsampling +from text_normalization import train_text_normalization, upper_only_alpha from torch import Tensor from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter -from text_normalization import train_text_normalization, 
upper_only_alpha from zipformer import Zipformer2 from icefall import diagnostics @@ -95,41 +97,28 @@ from icefall.utils import ( str2bool, ) -LRSchedulerType = Union[ - torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler -] +LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler] -def random_sampling(texts: List[str]) -> str: - return random.choice(texts) - -def joint_random_sampling(texts: List[str], pre_texts: List[str]) -> str: - i = random.randint(0, 1) - out = { - "text": texts[i], - "pre_text": pre_texts[i] - } - return out - def get_first( texts: List[str], pre_texts: List[str], context_list: Optional[str] = None, rare_word_list: Optional[List[str]] = None, ) -> str: - out = { - "text": texts[0], - "pre_text": pre_texts[0] - } + # Always get the first one, which is the mixed-cased text with punctuation + out = {"text": texts[0], "pre_text": pre_texts[0]} return out + def get_upper_only_alpha( texts: List[str], pre_texts: List[str], context_list: Optional[str] = None, rare_word_list: Optional[List[str]] = None, ) -> str: - # Always get the first one, which is the gt (mixed-cased trans), but with upper_only_alpha + # Always get the first one, which is the mixed-cased text with punctuation, + # but uppercased and with punctuation removed out = { "text": upper_only_alpha(texts[0]), "pre_text": upper_only_alpha(pre_texts[0]), @@ -252,13 +241,12 @@ def add_model_arguments(parser: argparse.ArgumentParser): default=512, help="Embedding dimension in the decoder model.", ) - + parser.add_argument( "--context-size", type=int, default=2, - help="The context size in the decoder. 1 means bigram; " - "2 means tri-gram", + help="The context size in the decoder. 1 means bigram; " "2 means tri-gram", ) parser.add_argument( @@ -413,8 +401,7 @@ def get_parser(): "--am-scale", type=float, default=0.0, - help="The scale to smooth the loss with am (output of encoder network)" - "part.", + help="The scale to smooth the loss with am (output of encoder network) " "part.", ) parser.add_argument( @@ -582,7 +569,8 @@ def get_encoder_embed(params: AttributeDict) -> nn.Module: dropout=ScheduledFloat((0.0, 0.3), (20000.0, 0.1)), ) return encoder_embed - + + def get_encoder_model(params: AttributeDict) -> nn.Module: encoder = Zipformer2( output_downsampling_factor=2, @@ -789,11 +777,7 @@ def compute_loss( warmup: a floating point value which increases throughout training; values >= 1.0 are fully warmed up and have all modules present. """ - device = ( - model.device - if isinstance(model, DDP) - else next(model.parameters()).device - ) + device = model.device if isinstance(model, DDP) else next(model.parameters()).device feature = batch["inputs"] # at entry, feature is (N, T, C) assert feature.ndim == 3 @@ -809,7 +793,7 @@ def compute_loss( texts = [train_text_normalization(t) for t in texts] y = sp.encode(texts, out_type=int) y = k2.RaggedTensor(y).to(device) - + if random.random() < 0.02: logging.info(f"Ref texts: {texts[0]}") @@ -844,9 +828,7 @@ def compute_loss( info = MetricsTracker() with warnings.catch_warnings(): warnings.simplefilter("ignore") - info["frames"] = ( - (feature_lens // params.subsampling_factor).sum().item() - ) + info["frames"] = (feature_lens // params.subsampling_factor).sum().item() # Note: We use reduction=sum while computing the loss. info["loss"] = loss.detach().cpu().item() @@ -1035,9 +1017,7 @@ def train_one_epoch( # behavior depending on the current grad scale.
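# A minimal illustration of the warm-up heuristic implemented just below,
# assuming the scale starts at 1.0 (run() constructs
# GradScaler(enabled=params.use_fp16, init_scale=1.0)): the scale is doubled
# on every batch while it is below 8.0, and once every 400 batches while it
# is below 32.0.
#
#     scale = 1.0
#     for batch_idx in range(801):
#         if scale < 8.0 or (scale < 32.0 and batch_idx % 400 == 0):
#             scale *= 2.0
#     # scale reaches 8.0 after 3 batches and 32.0 by batch 800; beyond that,
#     # only GradScaler's own growth-interval logic increases it further.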
cur_grad_scale = scaler._scale.item() - if cur_grad_scale < 8.0 or ( - cur_grad_scale < 32.0 and batch_idx % 400 == 0 - ): + if cur_grad_scale < 8.0 or (cur_grad_scale < 32.0 and batch_idx % 400 == 0): scaler.update(cur_grad_scale * 2.0) if cur_grad_scale < 0.01: if not saved_bad_model: @@ -1059,11 +1039,7 @@ def train_one_epoch( f"batch {batch_idx}, loss[{loss_info}], " f"tot_loss[{tot_loss}], batch size: {batch_size}, " f"lr: {cur_lr:.2e}, " - + ( - f"grad_scale: {scaler._scale.item()}" - if params.use_fp16 - else "" - ) + + (f"grad_scale: {scaler._scale.item()}" if params.use_fp16 else "") ) if tb_writer is not None: @@ -1074,9 +1050,7 @@ def train_one_epoch( loss_info.write_summary( tb_writer, "train/current_", params.batch_idx_train ) - tot_loss.write_summary( - tb_writer, "train/tot_", params.batch_idx_train - ) + tot_loss.write_summary(tb_writer, "train/tot_", params.batch_idx_train) if params.use_fp16: tb_writer.add_scalar( "train/grad_scale", @@ -1084,10 +1058,7 @@ def train_one_epoch( params.batch_idx_train, ) - if ( - batch_idx % params.valid_interval == 0 - and not params.print_diagnostics - ): + if batch_idx % params.valid_interval == 0 and not params.print_diagnostics: logging.info("Computing validation loss") valid_info = compute_validation_loss( params=params, @@ -1177,9 +1148,7 @@ def run(rank, world_size, args): model = DDP(model, device_ids=[rank], find_unused_parameters=True) optimizer = ScaledAdam( - get_parameter_groups_with_lrs( - model, lr=params.base_lr, include_names=True - ), + get_parameter_groups_with_lrs(model, lr=params.base_lr, include_names=True), lr=params.base_lr, # should have no effect clipping_scale=2.0, ) @@ -1200,7 +1169,7 @@ def run(rank, world_size, args): if params.print_diagnostics: opts = diagnostics.TensorDiagnosticOptions( - 2 ** 22 + 2**22 ) # allow 4 megabytes per sub-module diagnostic = diagnostics.attach_diagnostics(model, opts) @@ -1210,7 +1179,7 @@ def run(rank, world_size, args): libriheavy = LibriHeavyAsrDataModule(args) train_cuts = libriheavy.train_cuts() - + def remove_short_and_long_utt(c: Cut): # Keep only utterances with duration between 1 second and 20 seconds # @@ -1265,14 +1234,14 @@ def run(rank, world_size, args): valid_cuts = libriheavy.dev_cuts() valid_dl = libriheavy.valid_dataloaders(valid_cuts) - # if not params.print_diagnostics: - # scan_pessimistic_batches_for_oom( - # model=model, - # train_dl=train_dl, - # optimizer=optimizer, - # sp=sp, - # params=params, - # ) + if not params.print_diagnostics: + scan_pessimistic_batches_for_oom( + model=model, + train_dl=train_dl, + optimizer=optimizer, + sp=sp, + params=params, + ) scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: diff --git a/egs/libriheavy/ASR/zipformer_prompt_asr/zipformer.py b/egs/libriheavy/ASR/zipformer_prompt_asr/zipformer.py index 87fa3b5eb..d1cf90ffb 100644 --- a/egs/libriheavy/ASR/zipformer_prompt_asr/zipformer.py +++ b/egs/libriheavy/ASR/zipformer_prompt_asr/zipformer.py @@ -1,5 +1,7 @@ #!/usr/bin/env python3 -# Copyright (c) 2021 University of Chinese Academy of Sciences (author: Han Zhu) +# Copyright 2022-2023 Xiaomi Corp. (authors: Daniel Povey, +# Zengwei Yao, +# Xiaoyu Yang) # # See ../../../../LICENSE for clarification regarding multiple authors # @@ -16,28 +18,33 @@ # limitations under the License. 
import copy +import logging import math +import random import warnings from typing import List, Optional, Tuple, Union -import logging + import torch -import random from encoder_interface import EncoderInterface from scaling import ( + Identity, # more friendly to backward hooks than nn.Identity(), for diagnostic reasons. +) +from scaling import ( + ScaledLinear, # not as in other dirs.. just scales down initial parameter values. +) +from scaling import ( + ActivationDropoutAndLinear, Balancer, BiasNorm, - Dropout2, ChunkCausalDepthwiseConv1d, - ActivationDropoutAndLinear, - ScaledLinear, # not as in other dirs.. just scales down initial parameter values. + Dropout2, + FloatLike, + ScheduledFloat, Whiten, - Identity, # more friendly to backward hooks than nn.Identity(), for diagnostic reasons. + convert_num_channels, + limit_param_value, penalize_abs_values_gt, softmax, - ScheduledFloat, - FloatLike, - limit_param_value, - convert_num_channels, ) from torch import Tensor, nn @@ -92,41 +99,43 @@ class Zipformer2(EncoderInterface): memory_dim: if supplied and >0, will be the dimension of the memory embeddings passed into the zipformer (e.g. this might be the output of another Zipformer used to create embedding vectors.) - memory_dropout_rate: By this probability, do not use the provided memory for + memory_dropout_rate: With this probability, do not use the provided memory for cross-attention. This should give robustness to the model when evaluated without memory. + memory_layer: if supplied and >0, only add the cross-attention module starting from + the specified layer. """ + def __init__( - self, - output_downsampling_factor: int = 2, - downsampling_factor: Tuple[int] = (2, 4), - encoder_dim: Union[int, Tuple[int]] = 384, - num_encoder_layers: Union[int, Tuple[int]] = 4, - encoder_unmasked_dim: Union[int, Tuple[int]] = 256, - query_head_dim: Union[int, Tuple[int]] = 24, - pos_head_dim: Union[int, Tuple[int]] = 4, - value_head_dim: Union[int, Tuple[int]] = 12, - num_heads: Union[int, Tuple[int]] = 8, - feedforward_dim: Union[int, Tuple[int]] = 1536, - cnn_module_kernel: Union[int, Tuple[int]] = 31, - memory_dim: int = -1, - memory_dropout_rate: float = 0.05, - memory_layer: int = 0, - pos_dim: int = 192, - dropout: FloatLike = None, # see code below for default - warmup_batches: float = 4000.0, - causal: bool = False, - chunk_size: Tuple[int] = (-1,), - left_context_frames: Tuple[int] = (-1,), + self, + output_downsampling_factor: int = 2, + downsampling_factor: Tuple[int] = (2, 4), + encoder_dim: Union[int, Tuple[int]] = 384, + num_encoder_layers: Union[int, Tuple[int]] = 4, + encoder_unmasked_dim: Union[int, Tuple[int]] = 256, + query_head_dim: Union[int, Tuple[int]] = 24, + pos_head_dim: Union[int, Tuple[int]] = 4, + value_head_dim: Union[int, Tuple[int]] = 12, + num_heads: Union[int, Tuple[int]] = 8, + feedforward_dim: Union[int, Tuple[int]] = 1536, + cnn_module_kernel: Union[int, Tuple[int]] = 31, + pos_dim: int = 192, + dropout: FloatLike = None, # see code below for default + warmup_batches: float = 4000.0, + causal: bool = False, + chunk_size: Tuple[int] = [-1], + left_context_frames: Tuple[int] = [-1], + memory_dim: int = -1, + memory_dropout_rate: float = 0.05, + memory_layer: int = 0, ) -> None: super(Zipformer2, self).__init__() if dropout is None: - dropout = ScheduledFloat((0.0, 0.3), - (20000.0, 0.1)) + dropout = ScheduledFloat((0.0, 0.3), (20000.0, 0.1)) def _to_tuple(x): - """ Converts a single int or a 1-tuple of an int to a tuple with the same length + """Converts a single int
or a 1-tuple of an int to a tuple with the same length as downsampling_factor""" if isinstance(x, int): x = (x,) @@ -136,15 +145,18 @@ class Zipformer2(EncoderInterface): assert len(x) == len(downsampling_factor) and isinstance(x[0], int) return x - self.output_downsampling_factor = output_downsampling_factor # int - self.downsampling_factor = downsampling_factor # tuple - self.encoder_dim = encoder_dim = _to_tuple(encoder_dim) # tuple - self.encoder_unmasked_dim = encoder_unmasked_dim = _to_tuple(encoder_unmasked_dim) # tuple + self.output_downsampling_factor = output_downsampling_factor # int + self.downsampling_factor = downsampling_factor # tuple + self.encoder_dim = encoder_dim = _to_tuple(encoder_dim) # tuple + self.encoder_unmasked_dim = encoder_unmasked_dim = _to_tuple( + encoder_unmasked_dim + ) # tuple num_encoder_layers = _to_tuple(num_encoder_layers) - query_head_dim = _to_tuple(query_head_dim) - value_head_dim = _to_tuple(value_head_dim) + self.num_encoder_layers = num_encoder_layers + self.query_head_dim = query_head_dim = _to_tuple(query_head_dim) + self.value_head_dim = value_head_dim = _to_tuple(value_head_dim) pos_head_dim = _to_tuple(pos_head_dim) - num_heads = _to_tuple(num_heads) + self.num_heads = num_heads = _to_tuple(num_heads) feedforward_dim = _to_tuple(feedforward_dim) self.cnn_module_kernel = cnn_module_kernel = _to_tuple(cnn_module_kernel) @@ -154,7 +166,7 @@ class Zipformer2(EncoderInterface): self.memory_dropout_rate = memory_dropout_rate self.memory_layer = memory_layer - for u,d in zip(encoder_unmasked_dim, encoder_dim): + for u, d in zip(encoder_unmasked_dim, encoder_dim): assert u <= d # each one will be Zipformer2Encoder or DownsampledZipformer2Encoder @@ -162,7 +174,6 @@ class Zipformer2(EncoderInterface): num_encoders = len(downsampling_factor) for i in range(num_encoders): - encoder_layer = Zipformer2EncoderLayer( embed_dim=encoder_dim[i], pos_dim=pos_dim, @@ -201,13 +212,11 @@ class Zipformer2(EncoderInterface): self.encoders = nn.ModuleList(encoders) - self.downsample_output = SimpleDownsample(max(encoder_dim), - downsample=output_downsampling_factor, - dropout=dropout) + self.downsample_output = SimpleDownsample( + max(encoder_dim), downsample=output_downsampling_factor, dropout=dropout + ) - def get_feature_masks( - self, - x: torch.Tensor) -> List[Union[float, Tensor]]: + def get_feature_masks(self, x: Tensor) -> Union[List[float], List[Tensor]]: """ In eval mode, returns [1.0] * num_encoders; in training mode, returns a number of randomized feature masks, one per encoder. @@ -225,25 +234,30 @@ class Zipformer2(EncoderInterface): """ num_encoders = len(self.encoder_dim) if not self.training: - return [ 1.0 ] * num_encoders + return [1.0] * num_encoders (num_frames0, batch_size, _encoder_dims0) = x.shape - assert self.encoder_dim[0] == _encoder_dims0 + assert self.encoder_dim[0] == _encoder_dims0, ( + self.encoder_dim[0], + _encoder_dims0, + ) feature_mask_dropout_prob = 0.125 # mask1 shape: (1, batch_size, 1) - mask1 = (torch.rand(1, batch_size, 1, - device=x.device) > - feature_mask_dropout_prob).to(x.dtype) + mask1 = ( + torch.rand(1, batch_size, 1, device=x.device) > feature_mask_dropout_prob + ).to(x.dtype) # mask2 has additional sequences masked, about twice the number. 
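# To make "about twice" concrete: a sequence survives mask1 with probability
# 1 - 0.125 = 0.875, but survives mask2 only if it also passes a second
# independent draw, i.e. with probability 0.875 ** 2 ~= 0.766, so roughly
# 23.4% of sequences are masked in mask2 versus 12.5% in mask1.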
- mask2 = torch.logical_and(mask1, - (torch.rand(1, batch_size, 1, - device=x.device) > - feature_mask_dropout_prob).to(x.dtype)) - + mask2 = torch.logical_and( + mask1, + ( + torch.rand(1, batch_size, 1, device=x.device) + > feature_mask_dropout_prob + ).to(x.dtype), + ) # dim: (1, batch_size, 2) mask = torch.cat((mask1, mask2), dim=-1) @@ -251,8 +265,9 @@ feature_masks = [] for i in range(num_encoders): channels = self.encoder_dim[i] - feature_mask = torch.ones(1, batch_size, channels, - dtype=x.dtype, device=x.device) + feature_mask = torch.ones( + 1, batch_size, channels, dtype=x.dtype, device=x.device + ) u1 = self.encoder_unmasked_dim[i] u2 = u1 + (channels - u1) // 2 @@ -263,24 +278,32 @@ return feature_masks - def get_chunk_info(self) -> Tuple[int, int]: """ Returns chunk_size and left_context_chunks. """ if not self.causal: return -1, -1 - chunk_size = random.choice(self.chunk_size) + + if torch.jit.is_scripting() or torch.jit.is_tracing(): + assert len(self.chunk_size) == 1, self.chunk_size + chunk_size = self.chunk_size[0] + else: + chunk_size = random.choice(self.chunk_size) if chunk_size == -1: left_context_chunks = -1 else: - left_context_frames = random.choice(self.left_context_frames) + if torch.jit.is_scripting() or torch.jit.is_tracing(): + assert len(self.left_context_frames) == 1, self.left_context_frames + left_context_frames = self.left_context_frames[0] + else: + left_context_frames = random.choice(self.left_context_frames) # Note: in Python, -1 // n == -1 for n > 0 left_context_chunks = left_context_frames // chunk_size if left_context_chunks == 0: left_context_chunks = 1 - return chunk_size, left_context_chunks + return chunk_size, left_context_chunks def forward( self, @@ -312,64 +335,68 @@ of frames in `embeddings` before padding. """ outputs = [] - feature_masks = self.get_feature_masks(x) + if torch.jit.is_scripting() or torch.jit.is_tracing(): + feature_masks = [1.0] * len(self.encoder_dim) + else: + feature_masks = self.get_feature_masks(x) chunk_size, left_context_chunks = self.get_chunk_info() - attn_mask = self._get_attn_mask(x, chunk_size, left_context_chunks) + if torch.jit.is_scripting() or torch.jit.is_tracing(): + # Exporting a model for simulated streaming decoding is not supported + attn_mask = None + else: + attn_mask = self._get_attn_mask(x, chunk_size, left_context_chunks) if self.training and memory is not None: batch_size = x.shape[1] # setting memory to zero should be equivalent to not using the # memory input at all, since the Attention module has no biases.
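# The keep-mask computed below has shape (batch_size, 1); broadcasting it
# against the memory tensor (presumably of shape (memory_len, batch_size,
# memory_dim)) zeroes out entire samples at once. A hypothetical shape check:
#
#     mem = torch.randn(50, 4, 256)   # (memory_len, batch_size, memory_dim)
#     keep = torch.rand(4, 1) > 0.05  # (batch_size, 1)
#     assert (mem * keep).shape == mem.shape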
- memory = memory * (torch.rand(batch_size, 1, device=memory.device) > - self.memory_dropout_rate) + memory = memory * ( + torch.rand(batch_size, 1, device=memory.device) + > self.memory_dropout_rate + ) for i, module in enumerate(self.encoders): ds = self.downsampling_factor[i] x = convert_num_channels(x, self.encoder_dim[i]) - x = module(x, - chunk_size=chunk_size, - feature_mask=feature_masks[i], - src_key_padding_mask=(None if src_key_padding_mask is None - else src_key_padding_mask[...,::ds]), - attn_mask=attn_mask, - memory=memory if i >= self.memory_layer else None, - memory_key_padding_mask=memory_key_padding_mask if i >= self.memory_layer else None, + x = module( + x, + chunk_size=chunk_size, + feature_mask=feature_masks[i], + src_key_padding_mask=( + None + if src_key_padding_mask is None + else src_key_padding_mask[..., ::ds] + ), + attn_mask=attn_mask, + memory=memory if i >= self.memory_layer else None, + memory_key_padding_mask=memory_key_padding_mask + if i >= self.memory_layer + else None, ) outputs.append(x) - def get_full_dim_output(): - num_encoders = len(self.encoder_dim) - assert len(outputs) == num_encoders - output_dim = max(self.encoder_dim) - output_pieces = [ outputs[-1] ] - cur_dim = self.encoder_dim[-1] - for i in range(num_encoders - 2, -1, -1): - d = self.encoder_dim[i] - if d > cur_dim: - this_output = outputs[i] - output_pieces.append(this_output[..., cur_dim:d]) - cur_dim = d - assert cur_dim == output_dim - return torch.cat(output_pieces, dim=-1) - # if the last output has the largest dimension, x will be unchanged, # it will be the same as outputs[-1]. Otherwise it will be concatenated # from different pieces of 'outputs', taking each dimension from the # most recent output that has it present. - x = get_full_dim_output() + x = self._get_full_dim_output(outputs) x = self.downsample_output(x) - - d = self.output_downsampling_factor - lengths = (x_lens + d - 1) // d + # class Downsample has this rounding behavior.. 
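# That is, ceiling division: the generic formula visible in the deleted lines
# above, lengths = (x_lens + d - 1) // d, is specialized to d == 2 as
# lengths = (x_lens + 1) // 2, hence the assertion on
# output_downsampling_factor just below.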
+ assert self.output_downsampling_factor == 2, self.output_downsampling_factor + if torch.jit.is_scripting() or torch.jit.is_tracing(): + lengths = (x_lens + 1) // 2 + else: + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + lengths = (x_lens + 1) // 2 return x, lengths - def _get_attn_mask(self, x: Tensor, - chunk_size: int, - left_context_chunks: int + def _get_attn_mask( + self, x: Tensor, chunk_size: int, left_context_chunks: int ) -> Optional[Tensor]: """ Return None if chunk_size == -1, else return attention mask of shape @@ -384,9 +411,11 @@ assert all(chunk_size % d == 0 for d in self.downsampling_factor) if left_context_chunks >= 0: num_encoders = len(self.encoder_dim) - assert all (chunk_size * left_context_chunks >= - (self.cnn_module_kernel[i] // 2) * self.downsampling_factor[i] - for i in range(num_encoders)) + assert all( + chunk_size * left_context_chunks + >= (self.cnn_module_kernel[i] // 2) * self.downsampling_factor[i] + for i in range(num_encoders) + ) else: left_context_chunks = 1000000 @@ -395,27 +424,159 @@ # t is frame index, shape (seq_len,) t = torch.arange(seq_len, dtype=torch.int32, device=x.device) # c is chunk index for each frame, shape (seq_len,) - c = t // chunk_size + if torch.jit.is_scripting() or torch.jit.is_tracing(): + c = t // chunk_size + else: + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + c = t // chunk_size src_c = c tgt_c = c.unsqueeze(-1) - attn_mask = torch.logical_or(src_c > tgt_c, - src_c < tgt_c - left_context_chunks) + attn_mask = torch.logical_or(src_c > tgt_c, src_c < tgt_c - left_context_chunks) if __name__ == "__main__": logging.info(f"attn_mask = {attn_mask}") return attn_mask + def _get_full_dim_output(self, outputs: List[Tensor]): + num_encoders = len(self.encoder_dim) + assert len(outputs) == num_encoders + output_dim = max(self.encoder_dim) + output_pieces = [outputs[-1]] + cur_dim = self.encoder_dim[-1] + for i in range(num_encoders - 2, -1, -1): + d = self.encoder_dim[i] + if d > cur_dim: + this_output = outputs[i] + output_pieces.append(this_output[..., cur_dim:d]) + cur_dim = d + assert cur_dim == output_dim + return torch.cat(output_pieces, dim=-1) + + def streaming_forward( + self, + x: Tensor, + x_lens: Tensor, + states: List[Tensor], + src_key_padding_mask: Tensor, + ) -> Tuple[Tensor, Tensor, List[Tensor]]: + """ + Args: + x: + The input tensor. Its shape is (seq_len, batch_size, feature_dim). + x_lens: + A tensor of shape (batch_size,) containing the number of frames in + `x` before padding. + states: list of cached tensors of all encoder layers. For layer-i, + states[i*6:(i+1)*6] is (cached_key, cached_nonlin_attn, cached_val1, cached_val2, + cached_conv1, cached_conv2). + src_key_padding_mask: + The mask for padding, of shape (batch_size, seq_len); True means + masked position. May be None. + Returns: + Return a tuple containing 3 elements: + - embeddings: its shape is (output_seq_len, batch_size, max(encoder_dim)) + - lengths, a tensor of shape (batch_size,) containing the number + of frames in `embeddings` before padding.
+ - updated states + """ + outputs = [] + new_states = [] + layer_offset = 0 + + for i, module in enumerate(self.encoders): + num_layers = module.num_layers + ds = self.downsampling_factor[i] + x = convert_num_channels(x, self.encoder_dim[i]) + + x, new_layer_states = module.streaming_forward( + x, + states=states[layer_offset * 6 : (layer_offset + num_layers) * 6], + left_context_len=self.left_context_frames[0] // ds, + src_key_padding_mask=src_key_padding_mask[..., ::ds], + ) + layer_offset += num_layers + outputs.append(x) + new_states += new_layer_states + + # if the last output has the largest dimension, x will be unchanged, + # it will be the same as outputs[-1]. Otherwise it will be concatenated + # from different pieces of 'outputs', taking each dimension from the + # most recent output that has it present. + x = self._get_full_dim_output(outputs) + x = self.downsample_output(x) + # class Downsample has this rounding behavior.. + assert self.output_downsampling_factor == 2 + if torch.jit.is_scripting() or torch.jit.is_tracing(): + lengths = (x_lens + 1) // 2 + else: + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + lengths = (x_lens + 1) // 2 + + return x, lengths, new_states + + @torch.jit.export + def get_init_states( + self, + batch_size: int = 1, + device: torch.device = torch.device("cpu"), + ) -> List[Tensor]: + """Get initial states. + + A list of cached tensors of all encoder layers. For layer-i, states[i*6:(i+1)*6] + is (cached_key, cached_nonlin_attn, cached_val1, cached_val2, cached_conv1, cached_conv2). + """ + states = [] + for i, module in enumerate(self.encoders): + num_layers = module.num_layers + embed_dim = self.encoder_dim[i] + ds = self.downsampling_factor[i] + num_heads = self.num_heads[i] + key_dim = self.query_head_dim[i] * num_heads + value_dim = self.value_head_dim[i] * num_heads + downsample_left = self.left_context_frames[0] // ds + nonlin_attn_head_dim = 3 * embed_dim // 4 + conv_left_pad = self.cnn_module_kernel[i] // 2 + for layer in range(num_layers): + cached_key = torch.zeros(downsample_left, batch_size, key_dim).to( + device + ) + cached_nonlin_attn = torch.zeros( + 1, batch_size, downsample_left, nonlin_attn_head_dim + ).to(device) + cached_val1 = torch.zeros(downsample_left, batch_size, value_dim).to( + device + ) + cached_val2 = torch.zeros(downsample_left, batch_size, value_dim).to( + device + ) + cached_conv1 = torch.zeros(batch_size, embed_dim, conv_left_pad).to( + device + ) + cached_conv2 = torch.zeros(batch_size, embed_dim, conv_left_pad).to( + device + ) + states += [ + cached_key, + cached_nonlin_attn, + cached_val1, + cached_val2, + cached_conv1, + cached_conv2, + ] + + return states + def _whitening_schedule(x: float, ratio: float = 2.0) -> ScheduledFloat: - return ScheduledFloat((0.0, x), - (20000.0, ratio * x), - default=x) + return ScheduledFloat((0.0, x), (20000.0, ratio * x), default=x) + def _balancer_schedule(min_prob: float): return ScheduledFloat((0.0, 0.4), (8000.0, min_prob)) - class Zipformer2EncoderLayer(nn.Module): """ Args: @@ -431,35 +592,48 @@ class Zipformer2EncoderLayer(nn.Module): >>> pos_emb = torch.rand(32, 19, 512) >>> out = encoder_layer(src, pos_emb) """ + def __init__( - self, - embed_dim: int, - pos_dim: int, - num_heads: int, - query_head_dim: int, - pos_head_dim: int, - value_head_dim: int, - feedforward_dim: int, - dropout: FloatLike = 0.1, - cnn_module_kernel: int = 31, - causal: bool = False, - memory_dim: int = -1, - attention_skip_rate: FloatLike = ScheduledFloat((0.0, 0.2), 
(4000.0, 0.05), (16000, 0.0), default=0), - conv_skip_rate: FloatLike = ScheduledFloat((0.0, 0.2), (4000.0, 0.05), (16000, 0.0), default=0), - const_attention_rate: FloatLike = ScheduledFloat((0.0, 0.25), (4000.0, 0.025), default=0), - ff2_skip_rate: FloatLike = ScheduledFloat((0.0, 0.1), (4000.0, 0.01), (50000.0, 0.0)), - ff3_skip_rate: FloatLike = ScheduledFloat((0.0, 0.1), (4000.0, 0.01), (50000.0, 0.0)), - bypass_skip_rate: FloatLike = ScheduledFloat((0.0, 0.5), (4000.0, 0.02), default=0), + self, + embed_dim: int, + pos_dim: int, + num_heads: int, + query_head_dim: int, + pos_head_dim: int, + value_head_dim: int, + feedforward_dim: int, + dropout: FloatLike = 0.1, + cnn_module_kernel: int = 31, + causal: bool = False, + memory_dim: int = -1, + attention_skip_rate: FloatLike = ScheduledFloat( + (0.0, 0.2), (4000.0, 0.05), (16000, 0.0), default=0 + ), + conv_skip_rate: FloatLike = ScheduledFloat( + (0.0, 0.2), (4000.0, 0.05), (16000, 0.0), default=0 + ), + const_attention_rate: FloatLike = ScheduledFloat( + (0.0, 0.25), (4000.0, 0.025), default=0 + ), + ff2_skip_rate: FloatLike = ScheduledFloat( + (0.0, 0.1), (4000.0, 0.01), (50000.0, 0.0) + ), + ff3_skip_rate: FloatLike = ScheduledFloat( + (0.0, 0.1), (4000.0, 0.01), (50000.0, 0.0) + ), + bypass_skip_rate: FloatLike = ScheduledFloat( + (0.0, 0.5), (4000.0, 0.02), default=0 + ), ) -> None: super(Zipformer2EncoderLayer, self).__init__() self.embed_dim = embed_dim # self.bypass implements layer skipping as well as bypass; see its default values. - self.bypass = BypassModule(embed_dim, skip_rate=bypass_skip_rate, - straight_through_rate=0.025) + self.bypass = BypassModule( + embed_dim, skip_rate=bypass_skip_rate, straight_through_rate=0 + ) # bypass_mid is bypass used in the middle of the layer. - self.bypass_mid = BypassModule(embed_dim, straight_through_rate=0.025) - + self.bypass_mid = BypassModule(embed_dim, straight_through_rate=0) # skip probability for dynamic modules (meaning: anything but feedforward). 
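# Each of these skip rates is a ScheduledFloat, i.e. a piecewise-linear
# function of the global batch count: for instance attention_skip_rate above
# starts at 0.2, decays to 0.05 by batch 4000 and to 0.0 by batch 16000;
# float(rate), as used further down in forward(), evaluates the schedule at
# the current batch index.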
self.attention_skip_rate = copy.deepcopy(attention_skip_rate) @@ -475,76 +649,77 @@ class Zipformer2EncoderLayer(nn.Module): self.const_attention_rate = copy.deepcopy(const_attention_rate) self.self_attn_weights = RelPositionMultiheadAttentionWeights( - embed_dim, pos_dim=pos_dim, num_heads=num_heads, - query_head_dim=query_head_dim, pos_head_dim=pos_head_dim, + embed_dim, + pos_dim=pos_dim, + num_heads=num_heads, + query_head_dim=query_head_dim, + pos_head_dim=pos_head_dim, dropout=0.0, ) + self.self_attn1 = Attention(embed_dim, embed_dim, num_heads, value_head_dim) - self.self_attn1 = Attention(embed_dim, embed_dim, num_heads, - value_head_dim) - - self.self_attn2 = Attention(embed_dim, embed_dim, num_heads, - value_head_dim) + self.self_attn2 = Attention(embed_dim, embed_dim, num_heads, value_head_dim) if memory_dim > 0: self.attn_weights = MultiheadAttentionWeights( - memory_dim, embed_dim, + memory_dim, + embed_dim, num_heads=num_heads, head_dim=query_head_dim, dropout=0.0, ) - self.src_attn1 = Attention(memory_dim, embed_dim, num_heads, - value_head_dim) - self.src_attn2 = Attention(memory_dim, embed_dim, num_heads, - value_head_dim) + self.src_attn1 = Attention(memory_dim, embed_dim, num_heads, value_head_dim) + self.src_attn2 = Attention(memory_dim, embed_dim, num_heads, value_head_dim) self.memory_balancer = Balancer( embed_dim, channel_dim=-1, min_abs=0.015, ) + self.feed_forward1 = FeedforwardModule( + embed_dim, (feedforward_dim * 3) // 4, dropout + ) - self.feed_forward1 = FeedforwardModule(embed_dim, - (feedforward_dim * 3) // 4, - dropout) + self.feed_forward2 = FeedforwardModule(embed_dim, feedforward_dim, dropout) - self.feed_forward2 = FeedforwardModule(embed_dim, - feedforward_dim, - dropout) + self.feed_forward3 = FeedforwardModule( + embed_dim, (feedforward_dim * 5) // 4, dropout + ) - self.feed_forward3 = FeedforwardModule(embed_dim, - (feedforward_dim * 5) // 4, - dropout) + self.nonlin_attention = NonlinAttention( + embed_dim, hidden_channels=3 * embed_dim // 4 + ) - self.nonlin_attention = NonlinAttention(embed_dim, - hidden_channels=3 * embed_dim // 4) + self.conv_module1 = ConvolutionModule( + embed_dim, cnn_module_kernel, causal=causal + ) - self.conv_module1 = ConvolutionModule(embed_dim, - cnn_module_kernel, - causal=causal) + self.conv_module2 = ConvolutionModule( + embed_dim, cnn_module_kernel, causal=causal + ) - self.conv_module2 = ConvolutionModule(embed_dim, - cnn_module_kernel, - causal=causal) - - - #self.attention_squeeze = AttentionSqueeze(embed_dim, embed_dim // 2) + # self.attention_squeeze = AttentionSqueeze(embed_dim, embed_dim // 2) self.norm = BiasNorm(embed_dim) self.bypass_scale = nn.Parameter(torch.full((embed_dim,), 0.5)) self.balancer1 = Balancer( - embed_dim, channel_dim=-1, - min_positive=0.45, max_positive=0.55, - min_abs=0.2, max_abs=4.0, + embed_dim, + channel_dim=-1, + min_positive=0.45, + max_positive=0.55, + min_abs=0.2, + max_abs=4.0, ) # balancer for output of NonlinAttentionModule self.balancer_na = Balancer( - embed_dim, channel_dim=-1, - min_positive=0.3, max_positive=0.7, + embed_dim, + channel_dim=-1, + min_positive=0.3, + max_positive=0.7, min_abs=ScheduledFloat((0.0, 0.004), (4000.0, 0.02)), prob=0.05, # out of concern for memory usage ) @@ -553,32 +728,40 @@ class Zipformer2EncoderLayer(nn.Module): # small. give this a very small probability, even at the start of # training, it's to fix a rare problem and it's OK to fix it slowly. 
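# As used throughout this file, Balancer (from scaling.py) appears to be an
# identity in the forward pass while modifying gradients so that, per
# channel, the fraction of positive activations stays within
# [min_positive, max_positive] and the mean absolute value within
# [min_abs, max_abs]; `prob` is the probability that the constraint is
# applied on a given step.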
self.balancer_ff2 = Balancer( - embed_dim, channel_dim=-1, - min_positive=0.3, max_positive=0.7, + embed_dim, + channel_dim=-1, + min_positive=0.3, + max_positive=0.7, min_abs=ScheduledFloat((0.0, 0.0), (4000.0, 0.1), default=0.0), max_abs=2.0, prob=0.05, ) self.balancer_ff3 = Balancer( - embed_dim, channel_dim=-1, - min_positive=0.3, max_positive=0.7, + embed_dim, + channel_dim=-1, + min_positive=0.3, + max_positive=0.7, min_abs=ScheduledFloat((0.0, 0.0), (4000.0, 0.2), default=0.0), max_abs=4.0, prob=0.05, ) - self.whiten = Whiten(num_groups=1, - whitening_limit=_whitening_schedule(4.0, ratio=3.0), - prob=(0.025, 0.25), - grad_scale=0.01) - - self.balancer2 = Balancer( - embed_dim, channel_dim=-1, - min_positive=0.45, max_positive=0.55, - min_abs=0.1, max_abs=4.0, + self.whiten = Whiten( + num_groups=1, + whitening_limit=_whitening_schedule(4.0, ratio=3.0), + prob=(0.025, 0.25), + grad_scale=0.01, ) + self.balancer2 = Balancer( + embed_dim, + channel_dim=-1, + min_positive=0.45, + max_positive=0.55, + min_abs=0.1, + max_abs=4.0, + ) def get_bypass_scale(self, batch_size: int): # returns bypass-scale of shape (num_channels,), @@ -588,9 +771,11 @@ class Zipformer2EncoderLayer(nn.Module): if torch.jit.is_scripting() or not self.training: return self.bypass_scale else: - ans = limit_param_value(self.bypass_scale, - min=float(self.bypass_min), - max=float(self.bypass_max)) + ans = limit_param_value( + self.bypass_scale, + min=float(self.bypass_min), + max=float(self.bypass_max), + ) layer_skip_rate = float(self.layer_skip_rate) if layer_skip_rate != 0.0: mask = torch.rand((batch_size, 1), device=ans.device) > layer_skip_rate @@ -599,14 +784,15 @@ class Zipformer2EncoderLayer(nn.Module): # on which we have randomly chosen to do layer-skipping. return ans - def get_sequence_dropout_mask(self, x: Tensor, dropout_rate: float) -> Optional[Tensor]: + def get_sequence_dropout_mask( + self, x: Tensor, dropout_rate: float + ) -> Optional[Tensor]: if dropout_rate == 0.0 or not self.training or torch.jit.is_scripting(): return None batch_size = x.shape[1] mask = (torch.rand(batch_size, 1, device=x.device) > dropout_rate).to(x.dtype) return mask - def sequence_dropout(self, x: Tensor, dropout_rate: float) -> Tensor: """ Apply sequence-level dropout to x. @@ -618,7 +804,6 @@ class Zipformer2EncoderLayer(nn.Module): else: return x * dropout_mask - def forward( self, src: Tensor, @@ -630,21 +815,21 @@ class Zipformer2EncoderLayer(nn.Module): memory_key_padding_mask: Optional[Tensor] = None, ) -> Tensor: """ - Pass the input through the encoder layer. - Args: - src: the sequence to the encoder (required): shape (seq_len, batch_size, embedding_dim). - pos_emb: (1, 2*seq_len-1, pos_emb_dim) or (batch_size, 2*seq_len-1, pos_emb_dim) - chunk_size: the number of frames per chunk, of >= 0; if -1, no chunking. - feature_mask: something that broadcasts with src, that we'll multiply `src` - by at every layer: if a Tensor, likely of shape (seq_len, batch_size, embedding_dim) - attn_mask: the attention mask, of shape (batch_size, seq_len, seq_len) or (seq_len, seq_len), - interpreted as (batch_size, tgt_seq_len, src_seq_len) or (tgt_seq_len, src_seq_len). - True means masked position. May be None. - src_key_padding_mask: the mask for padding, of shape (batch_size, seq_len); True means - masked position. May be None. + Pass the input through the encoder layer. + Args: + src: the sequence to the encoder (required): shape (seq_len, batch_size, embedding_dim). 
+ pos_emb: (1, 2*seq_len-1, pos_emb_dim) or (batch_size, 2*seq_len-1, pos_emb_dim) + chunk_size: the number of frames per chunk, if >= 0; if -1, no chunking. + feature_mask: something that broadcasts with src, that we'll multiply `src` + by at every layer: if a Tensor, likely of shape (seq_len, batch_size, embedding_dim) + attn_mask: the attention mask, of shape (batch_size, seq_len, seq_len) or (seq_len, seq_len), + interpreted as (batch_size, tgt_seq_len, src_seq_len) or (tgt_seq_len, src_seq_len). + True means masked position. May be None. + src_key_padding_mask: the mask for padding, of shape (batch_size, seq_len); True means + masked position. May be None. - Returns: - A tensor which has the same shape as src + Returns: + A tensor which has the same shape as src """ src_orig = src @@ -659,7 +844,7 @@ key_padding_mask=src_key_padding_mask, ) - if memory is not None and hasattr(self, 'attn_weights'): + if memory is not None and hasattr(self, "attn_weights"): src_attn_weights = self.attn_weights(memory, src, memory_key_padding_mask) src = src + self.feed_forward1(src) @@ -674,50 +859,66 @@ # averaging-over-time operation. # only need the mask, can just use the 1st one and expand later selected_attn_weights = selected_attn_weights[0:1] - selected_attn_weights = (selected_attn_weights > 0.0).to(selected_attn_weights.dtype) - selected_attn_weights = selected_attn_weights * (1.0 / selected_attn_weights.sum(dim=-1, keepdim=True)) + selected_attn_weights = (selected_attn_weights > 0.0).to( + selected_attn_weights.dtype + ) + selected_attn_weights = selected_attn_weights * ( + 1.0 / selected_attn_weights.sum(dim=-1, keepdim=True) + ) selected_attn_weights = selected_attn_weights.expand(2, -1, -1, -1) - - na = self.balancer_na(self.nonlin_attention(src, - selected_attn_weights[0:1])) + na = self.balancer_na(self.nonlin_attention(src, selected_attn_weights[0:1])) src = src + (na if attn_dropout_mask is None else na * attn_dropout_mask) - self_attn = self.self_attn1( - src, attn_weights) + self_attn = self.self_attn1(src, attn_weights) - src = src + (self_attn if attn_dropout_mask is None else self_attn * attn_dropout_mask) + src = src + ( + self_attn if attn_dropout_mask is None else self_attn * attn_dropout_mask + ) - if memory is not None and hasattr(self, 'attn_weights'): - src = src + self.sequence_dropout(self.memory_balancer(self.src_attn1(memory, src_attn_weights)), - attention_skip_rate) + if memory is not None and hasattr(self, "attn_weights"): + src = src + self.sequence_dropout( + self.memory_balancer(self.src_attn1(memory, src_attn_weights)), + attention_skip_rate, + ) - src = src + self.sequence_dropout(self.conv_module1(src, chunk_size=chunk_size, - src_key_padding_mask=src_key_padding_mask), - float(self.conv_skip_rate)) + src = src + self.sequence_dropout( + self.conv_module1( + src, chunk_size=chunk_size, src_key_padding_mask=src_key_padding_mask + ), + float(self.conv_skip_rate), + ) - src = src + self.sequence_dropout(self.balancer_ff2(self.feed_forward2(src)), - float(self.ff2_skip_rate)) + src = src + self.sequence_dropout( + self.balancer_ff2(self.feed_forward2(src)), float(self.ff2_skip_rate) + ) # bypass in the middle of the layer.
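# BypassModule, defined later in this file, computes
#     src_orig + (src - src_orig) * bypass_scale
# with a learned per-channel bypass_scale: a scale of 0 skips everything
# computed so far in the layer, while a scale of 1 keeps it in full.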
src = self.bypass_mid(src_orig, src) - self_attn = self.self_attn2( - src, attn_weights) + self_attn = self.self_attn2(src, attn_weights) - src = src + (self_attn if attn_dropout_mask is None else self_attn * attn_dropout_mask) + src = src + ( + self_attn if attn_dropout_mask is None else self_attn * attn_dropout_mask + ) - if memory is not None and hasattr(self, 'attn_weights'): - src = src + self.sequence_dropout(self.memory_balancer(self.src_attn2(memory, src_attn_weights)), - attention_skip_rate) + if memory is not None and hasattr(self, "attn_weights"): + src = src + self.sequence_dropout( + self.memory_balancer(self.src_attn2(memory, src_attn_weights)), + attention_skip_rate, + ) - src = src + self.sequence_dropout(self.conv_module2(src, chunk_size=chunk_size, - src_key_padding_mask=src_key_padding_mask), - float(self.conv_skip_rate)) + src = src + self.sequence_dropout( + self.conv_module2( + src, chunk_size=chunk_size, src_key_padding_mask=src_key_padding_mask + ), + float(self.conv_skip_rate), + ) - src = src + self.sequence_dropout(self.balancer_ff3(self.feed_forward3(src)), - float(self.ff3_skip_rate)) + src = src + self.sequence_dropout( + self.balancer_ff3(self.feed_forward3(src)), float(self.ff3_skip_rate) + ) src = self.balancer1(src) src = self.norm(src) @@ -729,6 +930,7 @@ class Zipformer2EncoderLayer(nn.Module): return src + class Zipformer2Encoder(nn.Module): r"""Zipformer2Encoder is a stack of N encoder layers @@ -743,20 +945,22 @@ class Zipformer2Encoder(nn.Module): >>> src = torch.rand(10, 32, 512) >>> out = zipformer_encoder(src) """ + def __init__( - self, - encoder_layer: nn.Module, - num_layers: int, - pos_dim: int, - dropout: float, - warmup_begin: float, - warmup_end: float, - initial_layerdrop_rate: float = 0.5, - final_layerdrop_rate: float = 0.05, + self, + encoder_layer: nn.Module, + num_layers: int, + pos_dim: int, + dropout: float, + warmup_begin: float, + warmup_end: float, + initial_layerdrop_rate: float = 0.5, + final_layerdrop_rate: float = 0.05, ) -> None: super().__init__() - self.encoder_pos = CompactRelPositionalEncoding(pos_dim, dropout_rate=0.15, - length_factor=1.0) + self.encoder_pos = CompactRelPositionalEncoding( + pos_dim, dropout_rate=0.15, length_factor=1.0 + ) self.layers = nn.ModuleList( [copy.deepcopy(encoder_layer) for i in range(num_layers)] @@ -765,13 +969,15 @@ class Zipformer2Encoder(nn.Module): assert 0 <= warmup_begin <= warmup_end - delta = (1. / num_layers) * (warmup_end - warmup_begin) + delta = (1.0 / num_layers) * (warmup_end - warmup_begin) cur_begin = warmup_begin # interpreted as a training batch index for i in range(num_layers): cur_end = cur_begin + delta - self.layers[i].bypass.skip_rate = ScheduledFloat((cur_begin, initial_layerdrop_rate), - (cur_end, final_layerdrop_rate), - default=0.0) + self.layers[i].bypass.skip_rate = ScheduledFloat( + (cur_begin, initial_layerdrop_rate), + (cur_end, final_layerdrop_rate), + default=0.0, + ) cur_begin = cur_end def forward( @@ -806,8 +1012,6 @@ class Zipformer2Encoder(nn.Module): pos_emb = self.encoder_pos(src) output = src - rnd_seed = src.numel() + random.randint(0, 1000) - output = output * feature_mask for i, mod in enumerate(self.layers): @@ -833,13 +1037,15 @@ class BypassModule(nn.Module): "straight-through", i.e. to not do the bypass operation much initially, in order to force all the modules to learn something. 
""" + def __init__( - self, - embed_dim: int, - skip_rate: FloatLike = 0.0, - straight_through_rate: FloatLike = 0.0, - scale_min: FloatLike = ScheduledFloat((0.0, 0.9), (20000.0, 0.2), default=0), - scale_max: FloatLike = 1.0): + self, + embed_dim: int, + skip_rate: FloatLike = 0.0, + straight_through_rate: FloatLike = 0.0, + scale_min: FloatLike = ScheduledFloat((0.0, 0.9), (20000.0, 0.2), default=0), + scale_max: FloatLike = 1.0, + ): super().__init__() self.bypass_scale = nn.Parameter(torch.full((embed_dim,), 0.5)) self.skip_rate = copy.deepcopy(skip_rate) @@ -847,7 +1053,6 @@ class BypassModule(nn.Module): self.scale_min = copy.deepcopy(scale_min) self.scale_max = copy.deepcopy(scale_max) - def _get_bypass_scale(self, batch_size: int): # returns bypass-scale of shape (num_channels,), # or (batch_size, num_channels,). This is actually the @@ -856,9 +1061,9 @@ class BypassModule(nn.Module): if torch.jit.is_scripting() or not self.training: return self.bypass_scale else: - ans = limit_param_value(self.bypass_scale, - min=float(self.scale_min), - max=float(self.scale_max)) + ans = limit_param_value( + self.bypass_scale, min=float(self.scale_min), max=float(self.scale_max) + ) skip_rate = float(self.skip_rate) if skip_rate != 0.0: mask = torch.rand((batch_size, 1), device=ans.device) > skip_rate @@ -867,22 +1072,21 @@ class BypassModule(nn.Module): # on which we have randomly chosen to do layer-skipping. straight_through_rate = float(self.straight_through_rate) if straight_through_rate != 0.0: - mask = torch.rand((batch_size, 1), device=ans.device) < straight_through_rate + mask = ( + torch.rand((batch_size, 1), device=ans.device) + < straight_through_rate + ) ans = torch.maximum(ans, mask.to(ans.dtype)) return ans - def forward(self, - src_orig: Tensor, - src: Tensor): + def forward(self, src_orig: Tensor, src: Tensor): """ Args: src_orig and src are both of shape (seq_len, batch_size, num_channels) Returns: something with the same shape as src and src_orig """ bypass_scale = self._get_bypass_scale(src.shape[1]) - return src_orig + (src - src_orig) * bypass_scale - - + return src_orig + (src - src_orig) * bypass_scale class DownsampledZipformer2Encoder(nn.Module): @@ -891,28 +1095,26 @@ class DownsampledZipformer2Encoder(nn.Module): after convolutional downsampling, and then upsampled again at the output, and combined with the origin input, so that the output has the same shape as the input. 
""" - def __init__(self, - encoder: nn.Module, - dim: int, - downsample: int, - dropout: FloatLike): + + def __init__( + self, encoder: nn.Module, dim: int, downsample: int, dropout: FloatLike + ): super(DownsampledZipformer2Encoder, self).__init__() self.downsample_factor = downsample - self.downsample = SimpleDownsample(dim, - downsample, dropout) + self.downsample = SimpleDownsample(dim, downsample, dropout) self.encoder = encoder self.upsample = SimpleUpsample(dim, downsample) self.out_combiner = BypassModule(dim, straight_through_rate=0.025) - - def forward(self, - src: Tensor, - chunk_size: int = -1, - feature_mask: Union[Tensor, float] = 1.0, - attn_mask: Optional[Tensor] = None, - src_key_padding_mask: Optional[Tensor] = None, - memory: Optional[Tensor] = None, - memory_key_padding_mask: Optional[Tensor] = None, + def forward( + self, + src: Tensor, + chunk_size: int = -1, + feature_mask: Union[Tensor, float] = 1.0, + attn_mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + memory: Optional[Tensor] = None, + memory_key_padding_mask: Optional[Tensor] = None, ) -> Tuple[Tensor, Tensor]: r"""Downsample, go through encoder, upsample. @@ -936,7 +1138,7 @@ class DownsampledZipformer2Encoder(nn.Module): src = self.downsample(src) ds = self.downsample_factor if attn_mask is not None: - attn_mask = attn_mask[::ds,::ds] + attn_mask = attn_mask[::ds, ::ds] src = self.encoder( src, @@ -949,31 +1151,27 @@ class DownsampledZipformer2Encoder(nn.Module): ) src = self.upsample(src) # remove any extra frames that are not a multiple of downsample_factor - src = src[:src_orig.shape[0]] + src = src[: src_orig.shape[0]] return self.out_combiner(src_orig, src) - class SimpleDownsample(torch.nn.Module): """ Does downsampling with attention, by weighted sum, and a projection.. """ - def __init__(self, - channels: int, - downsample: int, - dropout: FloatLike): + + def __init__(self, channels: int, downsample: int, dropout: FloatLike): super(SimpleDownsample, self).__init__() self.bias = nn.Parameter(torch.zeros(downsample)) - self.name = None # will be set from training code + self.name = None # will be set from training code self.dropout = copy.deepcopy(dropout) self.downsample = downsample - def forward(self, - src: Tensor) -> Tensor: + def forward(self, src: Tensor) -> Tensor: """ x: (seq_len, batch_size, in_channels) Returns a tensor of shape @@ -987,7 +1185,7 @@ class SimpleDownsample(torch.nn.Module): if seq_len != d_seq_len * ds: # right-pad src, repeating the last element. pad = d_seq_len * ds - seq_len - src_extra = src[src.shape[0]-1:].expand(pad, src.shape[1], src.shape[2]) + src_extra = src[src.shape[0] - 1 :].expand(pad, src.shape[1], src.shape[2]) src = torch.cat((src, src_extra), dim=0) assert src.shape[0] == d_seq_len * ds @@ -1008,14 +1206,12 @@ class SimpleUpsample(torch.nn.Module): A very simple form of upsampling that mostly just repeats the input, but also adds a position-specific bias. """ - def __init__(self, - num_channels: int, - upsample: int): + + def __init__(self, num_channels: int, upsample: int): super(SimpleUpsample, self).__init__() self.upsample = upsample - def forward(self, - src: Tensor) -> Tensor: + def forward(self, src: Tensor) -> Tensor: """ x: (seq_len, batch_size, num_channels) Returns a tensor of shape @@ -1053,11 +1249,13 @@ class CompactRelPositionalEncoding(torch.nn.Module): length_factor: a heuristic scale (should be >= 1.0) which, if larger, gives less weight to small differences of offset near the origin. 
""" + def __init__( - self, embed_dim: int, - dropout_rate: FloatLike, - max_len: int = 1000, - length_factor: float = 1.0, + self, + embed_dim: int, + dropout_rate: FloatLike, + max_len: int = 1000, + length_factor: float = 1.0, ) -> None: """Construct a CompactRelPositionalEncoding object.""" super(CompactRelPositionalEncoding, self).__init__() @@ -1069,8 +1267,6 @@ class CompactRelPositionalEncoding(torch.nn.Module): self.length_factor = length_factor self.extend_pe(torch.tensor(0.0).expand(max_len)) - - def extend_pe(self, x: Tensor) -> None: """Reset the positional encodings.""" if self.pe is not None: @@ -1078,27 +1274,28 @@ class CompactRelPositionalEncoding(torch.nn.Module): # the length of self.pe is 2 * input_len - 1 if self.pe.size(0) >= x.size(0) * 2 - 1: # Note: TorchScript doesn't implement operator== for torch.Device - if self.pe.dtype != x.dtype or str(self.pe.device) != str( - x.device - ): + if self.pe.dtype != x.dtype or str(self.pe.device) != str(x.device): self.pe = self.pe.to(dtype=x.dtype, device=x.device) return T = x.size(0) # if T == 4, x would contain [ -3, -2, 1, 0, 1, 2, 3 ] - x = torch.arange(-(T-1), T, - device=x.device).to(torch.float32).unsqueeze(1) + x = torch.arange(-(T - 1), T, device=x.device).to(torch.float32).unsqueeze(1) freqs = 1 + torch.arange(self.embed_dim // 2, device=x.device) # `compression_length` this is arbitrary/heuristic, if it is larger we have more resolution # for small time offsets but less resolution for large time offsets. - compression_length = (self.embed_dim ** 0.5) + compression_length = self.embed_dim**0.5 # x_compressed, like X, goes from -infinity to infinity as T goes from -infinity to infinity; # but it does so more slowly than T for large absolute values of T. # The formula is chosen so that d(x_compressed )/dx is 1 around x == 0, which # is important. - x_compressed = compression_length * x.sign() * ((x.abs() + compression_length).log() - math.log(compression_length)) + x_compressed = ( + compression_length + * x.sign() + * ((x.abs() + compression_length).log() - math.log(compression_length)) + ) # if self.length_factor == 1.0, then length_scale is chosen so that the # FFT can exactly separate points close to the origin (T == 0). So this @@ -1109,7 +1306,7 @@ class CompactRelPositionalEncoding(torch.nn.Module): # note for machine implementations: if atan is not available, we can use: # x.sign() * ((1 / (x.abs() + 1)) - 1) * (-math.pi/2) # check on wolframalpha.com: plot(sign(x) * (1 / ( abs(x) + 1) - 1 ) * -pi/2 , atan(x)) - x_atan = (x_compressed / length_scale).atan() # results between -pi and pi + x_atan = (x_compressed / length_scale).atan() # results between -pi and pi cosines = (x_atan * freqs).cos() sines = (x_atan * freqs).sin() @@ -1121,7 +1318,6 @@ class CompactRelPositionalEncoding(torch.nn.Module): self.pe = pe.to(dtype=x.dtype) - def forward(self, x: torch.Tensor) -> Tensor: """Create positional encoding. @@ -1138,13 +1334,12 @@ class CompactRelPositionalEncoding(torch.nn.Module): - x.size(0) + 1 : self.pe.size(0) // 2 # noqa E203 + x.size(0), - : + :, ] pos_emb = pos_emb.unsqueeze(0) return self.dropout(pos_emb) - class RelPositionMultiheadAttentionWeights(nn.Module): r"""Module that computes multi-head attention weights with relative position encoding. 
Various other modules consume the resulting attention weights: see, for example, the @@ -1166,15 +1361,14 @@ """ def __init__( - self, - embed_dim: int, - pos_dim: int, - num_heads: int, - query_head_dim: int, - pos_head_dim: int, - dropout: float = 0.0, - pos_emb_skip_rate: FloatLike = ScheduledFloat((0.0, 0.5), - (4000.0, 0.0)) + self, + embed_dim: int, + pos_dim: int, + num_heads: int, + query_head_dim: int, + pos_head_dim: int, + dropout: float = 0.0, + pos_emb_skip_rate: FloatLike = ScheduledFloat((0.0, 0.5), (4000.0, 0.0)), ) -> None: super().__init__() self.embed_dim = embed_dim @@ -1193,13 +1387,16 @@ # dividing it between the query and key. Note: this module is intended # to be used with the ScaledAdam optimizer; with most other optimizers, # it would be necessary to apply the scaling factor in the forward function. - self.in_proj = ScaledLinear(embed_dim, in_proj_dim, bias=True, - initial_scale=query_head_dim**-0.25) + self.in_proj = ScaledLinear( + embed_dim, in_proj_dim, bias=True, initial_scale=query_head_dim**-0.25 + ) - self.whiten_keys = Whiten(num_groups=num_heads, - whitening_limit=_whitening_schedule(3.0), - prob=(0.025, 0.25), - grad_scale=0.025) + self.whiten_keys = Whiten( + num_groups=num_heads, + whitening_limit=_whitening_schedule(3.0), + prob=(0.025, 0.25), + grad_scale=0.025, + ) # add a balancer for the keys that runs with very small probability, and # tries to enforce that all dimensions have mean around zero. The @@ -1209,27 +1406,25 @@ # bias because the small numerical roundoff tends to have a non-random # sign. This module is intended to prevent that. Use a very small # probability; that should be sufficient to fix the problem. - self.balance_keys = Balancer(key_head_dim * num_heads, - channel_dim=-1, - min_positive=0.4, - max_positive=0.6, - min_abs=0.0, - max_abs=100.0, - prob=0.025) - + self.balance_keys = Balancer( + key_head_dim * num_heads, + channel_dim=-1, + min_positive=0.4, + max_positive=0.6, + min_abs=0.0, + max_abs=100.0, + prob=0.025, + ) # linear transformation for positional encoding. - self.linear_pos = ScaledLinear(pos_dim, - num_heads * pos_head_dim, - bias=False, - initial_scale=0.05) - + self.linear_pos = ScaledLinear( + pos_dim, num_heads * pos_head_dim, bias=False, initial_scale=0.05 + ) # the following are for diagnostics only, see --print-diagnostics option self.copy_pos_query = Identity() self.copy_query = Identity() - def forward( self, x: Tensor, @@ -1261,18 +1456,16 @@ query_dim = query_head_dim * num_heads - q = x[...,0:query_dim] - k = x[...,query_dim:2*query_dim] + q = x[..., 0:query_dim] + k = x[..., query_dim : 2 * query_dim] # p is the position-encoding query - p = x[...,2*query_dim:] + p = x[..., 2 * query_dim :] assert p.shape[-1] == num_heads * pos_head_dim - q = self.copy_query(q) # for diagnostics only, does nothing. k = self.whiten_keys(self.balance_keys(k)) # does nothing in the forward pass. p = self.copy_pos_query(p) # for diagnostics only, does nothing.
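# The reshapes below split q, k and p into per-head tensors of shape
# (seq_len, batch_size, num_heads, head_dim); after permuting, q and k are
# presumably combined into content scores of shape
# (num_heads, batch_size, seq_len, seq_len) -- the shape asserted further
# down -- to which the position scores derived from p and pos_emb are added.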

-
     def forward(
         self,
         x: Tensor,
@@ -1261,18 +1456,16 @@ class RelPositionMultiheadAttentionWeights(nn.Module):

         query_dim = query_head_dim * num_heads

-        q = x[...,0:query_dim]
-        k = x[...,query_dim:2*query_dim]
+        q = x[..., 0:query_dim]
+        k = x[..., query_dim : 2 * query_dim]
         # p is the position-encoding query
-        p = x[...,2*query_dim:]
+        p = x[..., 2 * query_dim :]
         assert p.shape[-1] == num_heads * pos_head_dim

-
         q = self.copy_query(q)  # for diagnostics only, does nothing.
         k = self.whiten_keys(self.balance_keys(k))  # does nothing in the forward pass.
         p = self.copy_pos_query(p)  # for diagnostics only, does nothing.

         q = q.reshape(seq_len, batch_size, num_heads, query_head_dim)
         p = p.reshape(seq_len, batch_size, num_heads, pos_head_dim)
         k = k.reshape(seq_len, batch_size, num_heads, query_head_dim)
@@ -1287,7 +1480,9 @@ class RelPositionMultiheadAttentionWeights(nn.Module):
         if not self.training or random.random() >= float(self.pos_emb_skip_rate):
             pos_emb = self.linear_pos(pos_emb)
             seq_len2 = 2 * seq_len - 1
-            pos_emb = pos_emb.reshape(-1, seq_len2, num_heads, pos_head_dim).permute(2, 0, 3, 1)
+            pos_emb = pos_emb.reshape(-1, seq_len2, num_heads, pos_head_dim).permute(
+                2, 0, 3, 1
+            )
             # pos shape now: (head, {1 or batch_size}, pos_dim, seq_len2)

             # (head, batch, time1, pos_dim) x (head, 1, pos_dim, seq_len2) -> (head, batch, time1, seq_len2)
@@ -1296,12 +1491,16 @@ class RelPositionMultiheadAttentionWeights(nn.Module):
             # the following .as_strided() expression converts the last axis of pos_scores from relative
             # to absolute position.  I don't know whether I might have got the time-offsets backwards or
             # not, but let this code define which way round it is supposed to be.
-            pos_scores = pos_scores.as_strided((num_heads, batch_size, seq_len, seq_len),
-                                               (pos_scores.stride(0),
-                                                pos_scores.stride(1),
-                                                pos_scores.stride(2)-pos_scores.stride(3),
-                                                pos_scores.stride(3)),
-                                               storage_offset=pos_scores.stride(3) * (seq_len - 1))
+            pos_scores = pos_scores.as_strided(
+                (num_heads, batch_size, seq_len, seq_len),
+                (
+                    pos_scores.stride(0),
+                    pos_scores.stride(1),
+                    pos_scores.stride(2) - pos_scores.stride(3),
+                    pos_scores.stride(3),
+                ),
+                storage_offset=pos_scores.stride(3) * (seq_len - 1),
+            )

             attn_scores = attn_scores + pos_scores
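The `.as_strided()` call is the classic relative-to-absolute shift, and it is easy to get backwards. A self-contained check (hypothetical sizes, one head and one batch element) makes its behavior concrete: entry (t, u) of the output reads the score at relative offset u - t:

```python
import torch

# The last axis of pos_scores indexes relative offset (u - t) + (T - 1),
# so row t of the shifted result should read offsets -t .. T-1-t.
T = 4
pos_scores = torch.arange(2 * T - 1).float().repeat(1, 1, T, 1)  # (1, 1, T, 2T-1)
out = pos_scores.as_strided(
    (1, 1, T, T),
    (
        pos_scores.stride(0),
        pos_scores.stride(1),
        pos_scores.stride(2) - pos_scores.stride(3),
        pos_scores.stride(3),
    ),
    storage_offset=pos_scores.stride(3) * (T - 1),
)
print(out[0, 0])
# tensor([[3., 4., 5., 6.],
#         [2., 3., 4., 5.],
#         [1., 2., 3., 4.],
#         [0., 1., 2., 3.]])   i.e. out[t, u] == (T - 1) + (u - t)
```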
@@ -1318,10 +1517,9 @@ class RelPositionMultiheadAttentionWeights(nn.Module):
         # but we view this as a failsafe to avoid "implausible" parameter
         # values rather than a regularization method that should be active
         # under normal circumstances.
-        attn_scores = penalize_abs_values_gt(attn_scores,
-                                             limit=25.0,
-                                             penalty=1.0e-04,
-                                             name=self.name)
+        attn_scores = penalize_abs_values_gt(
+            attn_scores, limit=25.0, penalty=1.0e-04, name=self.name
+        )

         assert attn_scores.shape == (num_heads, batch_size, seq_len, seq_len)

@@ -1334,7 +1532,10 @@ class RelPositionMultiheadAttentionWeights(nn.Module):
             attn_scores = attn_scores.masked_fill(attn_mask, -1000)

         if key_padding_mask is not None:
-            assert key_padding_mask.shape == (batch_size, seq_len), key_padding_mask.shape
+            assert key_padding_mask.shape == (
+                batch_size,
+                seq_len,
+            ), key_padding_mask.shape
             attn_scores = attn_scores.masked_fill(
                 key_padding_mask.unsqueeze(1),
                 -1000,
@@ -1355,19 +1556,21 @@ class RelPositionMultiheadAttentionWeights(nn.Module):

         return attn_weights

-
-    def _print_attn_entropy(
-            self,
-            attn_weights: Tensor):
+    def _print_attn_entropy(self, attn_weights: Tensor):
         # attn_weights: (num_heads, batch_size, seq_len, seq_len)
         (num_heads, batch_size, seq_len, seq_len) = attn_weights.shape

         with torch.no_grad():
             with torch.cuda.amp.autocast(enabled=False):
                 attn_weights = attn_weights.to(torch.float32)
-                attn_weights_entropy = -((attn_weights + 1.0e-20).log() * attn_weights).sum(
-                    dim=-1).mean(dim=(1,2))
-                logging.info(f"name={self.name}, attn_weights_entropy = {attn_weights_entropy}")
+                attn_weights_entropy = (
+                    -((attn_weights + 1.0e-20).log() * attn_weights)
+                    .sum(dim=-1)
+                    .mean(dim=(1, 2))
+                )
+                logging.info(
+                    f"name={self.name}, attn_weights_entropy = {attn_weights_entropy}"
+                )


class Attention(nn.Module):
@@ -1381,27 +1584,28 @@ class Attention(nn.Module):
         num_heads: the number of attention heads
         value_head_dim: the value dimension per head
     """
+
     def __init__(
-            self,
-            embed_dim_in: int,
-            embed_dim_out: int,
-            num_heads: int,
-            value_head_dim: int,
+        self,
+        embed_dim_in: int,
+        embed_dim_out: int,
+        num_heads: int,
+        value_head_dim: int,
     ) -> None:
         super().__init__()
-        self.in_proj = nn.Linear(embed_dim_in,
-                                 num_heads * value_head_dim,
-                                 bias=False)
+        self.in_proj = nn.Linear(embed_dim_in, num_heads * value_head_dim, bias=False)

-        self.out_proj = ScaledLinear(num_heads * value_head_dim,
-                                     embed_dim_out, bias=False,
-                                     initial_scale=0.05)
-
-        self.whiten = Whiten(num_groups=1,
-                             whitening_limit=_whitening_schedule(7.5, ratio=3.0),
-                             prob=(0.025, 0.25),
-                             grad_scale=0.01)
+        # Note: we set bias to False so that an input of 0 will have no effect.
+        self.out_proj = ScaledLinear(
+            num_heads * value_head_dim, embed_dim_out, bias=False, initial_scale=0.05
+        )
+
+        self.whiten = Whiten(
+            num_groups=1,
+            whitening_limit=_whitening_schedule(7.5, ratio=3.0),
+            prob=(0.025, 0.25),
+            grad_scale=0.01,
+        )

     def forward(
         self,
@@ -1412,14 +1616,14 @@ class Attention(nn.Module):
         Args:
             x: input tensor, of shape (seq_len, batch_size, embed_dim)
             attn_weights: a tensor of shape (num_heads, batch_size, query_len, key_len),
-              Expect attn_weights.sum(dim=-1) == 1. The input here is the value in the
+              Expect attn_weights.sum(dim=-1) == 1.  The input here is the value in the
               original attention mechanism.
         Returns:
            a tensor with the same shape as x.
""" (num_heads, batch_size, query_len, key_len) = attn_weights.shape - x = self.in_proj(x) # (key_len, batch_size, num_heads * value_head_dim) + x = self.in_proj(x) # (key_len, batch_size, num_heads * value_head_dim) x = x.reshape(key_len, batch_size, num_heads, -1).permute(2, 1, 0, 3) # now x: (num_heads, batch_size, key_len, value_head_dim) value_head_dim = x.shape[-1] @@ -1428,8 +1632,11 @@ class Attention(nn.Module): x = torch.matmul(attn_weights, x) # v: (num_heads, batch_size, query_len, value_head_dim) - x = x.permute(2, 1, 0, 3).contiguous().view( - query_len, batch_size, num_heads * value_head_dim) + x = ( + x.permute(2, 1, 0, 3) + .contiguous() + .view(query_len, batch_size, num_heads * value_head_dim) + ) # returned value is of shape (query_len, batch_size, embed_dim), like the input. x = self.out_proj(x) @@ -1437,6 +1644,62 @@ class Attention(nn.Module): return x + def streaming_forward( + self, + x: Tensor, + attn_weights: Tensor, + cached_val: Tensor, + left_context_len: int, + ) -> Tuple[Tensor, Tensor]: + """ + Args: + x: input tensor, of shape (seq_len, batch_size, embed_dim) + attn_weights: a tensor of shape (num_heads, batch_size, seq_len, seq_len), + with seq_len being interpreted as (tgt_seq_len, src_seq_len). Expect + attn_weights.sum(dim=-1) == 1. + cached_val: cached attention value tensor of left context, + of shape (left_context_len, batch_size, value_dim) + left_context_len: number of left context frames. + + Returns: + - attention weighted output, a tensor with the same shape as x. + - updated cached attention value tensor of left context. + """ + (seq_len, batch_size, embed_dim) = x.shape + num_heads = attn_weights.shape[0] + seq_len2 = seq_len + left_context_len + assert attn_weights.shape == (num_heads, batch_size, seq_len, seq_len2) + + x = self.in_proj(x) # (seq_len, batch_size, num_heads * value_head_dim) + + # Pad cached left contexts + assert cached_val.shape[0] == left_context_len, ( + cached_val.shape[0], + left_context_len, + ) + x = torch.cat([cached_val, x], dim=0) + # Update cached left contexts + cached_val = x[-left_context_len:, ...] + + x = x.reshape(seq_len2, batch_size, num_heads, -1).permute(2, 1, 0, 3) + # now x: (num_heads, batch_size, seq_len, value_head_dim) + value_head_dim = x.shape[-1] + + # todo: see whether there is benefit in overriding matmul + x = torch.matmul(attn_weights, x) + # v: (num_heads, batch_size, seq_len, value_head_dim) + + x = ( + x.permute(2, 1, 0, 3) + .contiguous() + .view(seq_len, batch_size, num_heads * value_head_dim) + ) + + # returned value is of shape (seq_len, batch_size, embed_dim), like the input. + x = self.out_proj(x) + + return x, cached_val + class MultiheadAttentionWeights(nn.Module): r"""Module that computes multi-head cross-attention weights. Allows src and target @@ -1453,13 +1716,12 @@ class MultiheadAttentionWeights(nn.Module): """ def __init__( - self, - key_embed_dim: int, - query_embed_dim: int, - num_heads: int, - head_dim: int, - dropout: float = 0.0, - + self, + key_embed_dim: int, + query_embed_dim: int, + num_heads: int, + head_dim: int, + dropout: float = 0.0, ) -> None: super().__init__() self.key_embed_dim = key_embed_dim @@ -1469,30 +1731,33 @@ class MultiheadAttentionWeights(nn.Module): self.dropout = dropout self.name = None # will be overwritten in training code; for diagnostics. - # the initial_scale is supposed to take over the "scaling" factor of # head_dim ** -0.5 that has been used in previous forms of attention, # dividing it between the query and key. 


class MultiheadAttentionWeights(nn.Module):
    r"""Module that computes multi-head cross-attention weights.  Allows src and target
@@ -1453,13 +1716,12 @@ class MultiheadAttentionWeights(nn.Module):
     """

     def __init__(
-            self,
-            key_embed_dim: int,
-            query_embed_dim: int,
-            num_heads: int,
-            head_dim: int,
-            dropout: float = 0.0,
-
+        self,
+        key_embed_dim: int,
+        query_embed_dim: int,
+        num_heads: int,
+        head_dim: int,
+        dropout: float = 0.0,
     ) -> None:
         super().__init__()
         self.key_embed_dim = key_embed_dim
@@ -1469,30 +1731,33 @@ class MultiheadAttentionWeights(nn.Module):
         self.dropout = dropout
         self.name = None  # will be overwritten in training code; for diagnostics.

-
         # the initial_scale is supposed to take over the "scaling" factor of
         # head_dim ** -0.5 that has been used in previous forms of attention,
         # dividing it between the query and key.  Note: this module is intended
         # to be used with the ScaledAdam optimizer; with most other optimizers,
         # it would be necessary to apply the scaling factor in the forward function.
-        self.query_in_proj = ScaledLinear(query_embed_dim,
-                                          head_dim * num_heads,
-                                          bias=True,
-                                          initial_scale=head_dim ** -0.25)
+        self.query_in_proj = ScaledLinear(
+            query_embed_dim,
+            head_dim * num_heads,
+            bias=True,
+            initial_scale=head_dim**-0.25,
+        )

         # weights produced by this module are invariant to adding a constant to
         # the keys, so we don't need a bias for the keys.
-        self.key_in_proj = ScaledLinear(key_embed_dim,
-                                        head_dim * num_heads,
-                                        bias=False,
-                                        initial_scale=head_dim ** -0.25)
-
-        self.whiten_keys = Whiten(num_groups=num_heads,
-                                  whitening_limit=_whitening_schedule(3.0),
-                                  prob=(0.025, 0.25),
-                                  grad_scale=0.025)
-
+        self.key_in_proj = ScaledLinear(
+            key_embed_dim,
+            head_dim * num_heads,
+            bias=False,
+            initial_scale=head_dim**-0.25,
+        )
+
+        self.whiten_keys = Whiten(
+            num_groups=num_heads,
+            whitening_limit=_whitening_schedule(3.0),
+            prob=(0.025, 0.25),
+            grad_scale=0.025,
+        )

     def forward(
         self,
@@ -1519,7 +1784,7 @@ class MultiheadAttentionWeights(nn.Module):
         key_len, _batch_size, _ = k.shape
         assert _batch_size == batch_size

-        k = self.whiten_keys(k) # does nothing in the forward pass.
+        k = self.whiten_keys(k)  # does nothing in the forward pass.

         q = q.reshape(query_len, batch_size, num_heads, head_dim)
         k = k.reshape(key_len, batch_size, num_heads, head_dim)
@@ -1543,15 +1808,17 @@ class MultiheadAttentionWeights(nn.Module):
         # but we view this as a failsafe to avoid "implausible" parameter
         # values rather than a regularization method that should be active
         # under normal circumstances.
-        attn_scores = penalize_abs_values_gt(attn_scores,
-                                             limit=25.0,
-                                             penalty=1.0e-04,
-                                             name=self.name)
+        attn_scores = penalize_abs_values_gt(
+            attn_scores, limit=25.0, penalty=1.0e-04, name=self.name
+        )

         assert attn_scores.shape == (num_heads, batch_size, query_len, key_len)

         if key_padding_mask is not None:
-            assert key_padding_mask.shape == (batch_size, key_len), key_padding_mask.shape
+            assert key_padding_mask.shape == (
+                batch_size,
+                key_len,
+            ), key_padding_mask.shape
             attn_scores = attn_scores.masked_fill(
                 key_padding_mask.unsqueeze(1),
                 -1000,
@@ -1572,53 +1839,58 @@ class MultiheadAttentionWeights(nn.Module):

         return attn_weights

-
-    def _print_attn_entropy(
-            self,
-            attn_weights: Tensor):
+    def _print_attn_entropy(self, attn_weights: Tensor):
         # attn_weights: (num_heads, batch_size, seq_len, seq_len)
         (num_heads, batch_size, seq_len, seq_len) = attn_weights.shape

         with torch.no_grad():
             with torch.cuda.amp.autocast(enabled=False):
                 attn_weights = attn_weights.to(torch.float32)
-                attn_weights_entropy = -((attn_weights + 1.0e-20).log() * attn_weights).sum(
-                    dim=-1).mean(dim=(1,2))
-                logging.info(f"name={self.name}, attn_weights_entropy = {attn_weights_entropy}")
-
+                attn_weights_entropy = (
+                    -((attn_weights + 1.0e-20).log() * attn_weights)
+                    .sum(dim=-1)
+                    .mean(dim=(1, 2))
+                )
+                logging.info(
+                    f"name={self.name}, attn_weights_entropy = {attn_weights_entropy}"
+                )
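`_print_attn_entropy` appears in both weight modules; what it logs is the per-head entropy of the attention distribution, a quick check for head collapse: uniform weights give log(key_len), fully peaked weights give roughly zero. A standalone version of the same arithmetic, with made-up sizes:

```python
import torch

def attn_entropy(attn_weights: torch.Tensor) -> torch.Tensor:
    # attn_weights: (num_heads, batch_size, seq_len, seq_len), rows sum to 1
    return -((attn_weights + 1.0e-20).log() * attn_weights).sum(dim=-1).mean(dim=(1, 2))

num_heads, batch_size, seq_len = 4, 2, 10
uniform = torch.full((num_heads, batch_size, seq_len, seq_len), 1.0 / seq_len)
peaked = torch.zeros(num_heads, batch_size, seq_len, seq_len)
peaked[..., 0] = 1.0  # every query attends to a single key

print(attn_entropy(uniform))  # ~2.303 == log(10) for every head
print(attn_entropy(peaked))   # ~0.0: a sign that heads may have collapsed
```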
- """ - def __init__(self, - embed_dim: int, - feedforward_dim: int, - dropout: FloatLike): + """Feedforward module in Zipformer2 model.""" + + def __init__(self, embed_dim: int, feedforward_dim: int, dropout: FloatLike): super(FeedforwardModule, self).__init__() self.in_proj = nn.Linear(embed_dim, feedforward_dim) - self.hidden_balancer = Balancer(feedforward_dim, - channel_dim=-1, - min_positive=0.3, - max_positive=1.0, - min_abs=0.75, - max_abs=5.0) + self.hidden_balancer = Balancer( + feedforward_dim, + channel_dim=-1, + min_positive=0.3, + max_positive=1.0, + min_abs=0.75, + max_abs=5.0, + ) # shared_dim=0 means we share the dropout mask along the time axis - self.out_proj = ActivationDropoutAndLinear(feedforward_dim, embed_dim, - activation='SwooshL', - dropout_p=dropout, - dropout_shared_dim=0, bias=True, - initial_scale=0.1) + self.out_proj = ActivationDropoutAndLinear( + feedforward_dim, + embed_dim, + activation="SwooshL", + dropout_p=dropout, + dropout_shared_dim=0, + bias=True, + initial_scale=0.1, + ) - self.out_whiten = Whiten(num_groups=1, - whitening_limit=_whitening_schedule(7.5), - prob=(0.025, 0.25), - grad_scale=0.01) + self.out_whiten = Whiten( + num_groups=1, + whitening_limit=_whitening_schedule(7.5), + prob=(0.025, 0.25), + grad_scale=0.01, + ) - def forward(self, - x: Tensor): + def forward(self, x: Tensor): x = self.in_proj(x) x = self.hidden_balancer(x) # out_proj contains SwooshL activation, then dropout, then linear. @@ -1637,9 +1909,9 @@ class NonlinAttention(nn.Module): """ def __init__( - self, - channels: int, - hidden_channels: int, + self, + channels: int, + hidden_channels: int, ) -> None: super().__init__() @@ -1652,7 +1924,8 @@ class NonlinAttention(nn.Module): # starting from about 3, and poorly-trained instances of the module have smaller abs values # before the sigmoid. self.balancer = Balancer( - hidden_channels, channel_dim=-1, + hidden_channels, + channel_dim=-1, min_positive=ScheduledFloat((0.0, 0.25), (20000.0, 0.05)), max_positive=ScheduledFloat((0.0, 0.75), (20000.0, 0.95)), min_abs=0.5, @@ -1664,35 +1937,36 @@ class NonlinAttention(nn.Module): self.identity2 = Identity() # for diagnostics. self.identity3 = Identity() # for diagnostics. - self.out_proj = ScaledLinear(hidden_channels, channels, - bias=True, - initial_scale=0.05) + self.out_proj = ScaledLinear( + hidden_channels, channels, bias=True, initial_scale=0.05 + ) + self.whiten1 = Whiten( + num_groups=1, + whitening_limit=_whitening_schedule(5.0), + prob=(0.025, 0.25), + grad_scale=0.01, + ) + self.whiten2 = Whiten( + num_groups=1, + whitening_limit=_whitening_schedule(5.0, ratio=3.0), + prob=(0.025, 0.25), + grad_scale=0.01, + ) - self.whiten1 = Whiten(num_groups=1, - whitening_limit=_whitening_schedule(5.0), - prob=(0.025, 0.25), - grad_scale=0.01) - - self.whiten2 = Whiten(num_groups=1, - whitening_limit=_whitening_schedule(5.0, ratio=3.0), - prob=(0.025, 0.25), - grad_scale=0.01) - - - def forward(self, - x: Tensor, - attn_weights: Tensor, + def forward( + self, + x: Tensor, + attn_weights: Tensor, ) -> Tensor: """. 
@@ -1637,9 +1909,9 @@ class NonlinAttention(nn.Module):
     """

     def __init__(
-            self,
-            channels: int,
-            hidden_channels: int,
+        self,
+        channels: int,
+        hidden_channels: int,
     ) -> None:
         super().__init__()
@@ -1652,7 +1924,8 @@ class NonlinAttention(nn.Module):
         # starting from about 3, and poorly-trained instances of the module have smaller abs values
         # before the sigmoid.
         self.balancer = Balancer(
-            hidden_channels, channel_dim=-1,
+            hidden_channels,
+            channel_dim=-1,
             min_positive=ScheduledFloat((0.0, 0.25), (20000.0, 0.05)),
             max_positive=ScheduledFloat((0.0, 0.75), (20000.0, 0.95)),
             min_abs=0.5,
@@ -1664,35 +1937,36 @@ class NonlinAttention(nn.Module):
         self.identity2 = Identity()  # for diagnostics.
         self.identity3 = Identity()  # for diagnostics.

-        self.out_proj = ScaledLinear(hidden_channels, channels,
-                                     bias=True,
-                                     initial_scale=0.05)
+        self.out_proj = ScaledLinear(
+            hidden_channels, channels, bias=True, initial_scale=0.05
+        )
+
+        self.whiten1 = Whiten(
+            num_groups=1,
+            whitening_limit=_whitening_schedule(5.0),
+            prob=(0.025, 0.25),
+            grad_scale=0.01,
+        )
+
+        self.whiten2 = Whiten(
+            num_groups=1,
+            whitening_limit=_whitening_schedule(5.0, ratio=3.0),
+            prob=(0.025, 0.25),
+            grad_scale=0.01,
+        )

-        self.whiten1 = Whiten(num_groups=1,
-                              whitening_limit=_whitening_schedule(5.0),
-                              prob=(0.025, 0.25),
-                              grad_scale=0.01)
-
-        self.whiten2 = Whiten(num_groups=1,
-                              whitening_limit=_whitening_schedule(5.0, ratio=3.0),
-                              prob=(0.025, 0.25),
-                              grad_scale=0.01)
-
-
-    def forward(self,
-                x: Tensor,
-                attn_weights: Tensor,
+    def forward(
+        self,
+        x: Tensor,
+        attn_weights: Tensor,
     ) -> Tensor:
         """.
-        Args:
-           x: a Tensor of shape (seq_len, batch_size, num_channels)
-attn_weights: a Tensor of shape (num_heads, batch_size, seq_len, seq_len)
-        Returns:
-           a Tensor with the same shape as x
+        Args:
+            x: a Tensor of shape (seq_len, batch_size, num_channels)
+            attn_weights: a Tensor of shape (num_heads, batch_size, seq_len, seq_len)
+        Returns:
+            a Tensor with the same shape as x
         """
-        num_channels = x.shape[-1]
         x = self.in_proj(x)

         (seq_len, batch_size, _) = x.shape
@@ -1720,7 +1994,6 @@ class NonlinAttention(nn.Module):
         # now x: (num_heads, batch_size, seq_len, head_dim)
         x = x.permute(2, 1, 0, 3).reshape(seq_len, batch_size, -1)

-
         y = self.identity2(y)
         x = x * y
         x = self.identity3(x)
@@ -1729,6 +2002,67 @@ class NonlinAttention(nn.Module):
         x = self.whiten2(x)
         return x

+    def streaming_forward(
+        self,
+        x: Tensor,
+        attn_weights: Tensor,
+        cached_x: Tensor,
+        left_context_len: int,
+    ) -> Tuple[Tensor, Tensor]:
+        """.
+        Args:
+            x: a Tensor of shape (seq_len, batch_size, num_channels)
+            attn_weights: a Tensor of shape (num_heads, batch_size, seq_len, seq_len)
+            cached_x: left context, a Tensor of shape
+              (num_heads, batch_size, left_context_len, head_dim)
+            left_context_len: number of left context frames.
+        Returns:
+            - a Tensor with the same shape as x
+            - updated left context with the same shape as cached_x
+        """
+        x = self.in_proj(x)
+
+        (seq_len, batch_size, _) = x.shape
+        hidden_channels = self.hidden_channels
+
+        s, x, y = x.chunk(3, dim=-1)
+
+        # s will go through tanh.
+        s = self.tanh(s)
+
+        s = s.unsqueeze(-1).reshape(seq_len, batch_size, hidden_channels)
+        x = x * s
+
+        (seq_len, batch_size, embed_dim) = x.shape
+        num_heads = attn_weights.shape[0]
+        assert attn_weights.shape == (
+            num_heads,
+            batch_size,
+            seq_len,
+            left_context_len + seq_len,
+        )
+
+        x = x.reshape(seq_len, batch_size, num_heads, -1).permute(2, 1, 0, 3)
+        # now x: (num_heads, batch_size, seq_len, head_dim)
+
+        # Pad with the cached tensor
+        assert cached_x.shape[2] == left_context_len, (
+            cached_x.shape[2],
+            left_context_len,
+        )
+        x_pad = torch.cat([cached_x, x], dim=2)
+        # Update the cached tensor
+        cached_x = x_pad[:, :, -left_context_len:, :]
+
+        x = torch.matmul(attn_weights, x_pad)
+        # now x: (num_heads, batch_size, seq_len, head_dim)
+        x = x.permute(2, 1, 0, 3).reshape(seq_len, batch_size, -1)
+
+        x = x * y
+
+        x = self.out_proj(x)
+        return x, cached_x
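NonlinAttention reuses the attention weights computed by RelPositionMultiheadAttentionWeights, but applies them to a gated projection rather than to plain values. Stripped of the balancers, whiteners and identity hooks, the non-streaming data flow is roughly the following (a sketch under the shapes in the docstring, not the module itself):

```python
import torch

def nonlin_attention_sketch(x, attn_weights, in_proj, out_proj):
    # x: (seq_len, batch, channels); attn_weights: (num_heads, batch, seq_len, seq_len)
    seq_len, batch, _ = x.shape
    num_heads = attn_weights.shape[0]

    s, v, y = in_proj(x).chunk(3, dim=-1)  # three (seq_len, batch, hidden) slices
    v = v * torch.tanh(s)                  # first, pointwise tanh gate

    v = v.reshape(seq_len, batch, num_heads, -1).permute(2, 1, 0, 3)
    v = torch.matmul(attn_weights, v)      # mix time steps with the shared attention
    v = v.permute(2, 1, 0, 3).reshape(seq_len, batch, -1)

    return out_proj(v * y)                 # second gate, then project back to channels
```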


class ConvolutionModule(nn.Module):
    """ConvolutionModule in Zipformer2 model.
@@ -1740,8 +2074,12 @@ class ConvolutionModule(nn.Module):
         bias (bool): Whether to use bias in conv layers (default=True).
     """
+
     def __init__(
-        self, channels: int, kernel_size: int, causal: bool,
+        self,
+        channels: int,
+        kernel_size: int,
+        causal: bool,
     ) -> None:
         """Construct a ConvolutionModule object."""
         super(ConvolutionModule, self).__init__()
@@ -1752,15 +2090,17 @@ class ConvolutionModule(nn.Module):
         self.causal = causal

         self.in_proj = nn.Linear(
-            channels, 2 * bottleneck_dim,
+            channels,
+            2 * bottleneck_dim,
         )
         # the gradients on in_proj are a little noisy, likely to do with the
         # sigmoid in glu.

         # after in_proj we put x through a gated linear unit (nn.functional.glu).
-        # For most layers the normal rms value of channels of x seems to be in the range 1 to 4,
-        # but sometimes, for some reason, for layer 0 the rms ends up being very large,
-        # between 50 and 100 for different channels.  This will cause very peaky and
+        # For most layers the normal rms value of channels of x seems to be in
+        # the range 1 to 4, but sometimes, for some reason, for layer 0 the
+        # rms ends up being very large, between 50 and 100 for different channels.
+        # This will cause very peaky and
         # sparse derivatives for the sigmoid gating function, which will tend to make
         # the loss function not learn effectively.  (for most layers the average absolute values
         # are in the range 0.5..9.0, and the average p(x>0), i.e. positive proportion,
@@ -1771,52 +2111,63 @@ class ConvolutionModule(nn.Module):
         # it will be in a better position to start learning something, i.e. to latch onto
         # the correct range.
         self.balancer1 = Balancer(
-            bottleneck_dim, channel_dim=-1,
+            bottleneck_dim,
+            channel_dim=-1,
             min_positive=ScheduledFloat((0.0, 0.05), (8000.0, 0.025)),
             max_positive=1.0,
             min_abs=1.5,
             max_abs=ScheduledFloat((0.0, 5.0), (8000.0, 10.0), default=1.0),
         )

-        self.activation1 = Identity() # for diagnostics
+        self.activation1 = Identity()  # for diagnostics

         self.sigmoid = nn.Sigmoid()

-        self.activation2 = Identity() # for diagnostics
+        self.activation2 = Identity()  # for diagnostics

         assert kernel_size % 2 == 1

-        self.depthwise_conv = ChunkCausalDepthwiseConv1d(
-            channels=bottleneck_dim,
-            kernel_size=kernel_size) if causal else nn.Conv1d(
-            in_channels=bottleneck_dim,
-            out_channels=bottleneck_dim,
-            groups=bottleneck_dim,
-            kernel_size=kernel_size,
-            padding=kernel_size // 2)
+        self.depthwise_conv = (
+            ChunkCausalDepthwiseConv1d(channels=bottleneck_dim, kernel_size=kernel_size)
+            if causal
+            else nn.Conv1d(
+                in_channels=bottleneck_dim,
+                out_channels=bottleneck_dim,
+                groups=bottleneck_dim,
+                kernel_size=kernel_size,
+                padding=kernel_size // 2,
+            )
+        )

         self.balancer2 = Balancer(
-            bottleneck_dim, channel_dim=1,
+            bottleneck_dim,
+            channel_dim=1,
             min_positive=ScheduledFloat((0.0, 0.1), (8000.0, 0.05)),
             max_positive=1.0,
             min_abs=ScheduledFloat((0.0, 0.2), (20000.0, 0.5)),
             max_abs=10.0,
         )

-        self.whiten = Whiten(num_groups=1,
-                             whitening_limit=_whitening_schedule(7.5),
-                             prob=(0.025, 0.25),
-                             grad_scale=0.01)
-
-        self.out_proj = ActivationDropoutAndLinear(
-            bottleneck_dim, channels, activation='SwooshR',
-            dropout_p=0.0, initial_scale=0.05,
+        self.whiten = Whiten(
+            num_groups=1,
+            whitening_limit=_whitening_schedule(7.5),
+            prob=(0.025, 0.25),
+            grad_scale=0.01,
         )

-    def forward(self,
-                x: Tensor,
-                src_key_padding_mask: Optional[Tensor] = None,
-                chunk_size: int = -1,
+        self.out_proj = ActivationDropoutAndLinear(
+            bottleneck_dim,
+            channels,
+            activation="SwooshR",
+            dropout_p=0.0,
+            initial_scale=0.05,
+        )
+
+    def forward(
+        self,
+        x: Tensor,
+        src_key_padding_mask: Optional[Tensor] = None,
+        chunk_size: int = -1,
     ) -> Tensor:
         """Compute convolution module.
@@ -1847,20 +2198,68 @@ class ConvolutionModule(nn.Module):
         if src_key_padding_mask is not None:
             x = x.masked_fill(src_key_padding_mask.unsqueeze(1).expand_as(x), 0.0)

-        if chunk_size >= 0:
-            assert self.causal, "Must initialize model with causal=True if you use chunk_size"
+        if (
+            not torch.jit.is_scripting()
+            and not torch.jit.is_tracing()
+            and chunk_size >= 0
+        ):
+            # Simulated streaming decoding is not supported when exporting the model.
+            assert (
+                self.causal
+            ), "Must initialize model with causal=True if you use chunk_size"
             x = self.depthwise_conv(x, chunk_size=chunk_size)
         else:
             x = self.depthwise_conv(x)

         x = self.balancer2(x)
-        x = x.permute(2, 0, 1) # (time, batch, channels)
+        x = x.permute(2, 0, 1)  # (time, batch, channels)

         x = self.whiten(x)  # (time, batch, channels)
         x = self.out_proj(x)  # (time, batch, channels)

         return x
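The streaming variant below relies on `ChunkCausalDepthwiseConv1d` (defined elsewhere in this recipe) carrying a left-pad cache between chunks. The padding idea in isolation, using a plain depthwise convolution (an illustration of the caching scheme, not that class's actual implementation):

```python
import torch
import torch.nn.functional as F

channels, kernel_size = 8, 5
left_pad = kernel_size - 1                      # causal: outputs look only at the past
weight = torch.randn(channels, 1, kernel_size)  # depthwise: groups == channels
x = torch.randn(1, channels, 20)                # (batch, channels, time)

# Non-streaming reference: pad the whole utterance on the left once.
y_full = F.conv1d(F.pad(x, (left_pad, 0)), weight, groups=channels)

# Streaming: process chunks, carrying the last `left_pad` frames as the cache.
cache = torch.zeros(1, channels, left_pad)
outs = []
for chunk in x.split(10, dim=2):
    padded = torch.cat([cache, chunk], dim=2)
    outs.append(F.conv1d(padded, weight, groups=channels))
    cache = padded[:, :, -left_pad:]

assert torch.allclose(torch.cat(outs, dim=2), y_full, atol=1e-5)
```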
+    def streaming_forward(
+        self,
+        x: Tensor,
+        cache: Tensor,
+        src_key_padding_mask: Tensor,
+    ) -> Tuple[Tensor, Tensor]:
+        """Compute convolution module in streaming forward mode.
+
+        Args:
+            x: Input tensor (#time, batch, channels).
+            cache: cached left context for depthwise_conv, of shape
+              (#batch, channels, left_pad)
+            src_key_padding_mask: the mask for the src keys per batch:
+              (batch, #time), contains True in masked positions.
+
+        Returns:
+            - Output tensor (#time, batch, channels).
+            - Updated cache (#batch, channels, left_pad)
+        """
+        x = self.in_proj(x)  # (time, batch, 2*channels)
+
+        x, s = x.chunk(2, dim=2)
+        s = self.sigmoid(s)
+        x = x * s
+        # (time, batch, channels)
+
+        # exchange the temporal dimension and the feature dimension
+        x = x.permute(1, 2, 0)  # (#batch, channels, time).
+
+        if src_key_padding_mask is not None:
+            x = x.masked_fill(src_key_padding_mask.unsqueeze(1).expand_as(x), 0.0)
+
+        x, cache = self.depthwise_conv.streaming_forward(x, cache=cache)
+
+        x = x.permute(2, 0, 1)  # (time, batch, channels)
+
+        x = self.out_proj(x)  # (time, batch, channels)
+
+        return x, cache
+

class ScalarMultiply(nn.Module):
    def __init__(self, scale: float):
@@ -1878,7 +2277,9 @@ def _test_zipformer_main(causal: bool = False):
     memory_dim = 100

     c = Zipformer2(
-        encoder_dim=(64, 96), encoder_unmasked_dim=(48, 64), num_heads=(4, 4),
+        encoder_dim=(64, 96),
+        encoder_unmasked_dim=(48, 64),
+        num_heads=(4, 4),
         causal=causal,
         chunk_size=(4,) if causal else (-1,),
         left_context_frames=(64,),