diff --git a/egs/librispeech/ASR/zipformer/zipformer.py b/egs/librispeech/ASR/zipformer/zipformer.py
index 612356a50..d25c6e5ff 100644
--- a/egs/librispeech/ASR/zipformer/zipformer.py
+++ b/egs/librispeech/ASR/zipformer/zipformer.py
@@ -133,7 +133,6 @@ class Zipformer2(EncoderInterface):
         self.encoder_dim = encoder_dim = _to_tuple(encoder_dim) # tuple
         self.encoder_unmasked_dim = encoder_unmasked_dim = _to_tuple(encoder_unmasked_dim) # tuple
         num_encoder_layers = _to_tuple(num_encoder_layers)
-        self.num_encoder_layers = num_encoder_layers
         self.query_head_dim = query_head_dim = _to_tuple(query_head_dim)
         self.value_head_dim = value_head_dim = _to_tuple(value_head_dim)
         pos_head_dim = _to_tuple(pos_head_dim)
@@ -259,7 +258,7 @@ class Zipformer2(EncoderInterface):
         if not self.causal:
             return -1, -1
 
-        if torch.jit.is_scripting() or torch.jit.is_tracing():
+        if torch.jit.is_scripting():
             assert len(self.chunk_size) == 1, self.chunk_size
             chunk_size = self.chunk_size[0]
         else:
@@ -268,7 +267,7 @@ class Zipformer2(EncoderInterface):
         if chunk_size == -1:
             left_context_chunks = -1
         else:
-            if torch.jit.is_scripting() or torch.jit.is_tracing():
+            if torch.jit.is_scripting():
                 assert len(self.left_context_frames) == 1, self.left_context_frames
                 left_context_frames = self.left_context_frames[0]
             else:
@@ -302,14 +301,14 @@ class Zipformer2(EncoderInterface):
               of frames in `embeddings` before padding.
         """
         outputs = []
-        if torch.jit.is_scripting() or torch.jit.is_tracing():
+        if torch.jit.is_scripting():
             feature_masks = [1.0] * len(self.encoder_dim)
         else:
             feature_masks = self.get_feature_masks(x)
 
         chunk_size, left_context_chunks = self.get_chunk_info()
 
-        if torch.jit.is_scripting() or torch.jit.is_tracing():
+        if torch.jit.is_scripting():
             # Not support exporting a model for simulating streaming decoding
             attn_mask = None
         else:
@@ -335,7 +334,7 @@ class Zipformer2(EncoderInterface):
         x = self.downsample_output(x)
         # class Downsample has this rounding behavior..
         assert self.output_downsampling_factor == 2
-        if torch.jit.is_scripting() or torch.jit.is_tracing():
+        if torch.jit.is_scripting():
             lengths = (x_lens + 1) // 2
         else:
             with warnings.catch_warnings():
@@ -373,7 +372,7 @@ class Zipformer2(EncoderInterface):
         # t is frame index, shape (seq_len,)
         t = torch.arange(seq_len, dtype=torch.int32, device=x.device)
         # c is chunk index for each frame, shape (seq_len,)
-        if torch.jit.is_scripting() or torch.jit.is_tracing():
+        if torch.jit.is_scripting():
             c = t // chunk_size
         else:
             with warnings.catch_warnings():
@@ -545,13 +544,15 @@ class Zipformer2EncoderLayer(nn.Module):
             bypass_skip_rate: FloatLike = ScheduledFloat((0.0, 0.5), (4000.0, 0.02), default=0),
     ) -> None:
         super(Zipformer2EncoderLayer, self).__init__()
+        embed_dim = embed_dim >> 1
         self.embed_dim = embed_dim
 
         # self.bypass implements layer skipping as well as bypass; see its default values.
-        self.bypass = BypassModule(embed_dim, skip_rate=bypass_skip_rate,
+        self.bypass = BypassModule(embed_dim * 2, skip_rate=bypass_skip_rate,
                                    straight_through_rate=0)
         # bypass_mid is bypass used in the middle of the layer.
-        self.bypass_mid = BypassModule(embed_dim, straight_through_rate=0)
+        self.bypass_mid_a = BypassModule(embed_dim, straight_through_rate=0)
+        self.bypass_mid_b = BypassModule(embed_dim, straight_through_rate=0)
 
         # skip probability for dynamic modules (meaning: anything but feedforward).
         self.attention_skip_rate = copy.deepcopy(attention_skip_rate)
@@ -566,48 +567,71 @@ class Zipformer2EncoderLayer(nn.Module):
 
         self.const_attention_rate = copy.deepcopy(const_attention_rate)
 
-        self.self_attn_weights = RelPositionMultiheadAttentionWeights(
+        self.cross_attn_weights_a = RelPositionMultiheadAttentionWeights(
+            embed_dim, pos_dim=pos_dim, num_heads=num_heads,
+            query_head_dim=query_head_dim, pos_head_dim=pos_head_dim,
+            dropout=0.0,
+        )
+        self.cross_attn_weights_b = RelPositionMultiheadAttentionWeights(
             embed_dim, pos_dim=pos_dim, num_heads=num_heads,
             query_head_dim=query_head_dim, pos_head_dim=pos_head_dim,
             dropout=0.0,
         )
 
-        self.self_attn1 = SelfAttention(embed_dim, num_heads,
+        self.cross_attn1_a = CrossAttention(embed_dim, num_heads,
+                                        value_head_dim)
+        self.cross_attn1_b = CrossAttention(embed_dim, num_heads,
                                         value_head_dim)
 
-        self.self_attn2 = SelfAttention(embed_dim, num_heads,
+        self.cross_attn2_a = CrossAttention(embed_dim, num_heads,
+                                        value_head_dim)
+        self.cross_attn2_b = CrossAttention(embed_dim, num_heads,
                                         value_head_dim)
 
-        self.feed_forward1 = FeedforwardModule(embed_dim,
+        self.feed_forward1_a = FeedforwardModule(embed_dim,
+                                               (feedforward_dim * 3) // 4,
+                                               dropout)
+        self.feed_forward1_b = FeedforwardModule(embed_dim,
                                                (feedforward_dim * 3) // 4,
                                                dropout)
 
-        self.feed_forward2 = FeedforwardModule(embed_dim,
+        self.feed_forward2_a = FeedforwardModule(embed_dim,
+                                               feedforward_dim,
+                                               dropout)
+        self.feed_forward2_b = FeedforwardModule(embed_dim,
                                                feedforward_dim,
                                                dropout)
 
-        self.feed_forward3 = FeedforwardModule(embed_dim,
+        self.feed_forward3_a = FeedforwardModule(embed_dim,
+                                               (feedforward_dim * 5) // 4,
+                                               dropout)
+        self.feed_forward3_b = FeedforwardModule(embed_dim,
                                                (feedforward_dim * 5) // 4,
                                                dropout)
 
-        self.nonlin_attention = NonlinAttention(embed_dim,
+        self.nonlin_attention_a = NonlinAttention(embed_dim,
+                                                hidden_channels=3 * embed_dim // 4)
+        self.nonlin_attention_b = NonlinAttention(embed_dim,
                                                 hidden_channels=3 * embed_dim // 4)
 
-        self.conv_module1 = ConvolutionModule(embed_dim,
+        self.conv_module1_a = ConvolutionModule(embed_dim,
+                                              cnn_module_kernel,
+                                              causal=causal)
+        self.conv_module1_b = ConvolutionModule(embed_dim,
                                               cnn_module_kernel,
                                               causal=causal)
 
-        self.conv_module2 = ConvolutionModule(embed_dim,
+        self.conv_module2_a = ConvolutionModule(embed_dim,
+                                              cnn_module_kernel,
+                                              causal=causal)
+        self.conv_module2_b = ConvolutionModule(embed_dim,
                                               cnn_module_kernel,
                                               causal=causal)
 
-        # TODO: remove it
-        self.bypass_scale = nn.Parameter(torch.full((embed_dim,), 0.5))
-
-        self.norm = BiasNorm(embed_dim)
+        self.norm = BiasNorm(embed_dim * 2)
 
         self.balancer1 = Balancer(
-            embed_dim, channel_dim=-1,
+            embed_dim * 2, channel_dim=-1,
             min_positive=0.45, max_positive=0.55,
             min_abs=0.2, max_abs=4.0,
         )
@@ -645,13 +669,13 @@ class Zipformer2EncoderLayer(nn.Module):
                              grad_scale=0.01)
 
         self.balancer2 = Balancer(
-            embed_dim, channel_dim=-1,
+            embed_dim * 2, channel_dim=-1,
             min_positive=0.45, max_positive=0.55,
             min_abs=0.1, max_abs=4.0,
         )
 
     def get_sequence_dropout_mask(self, x: Tensor, dropout_rate: float) -> Optional[Tensor]:
-        if dropout_rate == 0.0 or not self.training or torch.jit.is_scripting() or torch.jit.is_tracing():
+        if dropout_rate == 0.0 or not self.training or torch.jit.is_scripting():
             return None
         batch_size = x.shape[1]
         mask = (torch.rand(batch_size, 1, device=x.device) > dropout_rate).to(x.dtype)
@@ -693,86 +717,120 @@ class Zipformer2EncoderLayer(nn.Module):
         Returns:
            A tensor which has the same shape as src
         """
-        src_orig = src
+        src_a, src_b = torch.split(src, self.embed_dim, 2)
+        src_orig_a, src_orig_b = src_a, src_b
 
         # dropout rate for non-feedforward submodules
-        if torch.jit.is_scripting() or torch.jit.is_tracing():
+        if torch.jit.is_scripting():
             attention_skip_rate = 0.0
         else:
             attention_skip_rate = float(self.attention_skip_rate) if self.training else 0.0
 
         # attn_weights: (num_heads, batch_size, seq_len, seq_len)
-        attn_weights = self.self_attn_weights(
-            src,
+        attn_weights_a = self.cross_attn_weights_a(
+            src_a,
+            src_b,
+            pos_emb=pos_emb,
+            attn_mask=attn_mask,
+            key_padding_mask=src_key_padding_mask,
+        )
+        attn_weights_b = self.cross_attn_weights_b(
+            src_b,
+            src_a,
             pos_emb=pos_emb,
             attn_mask=attn_mask,
             key_padding_mask=src_key_padding_mask,
         )
 
-        src = src + self.feed_forward1(src)
+        src_a = src_a + self.feed_forward1_a(src_a)
+        src_b = src_b + self.feed_forward1_b(src_b)
 
-        self_attn_dropout_mask = self.get_sequence_dropout_mask(src, attention_skip_rate)
+        cross_attn_dropout_mask = self.get_sequence_dropout_mask(src_a, attention_skip_rate)
 
-        selected_attn_weights = attn_weights[0:1]
-        if torch.jit.is_scripting() or torch.jit.is_tracing():
+        selected_attn_weights_a = attn_weights_a[0:1]
+        selected_attn_weights_b = attn_weights_b[0:1]
+        if torch.jit.is_scripting():
             pass
         elif not self.training and random.random() < float(self.const_attention_rate):
             # Make attention weights constant.  The intention is to
             # encourage these modules to do something similar to an
             # averaging-over-time operation.
             # only need the mask, can just use the 1st one and expand later
-            selected_attn_weights = selected_attn_weights[0:1]
-            selected_attn_weights = (selected_attn_weights > 0.0).to(selected_attn_weights.dtype)
-            selected_attn_weights = selected_attn_weights * (1.0 / selected_attn_weights.sum(dim=-1, keepdim=True))
+            selected_attn_weights_a = selected_attn_weights_a[0:1]
+            selected_attn_weights_b = selected_attn_weights_b[0:1]
+            selected_attn_weights_a = (selected_attn_weights_a > 0.0).to(selected_attn_weights_a.dtype)
+            selected_attn_weights_b = (selected_attn_weights_b > 0.0).to(selected_attn_weights_b.dtype)
+            selected_attn_weights_a = selected_attn_weights_a * (1.0 / selected_attn_weights_a.sum(dim=-1, keepdim=True))
+            selected_attn_weights_b = selected_attn_weights_b * (1.0 / selected_attn_weights_b.sum(dim=-1, keepdim=True))
 
-        na = self.balancer_na(self.nonlin_attention(src, selected_attn_weights))
+        na_a = self.balancer_na(self.nonlin_attention_a(src_b, selected_attn_weights_a))
+        na_b = self.balancer_na(self.nonlin_attention_b(src_a, selected_attn_weights_b))
 
-        src = src + (na if self_attn_dropout_mask is None else na * self_attn_dropout_mask)
+        src_a = src_a + (na_a if cross_attn_dropout_mask is None else na_a * cross_attn_dropout_mask)
+        src_b = src_b + (na_b if cross_attn_dropout_mask is None else na_b * cross_attn_dropout_mask)
 
-        self_attn = self.self_attn1(src, attn_weights)
+        cross_attn_a = self.cross_attn1_a(src_b, attn_weights_a)
+        cross_attn_b = self.cross_attn1_b(src_a, attn_weights_b)
 
-        src = src + (self_attn if self_attn_dropout_mask is None else self_attn * self_attn_dropout_mask)
+        src_a = src_a + (cross_attn_a if cross_attn_dropout_mask is None else cross_attn_a * cross_attn_dropout_mask)
+        src_b = src_b + (cross_attn_b if cross_attn_dropout_mask is None else cross_attn_b * cross_attn_dropout_mask)
 
-        if torch.jit.is_scripting() or torch.jit.is_tracing():
+        if torch.jit.is_scripting():
             conv_skip_rate = 0.0
         else:
             conv_skip_rate = float(self.conv_skip_rate) if self.training else 0.0
-        src = src + self.sequence_dropout(self.conv_module1(src, chunk_size=chunk_size,
+        src_a = src_a + self.sequence_dropout(self.conv_module1_a(src_a, chunk_size=chunk_size,
+                                                            src_key_padding_mask=src_key_padding_mask),
+                                          conv_skip_rate)
+        src_b = src_b + self.sequence_dropout(self.conv_module1_b(src_b, chunk_size=chunk_size,
                                                             src_key_padding_mask=src_key_padding_mask),
                                           conv_skip_rate)
 
-        if torch.jit.is_scripting() or torch.jit.is_tracing():
+        if torch.jit.is_scripting():
             ff2_skip_rate = 0.0
         else:
             ff2_skip_rate = float(self.ff2_skip_rate) if self.training else 0.0
-        src = src + self.sequence_dropout(self.balancer_ff2(self.feed_forward2(src)),
+        src_a = src_a + self.sequence_dropout(self.balancer_ff2(self.feed_forward2_a(src_a)),
+                                          ff2_skip_rate)
+        src_b = src_b + self.sequence_dropout(self.balancer_ff2(self.feed_forward2_b(src_b)),
                                           ff2_skip_rate)
 
         # bypass in the middle of the layer.
-        src = self.bypass_mid(src_orig, src)
+        src_a = self.bypass_mid_a(src_orig_a, src_a)
+        src_b = self.bypass_mid_b(src_orig_b, src_b)
 
-        self_attn = self.self_attn2(src, attn_weights)
+        cross_attn_a = self.cross_attn2_a(src_b, attn_weights_a)
+        cross_attn_b = self.cross_attn2_b(src_a, attn_weights_b)
 
-        src = src + (self_attn if self_attn_dropout_mask is None else self_attn * self_attn_dropout_mask)
+        src_a = src_a + (cross_attn_a if cross_attn_dropout_mask is None else cross_attn_a * cross_attn_dropout_mask)
+        src_b = src_b + (cross_attn_b if cross_attn_dropout_mask is None else cross_attn_b * cross_attn_dropout_mask)
 
-        if torch.jit.is_scripting() or torch.jit.is_tracing():
+        if torch.jit.is_scripting():
             conv_skip_rate = 0.0
         else:
             conv_skip_rate = float(self.conv_skip_rate) if self.training else 0.0
-        src = src + self.sequence_dropout(self.conv_module2(src, chunk_size=chunk_size,
+        src_a = src_a + self.sequence_dropout(self.conv_module2_a(src_a, chunk_size=chunk_size,
+                                                            src_key_padding_mask=src_key_padding_mask),
+                                          conv_skip_rate)
+        src_b = src_b + self.sequence_dropout(self.conv_module2_b(src_b, chunk_size=chunk_size,
                                                             src_key_padding_mask=src_key_padding_mask),
                                           conv_skip_rate)
 
-        if torch.jit.is_scripting() or torch.jit.is_tracing():
+        if torch.jit.is_scripting():
             ff3_skip_rate = 0.0
         else:
             ff3_skip_rate = float(self.ff3_skip_rate) if self.training else 0.0
-        src = src + self.sequence_dropout(self.balancer_ff3(self.feed_forward3(src)),
+        src_a = src_a + self.sequence_dropout(self.balancer_ff3(self.feed_forward3_a(src_a)),
                                           ff3_skip_rate)
+        src_b = src_b + self.sequence_dropout(self.balancer_ff3(self.feed_forward3_b(src_b)),
+                                          ff3_skip_rate)
+        
+        src = torch.cat([src_a, src_b], 2)
 
         src = self.balancer1(src)
         src = self.norm(src)
 
+        src_orig = torch.cat([src_orig_a, src_orig_b], 2)
         src = self.bypass(src_orig, src)
 
         src = self.balancer2(src)
@@ -828,7 +886,7 @@ class Zipformer2EncoderLayer(nn.Module):
         src_orig = src
 
         # attn_weights: (num_heads, batch_size, seq_len, seq_len)
-        attn_weights, cached_key = self.self_attn_weights.streaming_forward(
+        attn_weights, cached_key = self.cross_attn_weights.streaming_forward(
             src,
             pos_emb=pos_emb,
             cached_key=cached_key,
@@ -846,13 +904,13 @@ class Zipformer2EncoderLayer(nn.Module):
         )
         src = src + na
 
-        self_attn, cached_val1 = self.self_attn1.streaming_forward(
+        cross_attn, cached_val1 = self.cross_attn1.streaming_forward(
             src,
             attn_weights=attn_weights,
             cached_val=cached_val1,
             left_context_len=left_context_len,
         )
-        src = src + self_attn
+        src = src + cross_attn
 
         src_conv, cached_conv1 = self.conv_module1.streaming_forward(
             src,
@@ -866,13 +924,13 @@ class Zipformer2EncoderLayer(nn.Module):
         # bypass in the middle of the layer.
         src = self.bypass_mid(src_orig, src)
 
-        self_attn, cached_val2 = self.self_attn2.streaming_forward(
+        cross_attn, cached_val2 = self.cross_attn2.streaming_forward(
             src,
             attn_weights=attn_weights,
             cached_val=cached_val2,
             left_context_len=left_context_len,
         )
-        src = src + self_attn
+        src = src + cross_attn
 
         src_conv, cached_conv2 = self.conv_module2.streaming_forward(
             src,
@@ -969,7 +1027,7 @@ class Zipformer2Encoder(nn.Module):
         pos_emb = self.encoder_pos(src)
         output = src
 
-        if not torch.jit.is_scripting() and not torch.jit.is_tracing():
+        if not torch.jit.is_scripting():
             output = output * feature_mask
 
         for i, mod in enumerate(self.layers):
@@ -981,7 +1039,7 @@ class Zipformer2Encoder(nn.Module):
                 src_key_padding_mask=src_key_padding_mask,
             )
 
-            if not torch.jit.is_scripting() and not torch.jit.is_tracing():
+            if not torch.jit.is_scripting():
                 output = output * feature_mask
 
         return output
@@ -1074,7 +1132,7 @@ class BypassModule(nn.Module):
         # or (batch_size, num_channels,).  This is actually the
         # scale on the non-residual term, so 0 correponds to bypassing
         # this module.
-        if torch.jit.is_scripting() or torch.jit.is_tracing() or not self.training:
+        if torch.jit.is_scripting() or not self.training:
             return self.bypass_scale
         else:
             ans = limit_param_value(self.bypass_scale,
@@ -1230,11 +1288,12 @@ class SimpleDownsample(torch.nn.Module):
         d_seq_len = (seq_len + ds - 1) // ds
 
         # Pad to an exact multiple of self.downsample
-        # right-pad src, repeating the last element.
-        pad = d_seq_len * ds - seq_len
-        src_extra = src[src.shape[0]-1:].expand(pad, src.shape[1], src.shape[2])
-        src = torch.cat((src, src_extra), dim=0)
-        assert src.shape[0] == d_seq_len * ds
+        if seq_len != d_seq_len * ds:
+            # right-pad src, repeating the last element.
+            pad = d_seq_len * ds - seq_len
+            src_extra = src[src.shape[0]-1:].expand(pad, src.shape[1], src.shape[2])
+            src = torch.cat((src, src_extra), dim=0)
+            assert src.shape[0] == d_seq_len * ds
 
         src = src.reshape(d_seq_len, ds, batch_size, in_channels)
 
@@ -1322,7 +1381,11 @@ class CompactRelPositionalEncoding(torch.nn.Module):
             # self.pe contains both positive and negative parts
             # the length of self.pe is 2 * input_len - 1
             if self.pe.size(0) >= T * 2 - 1:
-                self.pe = self.pe.to(dtype=x.dtype, device=x.device)
+                # Note: TorchScript doesn't implement operator== for torch.device, so the string forms are compared instead.
+                if self.pe.dtype != x.dtype or str(self.pe.device) != str(
+                    x.device
+                ):
+                    self.pe = self.pe.to(dtype=x.dtype, device=x.device)
                 return
 
         # if T == 4, x would contain [ -3, -2, 1, 0, 1, 2, 3 ]
@@ -1434,7 +1497,9 @@ class RelPositionMultiheadAttentionWeights(nn.Module):
         # dividing it between the query and key.   Note: this module is intended
         # to be used with the ScaledAdam optimizer; with most other optimizers,
         # it would be necessary to apply the scaling factor in the forward function.
-        self.in_proj = ScaledLinear(embed_dim, in_proj_dim, bias=True,
+        self.in_proj_a = ScaledLinear(embed_dim, in_proj_dim, bias=True,
+                                    initial_scale=query_head_dim**-0.25)
+        self.in_proj_b = ScaledLinear(embed_dim, in_proj_dim, bias=True,
                                     initial_scale=query_head_dim**-0.25)
 
         self.whiten_keys = Whiten(num_groups=num_heads,
@@ -1471,6 +1536,7 @@ class RelPositionMultiheadAttentionWeights(nn.Module):
     def forward(
         self,
         x: Tensor,
+        y: Tensor,
         pos_emb: Tensor,
         key_padding_mask: Optional[Tensor] = None,
         attn_mask: Optional[Tensor] = None,
@@ -1478,6 +1544,7 @@ class RelPositionMultiheadAttentionWeights(nn.Module):
         r"""
         Args:
             x: input of shape (seq_len, batch_size, embed_dim)
+            y: input of shape (seq_len, batch_size, embed_dim); the keys are taken from y, the queries and position queries from x
             pos_emb: Positional embedding tensor, of shape (1, 2*seq_len - 1, pos_dim)
             key_padding_mask: a bool tensor of shape (batch_size, seq_len).  Positions that
                are True in this mask will be ignored as sources in the attention weighting.
@@ -1488,7 +1555,8 @@ class RelPositionMultiheadAttentionWeights(nn.Module):
            a tensor of attention weights, of shape (hum_heads, batch_size, seq_len, seq_len)
            interpreted as (hum_heads, batch_size, tgt_seq_len, src_seq_len).
         """
-        x = self.in_proj(x)
+        x = self.in_proj_a(x)
+        y = self.in_proj_b(y)
         query_head_dim = self.query_head_dim
         pos_head_dim = self.pos_head_dim
         num_heads = self.num_heads
@@ -1499,7 +1567,7 @@ class RelPositionMultiheadAttentionWeights(nn.Module):
 
         # self-attention
         q = x[...,0:query_dim]
-        k = x[...,query_dim:2*query_dim]
+        k = y[...,query_dim:2*query_dim]
         # p is the position-encoding query
         p = x[...,2*query_dim:]
         assert p.shape[-1] == num_heads * pos_head_dim
@@ -1520,7 +1588,7 @@ class RelPositionMultiheadAttentionWeights(nn.Module):
         attn_scores = torch.matmul(q, k)
 
         use_pos_scores = False
-        if torch.jit.is_scripting() or torch.jit.is_tracing():
+        if torch.jit.is_scripting():
             # We can't put random.random() in the same line
             use_pos_scores = True
         elif not self.training or random.random() >= float(self.pos_emb_skip_rate):
@@ -1538,26 +1606,16 @@ class RelPositionMultiheadAttentionWeights(nn.Module):
             # the following .as_strided() expression converts the last axis of pos_scores from relative
             # to absolute position.  I don't know whether I might have got the time-offsets backwards or
             # not, but let this code define which way round it is supposed to be.
-            if torch.jit.is_tracing():
-                (num_heads, batch_size, time1, n) = pos_scores.shape
-                rows = torch.arange(start=time1 - 1, end=-1, step=-1)
-                cols = torch.arange(seq_len)
-                rows = rows.repeat(batch_size * num_heads).unsqueeze(-1)
-                indexes = rows + cols
-                pos_scores = pos_scores.reshape(-1, n)
-                pos_scores = torch.gather(pos_scores, dim=1, index=indexes)
-                pos_scores = pos_scores.reshape(num_heads, batch_size, time1, seq_len)
-            else:
-                pos_scores = pos_scores.as_strided((num_heads, batch_size, seq_len, seq_len),
-                                                   (pos_scores.stride(0),
-                                                    pos_scores.stride(1),
-                                                    pos_scores.stride(2)-pos_scores.stride(3),
-                                                    pos_scores.stride(3)),
-                                                   storage_offset=pos_scores.stride(3) * (seq_len - 1))
+            pos_scores = pos_scores.as_strided((num_heads, batch_size, seq_len, seq_len),
+                                               (pos_scores.stride(0),
+                                                pos_scores.stride(1),
+                                                pos_scores.stride(2)-pos_scores.stride(3),
+                                                pos_scores.stride(3)),
+                                               storage_offset=pos_scores.stride(3) * (seq_len - 1))
 
             attn_scores = attn_scores + pos_scores
 
-        if torch.jit.is_scripting() or torch.jit.is_tracing():
+        if torch.jit.is_scripting():
             pass
         elif self.training and random.random() < 0.1:
             # This is a harder way of limiting the attention scores to not be
@@ -1600,7 +1658,7 @@ class RelPositionMultiheadAttentionWeights(nn.Module):
         # half-precision output for backprop purposes.
         attn_weights = softmax(attn_scores, dim=-1)
 
-        if torch.jit.is_scripting() or torch.jit.is_tracing():
+        if torch.jit.is_scripting():
             pass
         elif random.random() < 0.001 and not self.training:
             self._print_attn_entropy(attn_weights)
@@ -1678,26 +1736,15 @@ class RelPositionMultiheadAttentionWeights(nn.Module):
         # (head, batch, time1, pos_dim) x (head, 1, pos_dim, seq_len2) -> (head, batch, time1, seq_len2)
         #  [where seq_len2 represents relative position.]
         pos_scores = torch.matmul(p, pos_emb)
-        
-        if torch.jit.is_tracing():
-            (num_heads, batch_size, time1, n) = pos_scores.shape
-            rows = torch.arange(start=time1 - 1, end=-1, step=-1)
-            cols = torch.arange(k_len)
-            rows = rows.repeat(batch_size * num_heads).unsqueeze(-1)
-            indexes = rows + cols
-            pos_scores = pos_scores.reshape(-1, n)
-            pos_scores = torch.gather(pos_scores, dim=1, index=indexes)
-            pos_scores = pos_scores.reshape(num_heads, batch_size, time1, k_len)
         # the following .as_strided() expression converts the last axis of pos_scores from relative
         # to absolute position.  I don't know whether I might have got the time-offsets backwards or
         # not, but let this code define which way round it is supposed to be.
-        else:
-            pos_scores = pos_scores.as_strided((num_heads, batch_size, seq_len, k_len),
-                                            (pos_scores.stride(0),
-                                                pos_scores.stride(1),
-                                                pos_scores.stride(2)-pos_scores.stride(3),
-                                                pos_scores.stride(3)),
-                                            storage_offset=pos_scores.stride(3) * (seq_len - 1))
+        pos_scores = pos_scores.as_strided((num_heads, batch_size, seq_len, k_len),
+                                           (pos_scores.stride(0),
+                                            pos_scores.stride(1),
+                                            pos_scores.stride(2)-pos_scores.stride(3),
+                                            pos_scores.stride(3)),
+                                           storage_offset=pos_scores.stride(3) * (seq_len - 1))
 
         attn_scores = attn_scores + pos_scores
 
@@ -1728,7 +1775,7 @@ class RelPositionMultiheadAttentionWeights(nn.Module):
                 logging.info(f"name={self.name}, attn_weights_entropy = {attn_weights_entropy}")
 
 
-class SelfAttention(nn.Module):
+class CrossAttention(nn.Module):
     """
     The simplest possible attention module.  This one works with already-computed attention
     weights, e.g. as computed by RelPositionMultiheadAttentionWeights.
@@ -2153,7 +2200,7 @@ class ConvolutionModule(nn.Module):
         if src_key_padding_mask is not None:
             x = x.masked_fill(src_key_padding_mask.unsqueeze(1).expand_as(x), 0.0)
 
-        if not torch.jit.is_scripting() and not torch.jit.is_tracing() and chunk_size >= 0:
+        if not torch.jit.is_scripting() and chunk_size >= 0:
             # Not support exporting a model for simulated streaming decoding
             assert self.causal, "Must initialize model with causal=True if you use chunk_size"
             x = self.depthwise_conv(x, chunk_size=chunk_size)