diff --git a/egs/librispeech/ASR/conformer_ctc/conformer.py b/egs/librispeech/ASR/conformer_ctc/conformer.py
index 75beeb529..50a46ae28 100644
--- a/egs/librispeech/ASR/conformer_ctc/conformer.py
+++ b/egs/librispeech/ASR/conformer_ctc/conformer.py
@@ -23,6 +23,7 @@ import torch
 from torch import Tensor, nn
 from conv1d_abs_attention import Conv1dAbs
 from transformer import Supervisions, Transformer, encoder_padding_mask
+import logging
 
 
 class Conformer(Transformer):
@@ -157,6 +158,7 @@ class ConformerEncoderLayer(nn.Module):
         normalize_before: bool = True,
     ) -> None:
         super(ConformerEncoderLayer, self).__init__()
+
         self.self_attn = RelPositionMultiheadAttention(
             d_model, nhead, dropout=0.0
         )
@@ -180,6 +182,7 @@ class ConformerEncoderLayer(nn.Module):
             d_model
         )  # for the macaron style FNN module
         self.norm_ff = nn.LayerNorm(d_model)  # for the FNN module
+
         self.norm_mha = nn.LayerNorm(d_model)  # for the MHA module
 
         # define layernorm for conv1d_abs
@@ -198,16 +201,13 @@ class ConformerEncoderLayer(nn.Module):
         self.normalize_before = normalize_before
 
         self.kernel_size = 31
-        self.padding = int((self.kernel_size - 1) / 2)
-        self.in_conv1d_channels = 768
-        self.out_conv1d_channels = 768
+        self.padding = int((self.kernel_size-1)/2)
+        self.in_conv1d_channels = 768
+        self.out_conv1d_channels = 768
+        # kernel size=21, self.conv1d_channels=768
+        # kernel size=5, self.conv1d_channels=1024
         self.linear1 = nn.Linear(512, self.in_conv1d_channels)
-        self.conv1d_abs = Conv1dAbs(
-            self.in_conv1d_channels,
-            self.out_conv1d_channels,
-            kernel_size=self.kernel_size,
-            padding=self.padding,
-        )
+        self.conv1d_abs = Conv1dAbs(self.in_conv1d_channels, self.out_conv1d_channels, kernel_size=self.kernel_size, padding=self.padding, padding_mode="replicate")
         self.linear2 = nn.Linear(self.out_conv1d_channels, 512)
 
     def forward(
@@ -233,7 +233,7 @@ class ConformerEncoderLayer(nn.Module):
             src_key_padding_mask: (N, S).
             S is the source sequence length, N is the batch size, E is the feature number
         """
-
+        # macaron style feed forward module
         residual = src
         if self.normalize_before:
             src = self.norm_ff_macaron(src)
@@ -244,7 +244,7 @@ class ConformerEncoderLayer(nn.Module):
         if not self.normalize_before:
             src = self.norm_ff_macaron(src)
 
-        # multi-head attention module
+        # multi-head attention
         residual = src
         if self.normalize_before:
             src = self.norm_mha(src)
@@ -260,20 +260,21 @@ class ConformerEncoderLayer(nn.Module):
         if not self.normalize_before:
             src = self.norm_mha(src)
 
-        # conv1dabs modified attention module
+        # conv1dabs modified attention
         residual = src
         if self.normalize_before:
             src = self.norm_conv_abs(src)
-
-        # src = self.linear1(src * 0.25)
-        src = 0.01 * self.linear1(src * 0.25)
+
+        #src = self.linear1(src*0.25)
+        src = 0.01*self.linear1(src*0.25)
         src = torch.exp(src.clamp(min=-75, max=75))
-        src = src.permute(1, 2, 0)
+        src = src.permute(1, 2, 0)  # (B, D, T)
+        src = src.permute(0, 2, 1)  # (B, T, D)
         src = self.conv1d_abs(src) / self.kernel_size
         src = src.permute(2, 0, 1)
-        src = torch.log(src.clamp(min=1e-20))
+        src = torch.log(0.01 + src.clamp(min=1e-20))
         src = self.linear2(src)
-        src = 0.25 * self.layernorm(src)
+        src = 0.25*self.layernorm(src)
         src = residual + self.dropout(src)
 
         if not self.normalize_before:
@@ -415,8 +416,8 @@ class RelPositionalEncoding(torch.nn.Module):
         pe_negative[:, 1::2] = torch.cos(-1 * position * div_term)
 
         # Reserve the order of positive indices and concat both positive and
-        # negative indices. This is used to support the shifting trick as in "T
-        # ransformer-XL:Attentive Language Models Beyond a Fixed-Length Context"
+        # negative indices. This is used to support the shifting trick
+        # as in "Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context"
         pe_positive = torch.flip(pe_positive, [0]).unsqueeze(0)
         pe_negative = pe_negative[1:].unsqueeze(0)
         pe = torch.cat([pe_positive, pe_negative], dim=1)
@@ -443,19 +444,14 @@ class RelPositionalEncoding(torch.nn.Module):
         ]
         return self.dropout(x), self.dropout(pos_emb)
 
-
 class RelPositionMultiheadAttention(nn.Module):
     r"""Multi-Head Attention layer with relative position encoding
-
     See reference: "Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context"
-
     Args:
         embed_dim: total dimension of the model.
         num_heads: parallel attention heads.
         dropout: a Dropout layer on attn_output_weights. Default: 0.0.
-
     Examples::
-
         >>> rel_pos_multihead_attn = RelPositionMultiheadAttention(embed_dim, num_heads)
         >>> attn_output, attn_output_weights = multihead_attn(query, key, value, pos_emb)
     """
@@ -517,7 +513,6 @@ class RelPositionMultiheadAttention(nn.Module):
             need_weights: output attn_output_weights.
            attn_mask: 2D or 3D mask that prevents attention to certain positions. A 2D mask will be broadcasted for all
                the batches while a 3D mask allows to specify a different mask for the entries of each batch.
-
        Shape:
            - Inputs:
            - query: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, E is
@@ -539,7 +534,6 @@ class RelPositionMultiheadAttention(nn.Module):
            while the zero positions will be unchanged. If a BoolTensor is provided, positions with ``True``
            is not allowed to attend while ``False`` values will be unchanged. If a FloatTensor
            is provided, it will be added to the attention weight.
-
            - Outputs:
            - attn_output: :math:`(L, N, E)` where L is the target sequence length, N is the batch size,
            E is the embedding dimension.
@@ -566,11 +560,9 @@ class RelPositionMultiheadAttention(nn.Module):
 
     def rel_shift(self, x: Tensor) -> Tensor:
         """Compute relative positional encoding.
-
         Args:
            x: Input tensor (batch, head, time1, 2*time1-1).
                time1 means the length of query vector.
-
         Returns:
            Tensor: tensor of shape (batch, head, time1, time2)
          (note: time2 has the same value as time1, but it is for
@@ -623,7 +615,6 @@ class RelPositionMultiheadAttention(nn.Module):
            need_weights: output attn_output_weights.
            attn_mask: 2D or 3D mask that prevents attention to certain positions. A 2D mask will be broadcasted for all
                the batches while a 3D mask allows to specify a different mask for the entries of each batch.
-
        Shape:
            Inputs:
            - query: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, E is
@@ -645,7 +636,6 @@ class RelPositionMultiheadAttention(nn.Module):
            while the zero positions will be unchanged. If a BoolTensor is provided, positions with ``True``
            are not allowed to attend while ``False`` values will be unchanged. If a FloatTensor
            is provided, it will be added to the attention weight.
-
        Outputs:
            - attn_output: :math:`(L, N, E)` where L is the target sequence length, N is the batch size,
            E is the embedding dimension.
@@ -865,7 +855,6 @@ class RelPositionMultiheadAttention(nn.Module):
         else:
             return attn_output, None
 
-
 class ConvolutionModule(nn.Module):
     """ConvolutionModule in Conformer model.
     Modified from
diff --git a/egs/librispeech/ASR/conformer_ctc/conv1d_abs_attention.py b/egs/librispeech/ASR/conformer_ctc/conv1d_abs_attention.py
index 031868628..84ff0e9fb 100644
--- a/egs/librispeech/ASR/conformer_ctc/conv1d_abs_attention.py
+++ b/egs/librispeech/ASR/conformer_ctc/conv1d_abs_attention.py
@@ -153,6 +153,19 @@ class _ConvNd(Module):
         if not hasattr(self, "padding_mode"):
             self.padding_mode = "zeros"
 
+import torch
+import torch.nn as nn
+m = nn.Tanh()
+
+def padding(input, padding_length):
+    # input shape : (B, T, D)
+    device = input.device
+    B, T, D = input.shape
+    src = torch.ones(B, T + 2*padding_length[0], D).to(device)
+    src[:, padding_length[0]:T+padding_length[0], :] = input
+    src = src.permute(0, 2, 1)  # src shape: (B, D, T')
+
+    return src
 
 class Conv1dAbs(_ConvNd):
     def __init__(
@@ -188,13 +201,14 @@ class Conv1dAbs(_ConvNd):
     def forward(self, input: Tensor) -> Tensor:
         if self.padding_mode != "zeros":
             return F.conv1d(
-                F.pad(
-                    input,
-                    self._reversed_padding_repeated_twice,
-                    mode=self.padding_mode,
-                ),
-                torch.abs(self.weight),
-                torch.abs(self.bias),
+                # F.pad(
+                #     input,
+                #     self._reversed_padding_repeated_twice,
+                #     mode=self.padding_mode,
+                # ),
+                padding(input, self.padding),
+                torch.exp(self.weight),
+                torch.exp(self.bias),
                 self.stride,
                 _single(0),
                 self.dilation,
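For readers following the new conv1dabs branch in ConformerEncoderLayer.forward, here is a minimal, self-contained sketch of the same data flow: scale, exponentiate, run a positive-weight 1-D convolution over ones-padded features, divide by the kernel size, take the log, and project back. This is only an illustration under stated assumptions, not the module above: it uses a plain nn.Conv1d whose weights and bias are exponentiated at call time in place of the Conv1dAbs class, the ones_pad helper is a stand-in for the custom padding() added to conv1d_abs_attention.py, and the tensor sizes are small placeholders rather than the 512/768/31 values in the diff.

import torch
import torch.nn as nn
import torch.nn.functional as F

# Illustrative sizes only; the diff uses d_model=512, conv channels=768, kernel_size=31.
T, B, D = 50, 2, 16          # (time, batch, feature) layout seen by the encoder layer
C, K = 24, 5                 # conv channels and kernel size
PAD = (K - 1) // 2

linear1 = nn.Linear(D, C)
conv = nn.Conv1d(C, C, kernel_size=K)   # weights/bias exponentiated at call time below
linear2 = nn.Linear(C, D)

def ones_pad(x: torch.Tensor, pad: int) -> torch.Tensor:
    # Pad (B, T, C) with ones along time and return (B, C, T + 2*pad),
    # mirroring the custom padding() helper in the diff.
    b, t, c = x.shape
    out = torch.ones(b, t + 2 * pad, c, dtype=x.dtype, device=x.device)
    out[:, pad:pad + t, :] = x
    return out.permute(0, 2, 1)

src = torch.randn(T, B, D)                         # encoder-layer input, (T, B, D)
h = 0.01 * linear1(src * 0.25)
h = torch.exp(h.clamp(min=-75, max=75))            # strictly positive features
h = h.permute(1, 0, 2)                             # (B, T, C)
h = ones_pad(h, PAD)                               # (B, C, T + 2*PAD)
h = F.conv1d(h, torch.exp(conv.weight), torch.exp(conv.bias)) / K
h = h.permute(2, 0, 1)                             # back to (T, B, C)
h = torch.log(0.01 + h.clamp(min=1e-20))
out = linear2(h)
print(out.shape)                                   # torch.Size([50, 2, 16])

Because both the features and the exponentiated kernel are positive, the convolution is a strictly positive local weighted sum in the exp domain; dividing by the kernel size keeps its scale roughly independent of the window length, and the clamp plus log map the result back before the final projection.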