diff --git a/egs/grid/AVSR/audionet_ctc_asr/pretrained.py b/egs/grid/AVSR/audionet_ctc_asr/pretrained.py
index fe81ded6b..853a38066 100644
--- a/egs/grid/AVSR/audionet_ctc_asr/pretrained.py
+++ b/egs/grid/AVSR/audionet_ctc_asr/pretrained.py
@@ -209,7 +209,7 @@ def main():
     logging.info("Decoding started")
     features = fbank(waves)
-
+    # aud_padding = 480
     features_new = torch.zeros(len(features), 480, params.feature_dim).to(
         device
     )
diff --git a/egs/grid/AVSR/audionet_ctc_asr/utils.py b/egs/grid/AVSR/audionet_ctc_asr/utils.py
index dbe01bb25..03b1b4ec4 100644
--- a/egs/grid/AVSR/audionet_ctc_asr/utils.py
+++ b/egs/grid/AVSR/audionet_ctc_asr/utils.py
@@ -19,16 +19,14 @@ import torch

 def encode_supervisions(nnet_output_shape, batch):
     """
-    Encodes Lhotse's ``batch["supervisions"]`` dict into
+    Encodes the network output shape and the texts in ``batch`` into
     a pair of torch Tensor, and a list of transcription strings.

     The supervision tensor has shape ``(batch_size, 3)``.
     Its second dimension contains information about sequence index [0],
     start frames [1] and num frames [2].

-    The batch items might become re-ordered during this operation -- the
-    returned tensor and list of strings are guaranteed to be consistent with
-    each other.
+    In GRID, the start frame of each audio sample is 0.
     """

     N, T, D = nnet_output_shape
diff --git a/egs/grid/AVSR/combinenet_ctc_avsr/model.py b/egs/grid/AVSR/combinenet_ctc_avsr/model.py
index 03652a223..c0bcffa04 100644
--- a/egs/grid/AVSR/combinenet_ctc_avsr/model.py
+++ b/egs/grid/AVSR/combinenet_ctc_avsr/model.py
@@ -135,7 +135,7 @@ class CombineNet(nn.Module):
         """
         Args:
           x_v:
-            Its shape is [N, 3, H, W]
+            Its shape is [N, 3, T, H, W]
           x_a:
             Its shape is [N, C, T]
         Returns:
diff --git a/egs/grid/AVSR/combinenet_ctc_avsr/pretrained.py b/egs/grid/AVSR/combinenet_ctc_avsr/pretrained.py
index 121d0cdd7..65801471d 100644
--- a/egs/grid/AVSR/combinenet_ctc_avsr/pretrained.py
+++ b/egs/grid/AVSR/combinenet_ctc_avsr/pretrained.py
@@ -200,6 +200,8 @@ def main():
         vid.append(array)

     L, H, W, C = vid[0].shape
+    # vid_padding = 75
+    # aud_padding = 480
     features_v = torch.zeros(len(vid), 75, H, W, C).to(device)
     for i in range(len(vid)):
         length = vid[i].shape[0]
diff --git a/egs/grid/AVSR/local/cvtransforms.py b/egs/grid/AVSR/local/cvtransforms.py
index c80387850..11f47a57d 100644
--- a/egs/grid/AVSR/local/cvtransforms.py
+++ b/egs/grid/AVSR/local/cvtransforms.py
@@ -15,7 +15,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+"""
+This script contains two functions:
+HorizontalFlip, which flips the images horizontally, and
+ColorNormalize, which normalizes the images.
+Both functions are used to augment the images.
+The input to both functions is a sequence of images.
+"""

 import random
diff --git a/egs/grid/AVSR/local/dataset_audio.py b/egs/grid/AVSR/local/dataset_audio.py
index d7eba76c5..27868529f 100644
--- a/egs/grid/AVSR/local/dataset_audio.py
+++ b/egs/grid/AVSR/local/dataset_audio.py
@@ -15,7 +15,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-
+"""
+This script loads the audio data in GRID.
+The class dataset_audio makes all audio samples in a batch have the same shape.
+""" import kaldifeat import numpy as np import os @@ -28,14 +31,31 @@ from torch.utils.data import Dataset class dataset_audio(Dataset): def __init__( self, - video_path, - anno_path, - file_list, - aud_padding, - sample_rate, - feature_dim, - phase, - ): + video_path: str, + anno_path: str, + file_list: str, + aud_padding: int, + sample_rate: int, + feature_dim: int, + phase: str, + ) -> None: + """ + Args: + video_path: + The dir path of the visual data. + anno_path: + The dir path of the texts data. + file_list: + The file which listing all samples for training or testing. + aud_padding: + The padding for each audio sample. + sample_rate: + The sample rate for extracting fbank feature. + feature_dim: + The dim for fbank feature. + phase: + "train" or "test". + """ self.anno_path = anno_path self.aud_padding = aud_padding self.sample_rate = sample_rate diff --git a/egs/grid/AVSR/local/dataset_av.py b/egs/grid/AVSR/local/dataset_av.py index 5d056aef4..54e095ea0 100644 --- a/egs/grid/AVSR/local/dataset_av.py +++ b/egs/grid/AVSR/local/dataset_av.py @@ -15,7 +15,10 @@ # See the License for the specific language governing permissions and # limitations under the License. - +""" +This script is to load the pair of audio-visual data in GRID. +The class dataset_av makes each audio-visual batch data have the same shape. +""" import cv2 import kaldifeat import numpy as np @@ -40,6 +43,25 @@ class dataset_av(Dataset): sample_rate, phase, ): + """ + Args: + video_path: + The dir path of the visual data. + anno_path: + The dir path of the texts data. + file_list: + The file which listing all samples for training or testing. + feature_dim: + The dimension for fbank feature. + vid_padding: + The padding for each visual sample. + aud_padding: + The padding for each audio sample. + sample_rate: + The sample rate for extracting fbank feature. + phase: + "train" or "test". + """ self.anno_path = anno_path self.vid_pading = vid_pading self.aud_pading = aud_pading diff --git a/egs/grid/AVSR/local/dataset_visual.py b/egs/grid/AVSR/local/dataset_visual.py index 04c45ba46..f0f2f21ae 100644 --- a/egs/grid/AVSR/local/dataset_visual.py +++ b/egs/grid/AVSR/local/dataset_visual.py @@ -15,7 +15,10 @@ # See the License for the specific language governing permissions and # limitations under the License. - +""" +This script is to load the visual data in GRID. +The class dataset_visual makes each visual batch data have the same shape. +""" import cv2 import os import numpy as np @@ -27,12 +30,25 @@ from .cvtransforms import HorizontalFlip, ColorNormalize class dataset_visual(Dataset): def __init__( self, - video_path, - anno_path, - file_list, - vid_padding, - phase, + video_path: str, + anno_path: str, + file_list: str, + vid_padding: int, + phase: str, ): + """ + Args: + video_path: + The dir path of the visual data. + anno_path: + The dir path of the texts data. + file_list: + The file which listing all samples for training or testing. + vid_padding: + The padding for each visual sample. + phase: + "train" or "test". 
+ """ self.anno_path = anno_path self.vid_padding = vid_padding self.phase = phase diff --git a/egs/grid/AVSR/visualnet2_ctc_vsr/model.py b/egs/grid/AVSR/visualnet2_ctc_vsr/model.py index bf3ceadd0..b5dd5879c 100644 --- a/egs/grid/AVSR/visualnet2_ctc_vsr/model.py +++ b/egs/grid/AVSR/visualnet2_ctc_vsr/model.py @@ -116,7 +116,12 @@ class ResNet(nn.Module): class VisualNet2(nn.Module): def __init__(self, num_classes): - super(VisualNet2, self).__init__() + """ + Args: + num_classes: + The output dimension of the visualnet2 model. + """ + super().__init__() self.num_classes = num_classes self.inputDim = 512 self.conv3d = nn.Conv3d( diff --git a/egs/grid/AVSR/visualnet2_ctc_vsr/pretrained.py b/egs/grid/AVSR/visualnet2_ctc_vsr/pretrained.py index 08589db57..94eef79f7 100644 --- a/egs/grid/AVSR/visualnet2_ctc_vsr/pretrained.py +++ b/egs/grid/AVSR/visualnet2_ctc_vsr/pretrained.py @@ -180,6 +180,7 @@ def main(): vid.append(array) _, H, W, C = vid[0].shape + # vid_padding = 75 features = torch.zeros(len(vid), 75, H, W, C).to(device) for i in range(len(vid)): length = vid[i].shape[0] diff --git a/egs/grid/AVSR/visualnet_ctc_vsr/model.py b/egs/grid/AVSR/visualnet_ctc_vsr/model.py index 19ecc1fcd..6ff30ff16 100644 --- a/egs/grid/AVSR/visualnet_ctc_vsr/model.py +++ b/egs/grid/AVSR/visualnet_ctc_vsr/model.py @@ -21,8 +21,13 @@ import torch.nn as nn class VisualNet(torch.nn.Module): - def __init__(self, num_classes, dropout_p=0.1): - super(VisualNet, self).__init__() + def __init__(self, num_classes: int) -> None: + """ + Args: + num_classes: + The output dimension of the visualnet model. + """ + super().__init__() self.num_classes = num_classes self.conv1 = nn.Conv3d(3, 32, (3, 5, 5), (1, 2, 2), (1, 2, 2)) self.pool1 = nn.MaxPool3d((1, 2, 2), (1, 2, 2)) @@ -37,7 +42,7 @@ class VisualNet(torch.nn.Module): self.gru2 = nn.GRU(512, 256, 1, bidirectional=True) self.FC = nn.Linear(512, self.num_classes) - self.dropout_p = dropout_p + self.dropout_p = 0.5 self.relu = nn.ReLU(inplace=True) self.dropout = nn.Dropout(self.dropout_p)