Add explanations for some functions and classes

This commit is contained in:
Mingshuang Luo 2021-12-30 10:24:47 +08:00
parent 283bd126c5
commit 7391f4febf
11 changed files with 102 additions and 26 deletions

View File

@ -209,7 +209,7 @@ def main():
logging.info("Decoding started")
features = fbank(waves)
# aud_padding = 480
features_new = torch.zeros(len(features), 480, params.feature_dim).to(
device
)
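For reference, a minimal sketch of the zero-padding step that typically follows the buffer created above, assuming `features` is a list of fbank matrices of shape `[num_frames, feature_dim]` as returned by kaldifeat:

```python
# Sketch only: copy each fbank matrix into the first `length` rows of the
# fixed 480-frame buffer; the remaining rows stay zero.
for i in range(len(features)):
    length = features[i].shape[0]
    features_new[i, :length, :] = features[i]
```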

View File

@ -19,16 +19,14 @@ import torch
def encode_supervisions(nnet_output_shape, batch):
"""
Encodes Lhotse's ``batch["supervisions"]`` dict into
Encodes the network output and the texts into
a pair of torch Tensor, and a list of transcription strings.
The supervision tensor has shape ``(batch_size, 3)``.
Its second dimension contains information about sequence index [0],
start frames [1] and num frames [2].
The batch items might become re-ordered during this operation -- the
returned tensor and list of strings are guaranteed to be consistent with
each other.
In GRID, the start frame of each audio sample is 0.
"""
N, T, D = nnet_output_shape
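To make the layout concrete, here is a minimal sketch of the supervision tensor described in the docstring, assuming GRID-style batches where every sample starts at frame 0 (the frame counts are made-up values):

```python
import torch

batch_size = 3
num_frames = torch.tensor([75, 73, 70])  # hypothetical per-sample frame counts
supervision_segments = torch.stack(
    (
        torch.arange(batch_size),                   # [0] sequence index
        torch.zeros(batch_size, dtype=torch.long),  # [1] start frame, always 0 in GRID
        num_frames,                                 # [2] number of frames
    ),
    dim=1,
)
print(supervision_segments.shape)  # torch.Size([3, 3])
```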

View File

@ -135,7 +135,7 @@ class CombineNet(nn.Module):
"""
Args:
x_v:
Its shape is [N, 3, H, W]
Its shape is [N, 3, T, H, W]
x_a:
Its shape is [N, C, T]
Returns:
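For illustration, dummy inputs matching the documented shapes; the concrete sizes are placeholders, and the forward call is commented out because the constructor arguments of CombineNet are not shown in this diff:

```python
import torch

N, T, H, W, C = 2, 75, 64, 128, 80
x_v = torch.randn(N, 3, T, H, W)  # video: batch, RGB channels, frames, height, width
x_a = torch.randn(N, C, T)        # audio: batch, feature dim, frames
# out = combine_net(x_v, x_a)     # signature assumed from the docstring above
```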

View File

@ -200,6 +200,8 @@ def main():
vid.append(array)
L, H, W, C = vid[0].shape
# vid_padding = 75
# aud_padding = 480
features_v = torch.zeros(len(vid), 75, H, W, C).to(device)
for i in range(len(vid)):
length = vid[i].shape[0]
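The rest of the loop is not shown in this diff; a hedged sketch of how the per-sample copy into the 75-frame buffer is typically completed, assuming each `vid[i]` is a NumPy array of shape `[length, H, W, C]`:

```python
for i in range(len(vid)):
    length = vid[i].shape[0]
    # Copy the available frames; frames beyond `length` remain zero.
    features_v[i, :length] = torch.from_numpy(vid[i]).to(device)
```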

View File

@ -15,7 +15,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This script contains two functions:
HorizontalFlip flips the images horizontally, and
ColorNormalize normalizes the images.
Both functions are used to augment the images.
The input to each function is a sequence of images.
"""
import random
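The function bodies are not part of this diff; a hedged sketch of what these two augmentation helpers commonly look like for an input of shape `[T, H, W, C]` (the flip probability and the scaling are assumptions, not necessarily what this recipe uses):

```python
import random

def HorizontalFlip(batch_img, p=0.5):
    # Flip the whole image sequence left-right with probability p, so that
    # all frames of one sample stay consistent with each other.
    if random.random() < p:
        batch_img = batch_img[:, :, ::-1, ...]
    return batch_img

def ColorNormalize(batch_img):
    # Scale pixel values from [0, 255] to [0, 1].
    return batch_img / 255.0
```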

View File

@ -15,7 +15,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This script loads the audio data in GRID.
The class dataset_audio pads each audio sample so that all samples in a batch have the same shape.
"""
import kaldifeat
import numpy as np
import os
@ -28,14 +31,31 @@ from torch.utils.data import Dataset
class dataset_audio(Dataset):
def __init__(
self,
video_path,
anno_path,
file_list,
aud_padding,
sample_rate,
feature_dim,
phase,
):
video_path: str,
anno_path: str,
file_list: str,
aud_padding: int,
sample_rate: int,
feature_dim: int,
phase: str,
) -> None:
"""
Args:
video_path:
The path to the directory containing the visual data.
anno_path:
The path to the directory containing the text data.
file_list:
The file listing all samples for training or testing.
aud_padding:
The number of frames to which each audio sample is padded.
sample_rate:
The sample rate used when extracting fbank features.
feature_dim:
The dimension of the fbank features.
phase:
"train" or "test".
"""
self.anno_path = anno_path
self.aud_padding = aud_padding
self.sample_rate = sample_rate
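A hypothetical usage sketch; all paths, the list file, and the concrete numbers below are placeholders (aud_padding=480 follows the comment used elsewhere in this commit):

```python
from torch.utils.data import DataLoader

train_set = dataset_audio(
    video_path="download/GRID/video",   # placeholder path
    anno_path="download/GRID/align",    # placeholder path
    file_list="data/train_list.txt",    # placeholder list file
    aud_padding=480,
    sample_rate=16000,                  # placeholder sample rate
    feature_dim=80,                     # placeholder feature dimension
    phase="train",
)
train_loader = DataLoader(train_set, batch_size=8, shuffle=True)
```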

View File

@ -15,7 +15,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This script loads the paired audio-visual data in GRID.
The class dataset_av pads each audio-visual sample so that all samples in a batch have the same shape.
"""
import cv2
import kaldifeat
import numpy as np
@ -40,6 +43,25 @@ class dataset_av(Dataset):
sample_rate,
phase,
):
"""
Args:
video_path:
The path to the directory containing the visual data.
anno_path:
The path to the directory containing the text data.
file_list:
The file listing all samples for training or testing.
feature_dim:
The dimension of the fbank features.
vid_padding:
The number of frames to which each visual sample is padded.
aud_padding:
The number of frames to which each audio sample is padded.
sample_rate:
The sample rate used when extracting fbank features.
phase:
"train" or "test".
"""
self.anno_path = anno_path
self.vid_pading = vid_pading
self.aud_pading = aud_pading

View File

@ -15,7 +15,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This script loads the visual data in GRID.
The class dataset_visual pads each visual sample so that all samples in a batch have the same shape.
"""
import cv2
import os
import numpy as np
@ -27,12 +30,25 @@ from .cvtransforms import HorizontalFlip, ColorNormalize
class dataset_visual(Dataset):
def __init__(
self,
video_path,
anno_path,
file_list,
vid_padding,
phase,
video_path: str,
anno_path: str,
file_list: str,
vid_padding: int,
phase: str,
):
"""
Args:
video_path:
The path to the directory containing the visual data.
anno_path:
The path to the directory containing the text data.
file_list:
The file listing all samples for training or testing.
vid_padding:
The number of frames to which each visual sample is padded.
phase:
"train" or "test".
"""
self.anno_path = anno_path
self.vid_padding = vid_padding
self.phase = phase
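Similarly, a hypothetical usage sketch for the visual-only dataset; the paths and list file are placeholders, and vid_padding=75 follows the comment used elsewhere in this commit:

```python
from torch.utils.data import DataLoader

test_set = dataset_visual(
    video_path="download/GRID/video",   # placeholder path
    anno_path="download/GRID/align",    # placeholder path
    file_list="data/test_list.txt",     # placeholder list file
    vid_padding=75,
    phase="test",
)
test_loader = DataLoader(test_set, batch_size=4, shuffle=False)
```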

View File

@ -116,7 +116,12 @@ class ResNet(nn.Module):
class VisualNet2(nn.Module):
def __init__(self, num_classes):
super(VisualNet2, self).__init__()
"""
Args:
num_classes:
The output dimension of the VisualNet2 model.
"""
super().__init__()
self.num_classes = num_classes
self.inputDim = 512
self.conv3d = nn.Conv3d(

View File

@ -180,6 +180,7 @@ def main():
vid.append(array)
_, H, W, C = vid[0].shape
# vid_padding = 75
features = torch.zeros(len(vid), 75, H, W, C).to(device)
for i in range(len(vid)):
length = vid[i].shape[0]

View File

@ -21,8 +21,13 @@ import torch.nn as nn
class VisualNet(torch.nn.Module):
def __init__(self, num_classes, dropout_p=0.1):
super(VisualNet, self).__init__()
def __init__(self, num_classes: int) -> None:
"""
Args:
num_classes:
The output dimension of the VisualNet model.
"""
super().__init__()
self.num_classes = num_classes
self.conv1 = nn.Conv3d(3, 32, (3, 5, 5), (1, 2, 2), (1, 2, 2))
self.pool1 = nn.MaxPool3d((1, 2, 2), (1, 2, 2))
@ -37,7 +42,7 @@ class VisualNet(torch.nn.Module):
self.gru2 = nn.GRU(512, 256, 1, bidirectional=True)
self.FC = nn.Linear(512, self.num_classes)
self.dropout_p = dropout_p
self.dropout_p = 0.5
self.relu = nn.ReLU(inplace=True)
self.dropout = nn.Dropout(self.dropout_p)
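A hypothetical instantiation under the new signature; num_classes=30 and the dummy input sizes are placeholders, with the video laid out as [N, 3, T, H, W] as documented for the combined model above:

```python
import torch

model = VisualNet(num_classes=30)
x = torch.randn(2, 3, 75, 64, 128)  # [N, 3, T, H, W]
# y = model(x)  # forward pass not shown in this diff
```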