Mirror of https://github.com/k2-fsa/icefall.git

Commit 7391f4febf (parent 283bd126c5): Add explanation for some functions and classes
@@ -209,7 +209,7 @@ def main():
     logging.info("Decoding started")
     features = fbank(waves)
+    # aud_padding = 480
     features_new = torch.zeros(len(features), 480, params.feature_dim).to(
         device
     )
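For context, the `features_new` buffer above acts as a zero-padded batch: each variable-length fbank feature matrix is copied into the first rows of a fixed 480-frame tensor, and the remaining rows stay zero. A minimal sketch of that pattern, assuming `features` is a list of (num_frames, feature_dim) tensors; the sizes below are made up:

import torch

# Hypothetical batch: two utterances with different frame counts (<= 480).
features = [torch.randn(437, 80), torch.randn(480, 80)]
feature_dim = 80

features_new = torch.zeros(len(features), 480, feature_dim)
for i, feat in enumerate(features):
    # Copy the real frames; rows beyond feat.shape[0] stay zero (padding).
    features_new[i, : feat.shape[0]] = feat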
@@ -19,16 +19,14 @@ import torch

 def encode_supervisions(nnet_output_shape, batch):
     """
-    Encodes Lhotse's ``batch["supervisions"]`` dict into
+    Encodes the network output and the texts into
     a pair of torch Tensor, and a list of transcription strings.

     The supervision tensor has shape ``(batch_size, 3)``.
     Its second dimension contains information about sequence index [0],
     start frames [1] and num frames [2].

-    The batch items might become re-ordered during this operation -- the
-    returned tensor and list of strings are guaranteed to be consistent with
-    each other.
+    In GRID, the start frame of each audio sample is 0.
     """
     N, T, D = nnet_output_shape

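The docstring above describes a supervision tensor of shape (batch_size, 3), one row of (sequence_index, start_frame, num_frames) per utterance. A hedged sketch of how such a tensor could be assembled for GRID, where every start frame is 0; the frame counts here are hypothetical:

import torch

# Hypothetical per-utterance frame counts after the network (the T axis).
num_frames = [75, 75, 68]

# One row per sequence: (sequence_index, start_frame, num_frames).
# In GRID, the start frame of each audio sample is 0.
supervisions = torch.tensor(
    [[i, 0, n] for i, n in enumerate(num_frames)],
    dtype=torch.int32,
)
print(supervisions.shape)  # torch.Size([3, 3])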
@@ -135,7 +135,7 @@ class CombineNet(nn.Module):
         """
         Args:
           x_v:
-            Its shape is [N, 3, H, W]
+            Its shape is [N, 3, T, H, W]
           x_a:
             Its shape is [N, C, T]
         Returns:
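The corrected shape [N, 3, T, H, W] is the standard layout for 3-D convolutions over video: batch, RGB channels, frames, height, width. A rough shape sanity check with dummy tensors; all sizes and the commented forward call are illustrative assumptions, not values from the recipe:

import torch

N, T, H, W = 2, 75, 64, 128  # hypothetical batch/frame/image sizes
C = 80                       # hypothetical fbank feature dimension

x_v = torch.randn(N, 3, T, H, W)  # video: RGB frames stacked along T
x_a = torch.randn(N, C, T)        # audio: features with time last

# y = CombineNet(...)(x_v, x_a)   # forward pass per the docstring above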
@@ -200,6 +200,8 @@ def main():
         vid.append(array)

     L, H, W, C = vid[0].shape
+    # vid_padding = 75
+    # aud_padding = 480
     features_v = torch.zeros(len(vid), 75, H, W, C).to(device)
     for i in range(len(vid)):
         length = vid[i].shape[0]
@@ -15,7 +15,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+"""
+This script contains two functions:
+HorizontalFlip, which flips the images, and
+ColorNormalize, which normalizes the images.
+Both functions are used to augment the images.
+
+The input to both functions is a sequence of images.
+"""
 import random

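To make the new docstring concrete, here is a hedged sketch of what HorizontalFlip and ColorNormalize typically look like in lipreading recipes; the flip probability and the normalization statistics are assumptions, not values taken from this file:

import random
import numpy as np

def HorizontalFlip(batch_img: np.ndarray, p: float = 0.5) -> np.ndarray:
    # Flip the whole sequence left-right with probability p, so all
    # frames of one sample are flipped consistently.
    if random.random() < p:
        batch_img = batch_img[:, :, ::-1, :]  # (T, H, W, C): flip the W axis
    return batch_img

def ColorNormalize(batch_img: np.ndarray) -> np.ndarray:
    # Scale pixels to [0, 1], then normalize with (assumed) dataset statistics.
    mean, std = 0.413621, 0.1700239
    return (batch_img / 255.0 - mean) / std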
@@ -15,7 +15,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+"""
+This script loads the audio data in GRID.
+The class dataset_audio makes each audio batch have the same shape.
+"""
 import kaldifeat
 import numpy as np
 import os
@@ -28,14 +31,31 @@ from torch.utils.data import Dataset
 class dataset_audio(Dataset):
     def __init__(
         self,
-        video_path,
-        anno_path,
-        file_list,
-        aud_padding,
-        sample_rate,
-        feature_dim,
-        phase,
-    ):
+        video_path: str,
+        anno_path: str,
+        file_list: str,
+        aud_padding: int,
+        sample_rate: int,
+        feature_dim: int,
+        phase: str,
+    ) -> None:
+        """
+        Args:
+          video_path:
+            The directory containing the visual data.
+          anno_path:
+            The directory containing the text data.
+          file_list:
+            The file listing all samples for training or testing.
+          aud_padding:
+            The padding length for each audio sample.
+          sample_rate:
+            The sample rate for extracting fbank features.
+          feature_dim:
+            The dimension of the fbank features.
+          phase:
+            "train" or "test".
+        """
         self.anno_path = anno_path
         self.aud_padding = aud_padding
         self.sample_rate = sample_rate
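Given the annotated signature, constructing the dataset might look as follows; the paths and most argument values are placeholders, and wrapping it in a DataLoader is an assumption about how the recipe consumes it:

from torch.utils.data import DataLoader

# Placeholder paths; substitute the real GRID locations.
dataset = dataset_audio(
    video_path="/path/to/GRID/video",
    anno_path="/path/to/GRID/align",
    file_list="train_list.txt",
    aud_padding=480,     # matches the aud_padding comment above
    sample_rate=16000,   # assumed sampling rate
    feature_dim=80,      # assumed fbank dimension
    phase="train",
)
loader = DataLoader(dataset, batch_size=8, shuffle=True)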
@@ -15,7 +15,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+"""
+This script loads the paired audio-visual data in GRID.
+The class dataset_av makes each audio-visual batch have the same shape.
+"""
 import cv2
 import kaldifeat
 import numpy as np
@@ -40,6 +43,25 @@ class dataset_av(Dataset):
         sample_rate,
         phase,
     ):
+        """
+        Args:
+          video_path:
+            The directory containing the visual data.
+          anno_path:
+            The directory containing the text data.
+          file_list:
+            The file listing all samples for training or testing.
+          feature_dim:
+            The dimension of the fbank features.
+          vid_padding:
+            The padding length for each visual sample.
+          aud_padding:
+            The padding length for each audio sample.
+          sample_rate:
+            The sample rate for extracting fbank features.
+          phase:
+            "train" or "test".
+        """
         self.anno_path = anno_path
         self.vid_pading = vid_pading
         self.aud_pading = aud_pading
@@ -15,7 +15,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+"""
+This script loads the visual data in GRID.
+The class dataset_visual makes each visual batch have the same shape.
+"""
 import cv2
 import os
 import numpy as np
@@ -27,12 +30,25 @@ from .cvtransforms import HorizontalFlip, ColorNormalize
 class dataset_visual(Dataset):
     def __init__(
         self,
-        video_path,
-        anno_path,
-        file_list,
-        vid_padding,
-        phase,
+        video_path: str,
+        anno_path: str,
+        file_list: str,
+        vid_padding: int,
+        phase: str,
     ):
+        """
+        Args:
+          video_path:
+            The directory containing the visual data.
+          anno_path:
+            The directory containing the text data.
+          file_list:
+            The file listing all samples for training or testing.
+          vid_padding:
+            The padding length for each visual sample.
+          phase:
+            "train" or "test".
+        """
         self.anno_path = anno_path
         self.vid_padding = vid_padding
         self.phase = phase
|
@ -116,7 +116,12 @@ class ResNet(nn.Module):
|
|||||||
|
|
||||||
class VisualNet2(nn.Module):
|
class VisualNet2(nn.Module):
|
||||||
def __init__(self, num_classes):
|
def __init__(self, num_classes):
|
||||||
super(VisualNet2, self).__init__()
|
"""
|
||||||
|
Args:
|
||||||
|
num_classes:
|
||||||
|
The output dimension of the visualnet2 model.
|
||||||
|
"""
|
||||||
|
super().__init__()
|
||||||
self.num_classes = num_classes
|
self.num_classes = num_classes
|
||||||
self.inputDim = 512
|
self.inputDim = 512
|
||||||
self.conv3d = nn.Conv3d(
|
self.conv3d = nn.Conv3d(
|
||||||
|
@@ -180,6 +180,7 @@ def main():
         vid.append(array)

     _, H, W, C = vid[0].shape
+    # vid_padding = 75
     features = torch.zeros(len(vid), 75, H, W, C).to(device)
     for i in range(len(vid)):
         length = vid[i].shape[0]
@@ -21,8 +21,13 @@ import torch.nn as nn


 class VisualNet(torch.nn.Module):
-    def __init__(self, num_classes, dropout_p=0.1):
-        super(VisualNet, self).__init__()
+    def __init__(self, num_classes: int) -> None:
+        """
+        Args:
+          num_classes:
+            The output dimension of the VisualNet model.
+        """
+        super().__init__()
         self.num_classes = num_classes
         self.conv1 = nn.Conv3d(3, 32, (3, 5, 5), (1, 2, 2), (1, 2, 2))
         self.pool1 = nn.MaxPool3d((1, 2, 2), (1, 2, 2))
@@ -37,7 +42,7 @@ class VisualNet(torch.nn.Module):
         self.gru2 = nn.GRU(512, 256, 1, bidirectional=True)

         self.FC = nn.Linear(512, self.num_classes)
-        self.dropout_p = dropout_p
+        self.dropout_p = 0.5

         self.relu = nn.ReLU(inplace=True)
         self.dropout = nn.Dropout(self.dropout_p)
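Hard-coding self.dropout_p = 0.5 removes the old constructor knob, but the rate only matters in training mode, since nn.Dropout is the identity under model.eval(). A small self-contained illustration (not from the recipe):

import torch
import torch.nn as nn

drop = nn.Dropout(0.5)
x = torch.ones(4)

drop.train()
print(drop(x))  # roughly half the entries zeroed, the rest scaled by 2.0

drop.eval()
print(drop(x))  # identity: tensor([1., 1., 1., 1.])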