diff --git a/egs/grid/AVSR/audionet_ctc_asr/pretrained.py b/egs/grid/AVSR/audionet_ctc_asr/pretrained.py
index fe81ded6b..853a38066 100644
--- a/egs/grid/AVSR/audionet_ctc_asr/pretrained.py
+++ b/egs/grid/AVSR/audionet_ctc_asr/pretrained.py
@@ -209,7 +209,7 @@ def main():
     logging.info("Decoding started")
     features = fbank(waves)
-
+    # aud_padding = 480
     features_new = torch.zeros(len(features), 480, params.feature_dim).to(
         device
     )
diff --git a/egs/grid/AVSR/audionet_ctc_asr/utils.py b/egs/grid/AVSR/audionet_ctc_asr/utils.py
index dbe01bb25..03b1b4ec4 100644
--- a/egs/grid/AVSR/audionet_ctc_asr/utils.py
+++ b/egs/grid/AVSR/audionet_ctc_asr/utils.py
@@ -19,16 +19,14 @@ import torch

 def encode_supervisions(nnet_output_shape, batch):
     """
-    Encodes Lhotse's ``batch["supervisions"]`` dict into
+    Encodes the network output shape and the texts in ``batch`` into
     a pair of torch Tensor, and a list of transcription strings.

     The supervision tensor has shape ``(batch_size, 3)``.
     Its second dimension contains information about sequence index [0],
     start frames [1] and num frames [2].

-    The batch items might become re-ordered during this operation -- the
-    returned tensor and list of strings are guaranteed to be consistent with
-    each other.
+    In GRID, the start frame of each audio sample is 0.
     """

     N, T, D = nnet_output_shape
diff --git a/egs/grid/AVSR/combinenet_ctc_avsr/model.py b/egs/grid/AVSR/combinenet_ctc_avsr/model.py
index 03652a223..c0bcffa04 100644
--- a/egs/grid/AVSR/combinenet_ctc_avsr/model.py
+++ b/egs/grid/AVSR/combinenet_ctc_avsr/model.py
@@ -135,7 +135,7 @@ class CombineNet(nn.Module):
         """
         Args:
           x_v:
-            Its shape is [N, 3, H, W]
+            Its shape is [N, 3, T, H, W]
           x_a:
             Its shape is [N, C, T]
         Returns:
diff --git a/egs/grid/AVSR/combinenet_ctc_avsr/pretrained.py b/egs/grid/AVSR/combinenet_ctc_avsr/pretrained.py
index 121d0cdd7..65801471d 100644
--- a/egs/grid/AVSR/combinenet_ctc_avsr/pretrained.py
+++ b/egs/grid/AVSR/combinenet_ctc_avsr/pretrained.py
@@ -200,6 +200,8 @@ def main():
         vid.append(array)

     L, H, W, C = vid[0].shape
+    # vid_padding = 75
+    # aud_padding = 480
     features_v = torch.zeros(len(vid), 75, H, W, C).to(device)
     for i in range(len(vid)):
         length = vid[i].shape[0]
diff --git a/egs/grid/AVSR/local/cvtransforms.py b/egs/grid/AVSR/local/cvtransforms.py
index c80387850..11f47a57d 100644
--- a/egs/grid/AVSR/local/cvtransforms.py
+++ b/egs/grid/AVSR/local/cvtransforms.py
@@ -15,7 +15,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+"""
+This script contains two functions:
+HorizontalFlip, which flips the images horizontally, and
+ColorNormalize, which normalizes the images.
+Both functions are used to augment the images.
+The input to both functions is a sequence of images.
+"""

 import random
diff --git a/egs/grid/AVSR/local/dataset_audio.py b/egs/grid/AVSR/local/dataset_audio.py
index d7eba76c5..27868529f 100644
--- a/egs/grid/AVSR/local/dataset_audio.py
+++ b/egs/grid/AVSR/local/dataset_audio.py
@@ -15,7 +15,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-
+"""
+This script loads the audio data in GRID.
+The class dataset_audio makes all audio samples in a batch have the same shape.
+""" import kaldifeat import numpy as np import os @@ -28,14 +31,31 @@ from torch.utils.data import Dataset class dataset_audio(Dataset): def __init__( self, - video_path, - anno_path, - file_list, - aud_padding, - sample_rate, - feature_dim, - phase, - ): + video_path: str, + anno_path: str, + file_list: str, + aud_padding: int, + sample_rate: int, + feature_dim: int, + phase: str, + ) -> None: + """ + Args: + video_path: + The dir path of the visual data. + anno_path: + The dir path of the texts data. + file_list: + The file which listing all samples for training or testing. + aud_padding: + The padding for each audio sample. + sample_rate: + The sample rate for extracting fbank feature. + feature_dim: + The dim for fbank feature. + phase: + "train" or "test". + """ self.anno_path = anno_path self.aud_padding = aud_padding self.sample_rate = sample_rate diff --git a/egs/grid/AVSR/local/dataset_av.py b/egs/grid/AVSR/local/dataset_av.py index 5d056aef4..54e095ea0 100644 --- a/egs/grid/AVSR/local/dataset_av.py +++ b/egs/grid/AVSR/local/dataset_av.py @@ -15,7 +15,10 @@ # See the License for the specific language governing permissions and # limitations under the License. - +""" +This script is to load the pair of audio-visual data in GRID. +The class dataset_av makes each audio-visual batch data have the same shape. +""" import cv2 import kaldifeat import numpy as np @@ -40,6 +43,25 @@ class dataset_av(Dataset): sample_rate, phase, ): + """ + Args: + video_path: + The dir path of the visual data. + anno_path: + The dir path of the texts data. + file_list: + The file which listing all samples for training or testing. + feature_dim: + The dimension for fbank feature. + vid_padding: + The padding for each visual sample. + aud_padding: + The padding for each audio sample. + sample_rate: + The sample rate for extracting fbank feature. + phase: + "train" or "test". + """ self.anno_path = anno_path self.vid_pading = vid_pading self.aud_pading = aud_pading diff --git a/egs/grid/AVSR/local/dataset_visual.py b/egs/grid/AVSR/local/dataset_visual.py index 04c45ba46..f0f2f21ae 100644 --- a/egs/grid/AVSR/local/dataset_visual.py +++ b/egs/grid/AVSR/local/dataset_visual.py @@ -15,7 +15,10 @@ # See the License for the specific language governing permissions and # limitations under the License. - +""" +This script is to load the visual data in GRID. +The class dataset_visual makes each visual batch data have the same shape. +""" import cv2 import os import numpy as np @@ -27,12 +30,25 @@ from .cvtransforms import HorizontalFlip, ColorNormalize class dataset_visual(Dataset): def __init__( self, - video_path, - anno_path, - file_list, - vid_padding, - phase, + video_path: str, + anno_path: str, + file_list: str, + vid_padding: int, + phase: str, ): + """ + Args: + video_path: + The dir path of the visual data. + anno_path: + The dir path of the texts data. + file_list: + The file which listing all samples for training or testing. + vid_padding: + The padding for each visual sample. + phase: + "train" or "test". 
+ """ self.anno_path = anno_path self.vid_padding = vid_padding self.phase = phase diff --git a/egs/grid/AVSR/visualnet2_ctc_vsr/model.py b/egs/grid/AVSR/visualnet2_ctc_vsr/model.py index bf3ceadd0..b5dd5879c 100644 --- a/egs/grid/AVSR/visualnet2_ctc_vsr/model.py +++ b/egs/grid/AVSR/visualnet2_ctc_vsr/model.py @@ -116,7 +116,12 @@ class ResNet(nn.Module): class VisualNet2(nn.Module): def __init__(self, num_classes): - super(VisualNet2, self).__init__() + """ + Args: + num_classes: + The output dimension of the visualnet2 model. + """ + super().__init__() self.num_classes = num_classes self.inputDim = 512 self.conv3d = nn.Conv3d( diff --git a/egs/grid/AVSR/visualnet2_ctc_vsr/pretrained.py b/egs/grid/AVSR/visualnet2_ctc_vsr/pretrained.py index 08589db57..94eef79f7 100644 --- a/egs/grid/AVSR/visualnet2_ctc_vsr/pretrained.py +++ b/egs/grid/AVSR/visualnet2_ctc_vsr/pretrained.py @@ -180,6 +180,7 @@ def main(): vid.append(array) _, H, W, C = vid[0].shape + # vid_padding = 75 features = torch.zeros(len(vid), 75, H, W, C).to(device) for i in range(len(vid)): length = vid[i].shape[0] diff --git a/egs/grid/AVSR/visualnet_ctc_vsr/model.py b/egs/grid/AVSR/visualnet_ctc_vsr/model.py index 19ecc1fcd..6ff30ff16 100644 --- a/egs/grid/AVSR/visualnet_ctc_vsr/model.py +++ b/egs/grid/AVSR/visualnet_ctc_vsr/model.py @@ -21,8 +21,13 @@ import torch.nn as nn class VisualNet(torch.nn.Module): - def __init__(self, num_classes, dropout_p=0.1): - super(VisualNet, self).__init__() + def __init__(self, num_classes: int) -> None: + """ + Args: + num_classes: + The output dimension of the visualnet model. + """ + super().__init__() self.num_classes = num_classes self.conv1 = nn.Conv3d(3, 32, (3, 5, 5), (1, 2, 2), (1, 2, 2)) self.pool1 = nn.MaxPool3d((1, 2, 2), (1, 2, 2)) @@ -37,7 +42,7 @@ class VisualNet(torch.nn.Module): self.gru2 = nn.GRU(512, 256, 1, bidirectional=True) self.FC = nn.Linear(512, self.num_classes) - self.dropout_p = dropout_p + self.dropout_p = 0.5 self.relu = nn.ReLU(inplace=True) self.dropout = nn.Dropout(self.dropout_p)