Add explanations for some functions and classes

This commit is contained in:
Mingshuang Luo 2021-12-30 10:24:47 +08:00
parent 283bd126c5
commit 7391f4febf
11 changed files with 102 additions and 26 deletions

View File

@ -209,7 +209,7 @@ def main():
logging.info("Decoding started")
features = fbank(waves)
# aud_padding = 480
features_new = torch.zeros(len(features), 480, params.feature_dim).to(
device
)
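For reference, a minimal sketch of the zero-padding step that typically follows the buffer created above, assuming `features` is a list of fbank matrices of shape `[num_frames, feature_dim]` as returned by kaldifeat:

```python
# Sketch only: copy each fbank matrix into the first `length` rows of the
# fixed 480-frame buffer; the remaining rows stay zero.
for i in range(len(features)):
    length = features[i].shape[0]
    features_new[i, :length, :] = features[i]
```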

View File

@ -19,16 +19,14 @@ import torch
def encode_supervisions(nnet_output_shape, batch):
"""
Encodes Lhotse's ``batch["supervisions"]`` dict into
Encodes the network output and the texts into
a pair of torch Tensor, and a list of transcription strings.
The supervision tensor has shape ``(batch_size, 3)``.
Its second dimension contains information about sequence index [0],
start frames [1] and num frames [2].
The batch items might become re-ordered during this operation -- the
returned tensor and list of strings are guaranteed to be consistent with
each other.
In GRID, the start frame of each audio sample is 0.
"""
N, T, D = nnet_output_shape
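To make the layout concrete, here is a minimal sketch of the supervision tensor described in the docstring, assuming GRID-style batches where every sample starts at frame 0 (the frame counts are made-up values):

```python
import torch

batch_size = 3
num_frames = torch.tensor([75, 73, 70])  # hypothetical per-sample frame counts
supervision_segments = torch.stack(
    (
        torch.arange(batch_size),                   # [0] sequence index
        torch.zeros(batch_size, dtype=torch.long),  # [1] start frame, always 0 in GRID
        num_frames,                                 # [2] number of frames
    ),
    dim=1,
)
print(supervision_segments.shape)  # torch.Size([3, 3])
```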

View File

@ -135,7 +135,7 @@ class CombineNet(nn.Module):
"""
Args:
x_v:
Its shape is [N, 3, H, W]
Its shape is [N, 3, T, H, W]
x_a:
Its shape is [N, C, T]
Returns:
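For illustration, dummy inputs matching the documented shapes; the concrete sizes are placeholders, and the forward call is commented out because the constructor arguments of CombineNet are not shown in this diff:

```python
import torch

N, T, H, W, C = 2, 75, 64, 128, 80
x_v = torch.randn(N, 3, T, H, W)  # video: batch, RGB channels, frames, height, width
x_a = torch.randn(N, C, T)        # audio: batch, feature dim, frames
# out = combine_net(x_v, x_a)     # signature assumed from the docstring above
```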

View File

@ -200,6 +200,8 @@ def main():
vid.append(array)
L, H, W, C = vid[0].shape
# vid_padding = 75
# aud_padding = 480
features_v = torch.zeros(len(vid), 75, H, W, C).to(device)
for i in range(len(vid)):
length = vid[i].shape[0]
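The rest of the loop is not shown in this diff; a hedged sketch of how the per-sample copy into the 75-frame buffer is typically completed, assuming each `vid[i]` is a NumPy array of shape `[length, H, W, C]`:

```python
for i in range(len(vid)):
    length = vid[i].shape[0]
    # Copy the available frames; frames beyond `length` remain zero.
    features_v[i, :length] = torch.from_numpy(vid[i]).to(device)
```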

View File

@ -15,7 +15,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This script contains two functions:
HorizontalFlip flips the images horizontally, and
ColorNormalize normalizes the images.
Both functions are used to augment the images.
The input to each function is a sequence of images.
"""
import random
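The function bodies are not part of this diff; a hedged sketch of what these two augmentation helpers commonly look like for an input of shape `[T, H, W, C]` (the flip probability and the scaling are assumptions, not necessarily what this recipe uses):

```python
import random

def HorizontalFlip(batch_img, p=0.5):
    # Flip the whole image sequence left-right with probability p, so that
    # all frames of one sample stay consistent with each other.
    if random.random() < p:
        batch_img = batch_img[:, :, ::-1, ...]
    return batch_img

def ColorNormalize(batch_img):
    # Scale pixel values from [0, 255] to [0, 1].
    return batch_img / 255.0
```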

View File

@ -15,7 +15,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This script loads the audio data in GRID.
The class dataset_audio pads each audio sample so that all samples in a batch have the same shape.
"""
import kaldifeat
import numpy as np
import os
@ -28,14 +31,31 @@ from torch.utils.data import Dataset
class dataset_audio(Dataset):
def __init__(
self,
video_path,
anno_path,
file_list,
aud_padding,
sample_rate,
feature_dim,
phase,
):
video_path: str,
anno_path: str,
file_list: str,
aud_padding: int,
sample_rate: int,
feature_dim: int,
phase: str,
) -> None:
"""
Args:
video_path:
The path to the directory containing the visual data.
anno_path:
The path to the directory containing the text data.
file_list:
The file listing all samples for training or testing.
aud_padding:
The number of frames to which each audio sample is padded.
sample_rate:
The sample rate used when extracting fbank features.
feature_dim:
The dimension of the fbank features.
phase:
"train" or "test".
"""
self.anno_path = anno_path
self.aud_padding = aud_padding
self.sample_rate = sample_rate
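A hypothetical usage sketch; all paths, the list file, and the concrete numbers below are placeholders (aud_padding=480 follows the comment used elsewhere in this commit):

```python
from torch.utils.data import DataLoader

train_set = dataset_audio(
    video_path="download/GRID/video",   # placeholder path
    anno_path="download/GRID/align",    # placeholder path
    file_list="data/train_list.txt",    # placeholder list file
    aud_padding=480,
    sample_rate=16000,                  # placeholder sample rate
    feature_dim=80,                     # placeholder feature dimension
    phase="train",
)
train_loader = DataLoader(train_set, batch_size=8, shuffle=True)
```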

View File

@ -15,7 +15,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This script loads the paired audio-visual data in GRID.
The class dataset_av pads each audio-visual sample so that all samples in a batch have the same shape.
"""
import cv2
import kaldifeat
import numpy as np
@ -40,6 +43,25 @@ class dataset_av(Dataset):
sample_rate,
phase,
):
"""
Args:
video_path:
The path to the directory containing the visual data.
anno_path:
The path to the directory containing the text data.
file_list:
The file listing all samples for training or testing.
feature_dim:
The dimension of the fbank features.
vid_padding:
The number of frames to which each visual sample is padded.
aud_padding:
The number of frames to which each audio sample is padded.
sample_rate:
The sample rate used when extracting fbank features.
phase:
"train" or "test".
"""
self.anno_path = anno_path
self.vid_pading = vid_pading
self.aud_pading = aud_pading

View File

@ -15,7 +15,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This script loads the visual data in GRID.
The class dataset_visual pads each visual sample so that all samples in a batch have the same shape.
"""
import cv2
import os
import numpy as np
@ -27,12 +30,25 @@ from .cvtransforms import HorizontalFlip, ColorNormalize
class dataset_visual(Dataset):
def __init__(
self,
video_path,
anno_path,
file_list,
vid_padding,
phase,
video_path: str,
anno_path: str,
file_list: str,
vid_padding: int,
phase: str,
):
"""
Args:
video_path:
The path to the directory containing the visual data.
anno_path:
The path to the directory containing the text data.
file_list:
The file listing all samples for training or testing.
vid_padding:
The number of frames to which each visual sample is padded.
phase:
"train" or "test".
"""
self.anno_path = anno_path
self.vid_padding = vid_padding
self.phase = phase
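Similarly, a hypothetical usage sketch for the visual-only dataset; the paths and list file are placeholders, and vid_padding=75 follows the comment used elsewhere in this commit:

```python
from torch.utils.data import DataLoader

test_set = dataset_visual(
    video_path="download/GRID/video",   # placeholder path
    anno_path="download/GRID/align",    # placeholder path
    file_list="data/test_list.txt",     # placeholder list file
    vid_padding=75,
    phase="test",
)
test_loader = DataLoader(test_set, batch_size=4, shuffle=False)
```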

View File

@ -116,7 +116,12 @@ class ResNet(nn.Module):
class VisualNet2(nn.Module):
def __init__(self, num_classes):
super(VisualNet2, self).__init__()
"""
Args:
num_classes:
The output dimension of the VisualNet2 model.
"""
super().__init__()
self.num_classes = num_classes
self.inputDim = 512
self.conv3d = nn.Conv3d(

View File

@ -180,6 +180,7 @@ def main():
vid.append(array)
_, H, W, C = vid[0].shape
# vid_padding = 75
features = torch.zeros(len(vid), 75, H, W, C).to(device)
for i in range(len(vid)):
length = vid[i].shape[0]

View File

@ -21,8 +21,13 @@ import torch.nn as nn
class VisualNet(torch.nn.Module):
def __init__(self, num_classes, dropout_p=0.1):
super(VisualNet, self).__init__()
def __init__(self, num_classes: int) -> None:
"""
Args:
num_classes:
The output dimension of the VisualNet model.
"""
super().__init__()
self.num_classes = num_classes
self.conv1 = nn.Conv3d(3, 32, (3, 5, 5), (1, 2, 2), (1, 2, 2))
self.pool1 = nn.MaxPool3d((1, 2, 2), (1, 2, 2))
@ -37,7 +42,7 @@ class VisualNet(torch.nn.Module):
self.gru2 = nn.GRU(512, 256, 1, bidirectional=True)
self.FC = nn.Linear(512, self.num_classes)
self.dropout_p = dropout_p
self.dropout_p = 0.5
self.relu = nn.ReLU(inplace=True)
self.dropout = nn.Dropout(self.dropout_p)
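A hypothetical instantiation under the new signature; num_classes=30 and the dummy input sizes are placeholders, with the video laid out as [N, 3, T, H, W] as documented for the combined model above:

```python
import torch

model = VisualNet(num_classes=30)
x = torch.randn(2, 3, 75, 64, 128)  # [N, 3, T, H, W]
# y = model(x)  # forward pass not shown in this diff
```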