Mirror of https://github.com/k2-fsa/icefall.git

commit 7391f4febf (parent 283bd126c5)

    Add explanation for some functions and classes
@@ -209,7 +209,7 @@ def main():
     logging.info("Decoding started")
     features = fbank(waves)
 
-
+    # aud_padding = 480
     features_new = torch.zeros(len(features), 480, params.feature_dim).to(
         device
     )
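
As a point of reference, the fixed-length padding in this hunk can be sketched as a standalone helper. This is an illustration only: the helper name pad_fbank and feature_dim=80 are assumptions, and `features` is taken to be a list of 2-D tensors of shape (num_frames, feature_dim).

import torch

def pad_fbank(features, aud_padding=480, feature_dim=80):
    # Copy each variable-length feature matrix into a fixed-size
    # zero tensor so the whole batch shares one shape.
    padded = torch.zeros(len(features), aud_padding, feature_dim)
    for i, feat in enumerate(features):
        num_frames = min(feat.shape[0], aud_padding)
        padded[i, :num_frames] = feat[:num_frames]
    return padded
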
@@ -19,16 +19,14 @@ import torch
 
 
 def encode_supervisions(nnet_output_shape, batch):
     """
-    Encodes Lhotse's ``batch["supervisions"]`` dict into
+    Encodes the network output shape and the batch texts into
     a pair of torch Tensor, and a list of transcription strings.
 
     The supervision tensor has shape ``(batch_size, 3)``.
     Its second dimension contains information about sequence index [0],
     start frames [1] and num frames [2].
 
-    The batch items might become re-ordered during this operation -- the
-    returned tensor and list of strings are guaranteed to be consistent with
-    each other.
+    In GRID, the start frame of each audio sample is 0.
     """
     N, T, D = nnet_output_shape
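
A minimal sketch of the supervision tensor this docstring describes, under the stated GRID assumption that every sample starts at frame 0 and (additionally assumed here) spans all T output frames; the helper name and the `texts` argument are illustrative, not the function's real interface.

import torch

def encode_supervisions_sketch(nnet_output_shape, texts):
    N, T, D = nnet_output_shape
    # One row per sequence: [sequence_index, start_frame, num_frames].
    supervision_segments = torch.tensor(
        [[i, 0, T] for i in range(N)], dtype=torch.int32
    )
    return supervision_segments, texts
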
@@ -135,7 +135,7 @@ class CombineNet(nn.Module):
         """
         Args:
           x_v:
-            Its shape is [N, 3, H, W]
+            Its shape is [N, 3, T, H, W]
           x_a:
             Its shape is [N, C, T]
         Returns:
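
To make the corrected shapes concrete, an illustrative pair of inputs follows; the sizes (75 frames, 64x128 images, 80-dim audio features) are assumptions, not values fixed by this diff.

import torch

N, T, H, W, C = 2, 75, 64, 128, 80
x_v = torch.randn(N, 3, T, H, W)  # video: [batch, RGB, frames, height, width]
x_a = torch.randn(N, C, T)        # audio: [batch, feature_dim, frames]
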
@@ -200,6 +200,8 @@ def main():
             vid.append(array)
 
         L, H, W, C = vid[0].shape
+        # vid_padding = 75
+        # aud_padding = 480
         features_v = torch.zeros(len(vid), 75, H, W, C).to(device)
         for i in range(len(vid)):
             length = vid[i].shape[0]
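
The visual features are padded the same way as the audio features; a minimal sketch, assuming `vid` is a list of numpy arrays of shape (length, H, W, C) and with the helper name invented here.

import numpy as np
import torch

def pad_video(vid, vid_padding=75):
    # Copy each clip into a zero tensor of vid_padding frames so a
    # batch of variable-length clips shares one shape.
    _, H, W, C = vid[0].shape
    features_v = torch.zeros(len(vid), vid_padding, H, W, C)
    for i, clip in enumerate(vid):
        length = min(clip.shape[0], vid_padding)
        features_v[i, :length] = torch.as_tensor(clip[:length])
    return features_v
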
@@ -15,7 +15,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-
+"""
+In this script, there are two functions:
+HorizontalFlip, which flips the images, and
+ColorNormalize, which normalizes the images.
+Both functions are used to augment the images.
+
+The input to both functions is a sequence of images.
+"""
 import random
 
 
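
A minimal sketch of the two augmentations the docstring names, assuming an image sequence stored as a numpy array of shape (T, H, W, C) with pixel values in [0, 255]; the real implementations in cvtransforms.py may differ in detail.

import random
import numpy as np

def horizontal_flip_sketch(images, p=0.5):
    # With probability p, flip every frame left-to-right.
    if random.random() < p:
        images = images[:, :, ::-1, :].copy()
    return images

def color_normalize_sketch(images):
    # Scale pixel values from [0, 255] to [0, 1].
    return images.astype(np.float32) / 255.0
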
@@ -15,7 +15,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-
+"""
+This script loads the audio data in GRID.
+The class dataset_audio makes all audio samples in a batch have the same shape.
+"""
 import kaldifeat
 import numpy as np
 import os
@@ -28,14 +31,31 @@ from torch.utils.data import Dataset
 class dataset_audio(Dataset):
     def __init__(
         self,
-        video_path,
-        anno_path,
-        file_list,
-        aud_padding,
-        sample_rate,
-        feature_dim,
-        phase,
-    ):
+        video_path: str,
+        anno_path: str,
+        file_list: str,
+        aud_padding: int,
+        sample_rate: int,
+        feature_dim: int,
+        phase: str,
+    ) -> None:
+        """
+        Args:
+          video_path:
+            The dir path of the visual data.
+          anno_path:
+            The dir path of the text data.
+          file_list:
+            The file listing all samples for training or testing.
+          aud_padding:
+            The padding length for each audio sample.
+          sample_rate:
+            The sample rate used when extracting the fbank features.
+          feature_dim:
+            The dimension of the fbank features.
+          phase:
+            "train" or "test".
+        """
         self.anno_path = anno_path
         self.aud_padding = aud_padding
         self.sample_rate = sample_rate
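
An illustrative instantiation of dataset_audio; the paths and values below are placeholders chosen to match the docstring, not the recipe's actual configuration.

from torch.utils.data import DataLoader

dataset = dataset_audio(
    video_path="data/GRID/video",     # dir of the visual data
    anno_path="data/GRID/align",      # dir of the text data
    file_list="data/GRID/train.txt",  # one sample per line
    aud_padding=480,
    sample_rate=16000,
    feature_dim=80,
    phase="train",
)
loader = DataLoader(dataset, batch_size=8, shuffle=True)
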
@@ -15,7 +15,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-
+"""
+This script loads the paired audio-visual data in GRID.
+The class dataset_av makes all audio-visual samples in a batch have the same shape.
+"""
 import cv2
 import kaldifeat
 import numpy as np
@@ -40,6 +43,25 @@ class dataset_av(Dataset):
         sample_rate,
         phase,
     ):
+        """
+        Args:
+          video_path:
+            The dir path of the visual data.
+          anno_path:
+            The dir path of the text data.
+          file_list:
+            The file listing all samples for training or testing.
+          feature_dim:
+            The dimension of the fbank features.
+          vid_padding:
+            The padding length for each visual sample.
+          aud_padding:
+            The padding length for each audio sample.
+          sample_rate:
+            The sample rate used when extracting the fbank features.
+          phase:
+            "train" or "test".
+        """
         self.anno_path = anno_path
         self.vid_pading = vid_pading
         self.aud_pading = aud_pading
@@ -15,7 +15,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-
+"""
+This script loads the visual data in GRID.
+The class dataset_visual makes all visual samples in a batch have the same shape.
+"""
 import cv2
 import os
 import numpy as np
@@ -27,12 +30,25 @@ from .cvtransforms import HorizontalFlip, ColorNormalize
 class dataset_visual(Dataset):
     def __init__(
         self,
-        video_path,
-        anno_path,
-        file_list,
-        vid_padding,
-        phase,
+        video_path: str,
+        anno_path: str,
+        file_list: str,
+        vid_padding: int,
+        phase: str,
     ):
+        """
+        Args:
+          video_path:
+            The dir path of the visual data.
+          anno_path:
+            The dir path of the text data.
+          file_list:
+            The file listing all samples for training or testing.
+          vid_padding:
+            The padding length for each visual sample.
+          phase:
+            "train" or "test".
+        """
         self.anno_path = anno_path
         self.vid_padding = vid_padding
         self.phase = phase
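
Given the HorizontalFlip and ColorNormalize imports in the hunk header, the phase argument plausibly gates augmentation during training; the following is a guess at that pattern, not the class's actual method.

from .cvtransforms import HorizontalFlip, ColorNormalize

def transform_sketch(images, phase):
    # Flip only during training; normalize in both phases.
    if phase == "train":
        images = HorizontalFlip(images)
    return ColorNormalize(images)
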
@@ -116,7 +116,12 @@ class ResNet(nn.Module):
 
 class VisualNet2(nn.Module):
     def __init__(self, num_classes):
-        super(VisualNet2, self).__init__()
+        """
+        Args:
+          num_classes:
+            The output dimension of the visualnet2 model.
+        """
+        super().__init__()
         self.num_classes = num_classes
         self.inputDim = 512
         self.conv3d = nn.Conv3d(
@@ -180,6 +180,7 @@ def main():
             vid.append(array)
 
         _, H, W, C = vid[0].shape
+        # vid_padding = 75
         features = torch.zeros(len(vid), 75, H, W, C).to(device)
         for i in range(len(vid)):
             length = vid[i].shape[0]
@@ -21,8 +21,13 @@ import torch.nn as nn
 
 
 class VisualNet(torch.nn.Module):
-    def __init__(self, num_classes, dropout_p=0.1):
-        super(VisualNet, self).__init__()
+    def __init__(self, num_classes: int) -> None:
+        """
+        Args:
+          num_classes:
+            The output dimension of the visualnet model.
+        """
+        super().__init__()
         self.num_classes = num_classes
         self.conv1 = nn.Conv3d(3, 32, (3, 5, 5), (1, 2, 2), (1, 2, 2))
         self.pool1 = nn.MaxPool3d((1, 2, 2), (1, 2, 2))
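
As a sanity check on conv1 above: with kernel (3, 5, 5), stride (1, 2, 2), and padding (1, 2, 2), the layer preserves the time axis and halves height and width. The input sizes below (75 frames of 64x128 RGB lip crops) are illustrative.

import torch
import torch.nn as nn

conv1 = nn.Conv3d(3, 32, (3, 5, 5), (1, 2, 2), (1, 2, 2))
x = torch.randn(2, 3, 75, 64, 128)  # [N, C, T, H, W]
print(conv1(x).shape)  # torch.Size([2, 32, 75, 32, 64])
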
@@ -37,7 +42,7 @@ class VisualNet(torch.nn.Module):
         self.gru2 = nn.GRU(512, 256, 1, bidirectional=True)
 
         self.FC = nn.Linear(512, self.num_classes)
-        self.dropout_p = dropout_p
+        self.dropout_p = 0.5
 
         self.relu = nn.ReLU(inplace=True)
         self.dropout = nn.Dropout(self.dropout_p)