Mirror of https://github.com/k2-fsa/icefall.git

Commit 7391f4febf (parent 283bd126c5): Add explanation for some functions and classes
@@ -209,7 +209,7 @@ def main():
     logging.info("Decoding started")
     features = fbank(waves)
+    # aud_padding = 480
     features_new = torch.zeros(len(features), 480, params.feature_dim).to(
         device
     )
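For context, the `features_new` buffer above acts as a zero-padded batch: each variable-length fbank feature matrix is copied into the first rows of a fixed 480-frame tensor, and the remaining rows stay zero. A minimal sketch of that pattern, assuming `features` is a list of (num_frames, feature_dim) tensors; the sizes below are made up:

import torch

# Hypothetical batch: two utterances with different frame counts (<= 480).
features = [torch.randn(437, 80), torch.randn(480, 80)]
feature_dim = 80

features_new = torch.zeros(len(features), 480, feature_dim)
for i, feat in enumerate(features):
    # Copy the real frames; rows beyond feat.shape[0] stay zero (padding).
    features_new[i, : feat.shape[0]] = feat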
@@ -19,16 +19,14 @@ import torch

 def encode_supervisions(nnet_output_shape, batch):
     """
-    Encodes Lhotse's ``batch["supervisions"]`` dict into
+    Encodes the network output and the texts into
     a pair of torch Tensor, and a list of transcription strings.

     The supervision tensor has shape ``(batch_size, 3)``.
     Its second dimension contains information about sequence index [0],
     start frames [1] and num frames [2].

-    The batch items might become re-ordered during this operation -- the
-    returned tensor and list of strings are guaranteed to be consistent with
-    each other.
+    In GRID, the start frame of each audio sample is 0.
     """
     N, T, D = nnet_output_shape

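The docstring above describes a supervision tensor of shape (batch_size, 3), one row of (sequence_index, start_frame, num_frames) per utterance. A hedged sketch of how such a tensor could be assembled for GRID, where every start frame is 0; the frame counts here are hypothetical:

import torch

# Hypothetical per-utterance frame counts after the network (the T axis).
num_frames = [75, 75, 68]

# One row per sequence: (sequence_index, start_frame, num_frames).
# In GRID, the start frame of each audio sample is 0.
supervisions = torch.tensor(
    [[i, 0, n] for i, n in enumerate(num_frames)],
    dtype=torch.int32,
)
print(supervisions.shape)  # torch.Size([3, 3])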
@@ -135,7 +135,7 @@ class CombineNet(nn.Module):
         """
         Args:
           x_v:
-            Its shape is [N, 3, H, W]
+            Its shape is [N, 3, T, H, W]
           x_a:
             Its shape is [N, C, T]
         Returns:
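The corrected shape [N, 3, T, H, W] is the standard layout for 3-D convolutions over video: batch, RGB channels, frames, height, width. A rough shape sanity check with dummy tensors; all sizes and the commented forward call are illustrative assumptions, not values from the recipe:

import torch

N, T, H, W = 2, 75, 64, 128  # hypothetical batch/frame/image sizes
C = 80                       # hypothetical fbank feature dimension

x_v = torch.randn(N, 3, T, H, W)  # video: RGB frames stacked along T
x_a = torch.randn(N, C, T)        # audio: features with time last

# y = CombineNet(...)(x_v, x_a)   # forward pass per the docstring above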
@@ -200,6 +200,8 @@ def main():
         vid.append(array)

     L, H, W, C = vid[0].shape
+    # vid_padding = 75
+    # aud_padding = 480
     features_v = torch.zeros(len(vid), 75, H, W, C).to(device)
     for i in range(len(vid)):
         length = vid[i].shape[0]
@@ -15,7 +15,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+"""
+This script contains two functions:
+HorizontalFlip, which flips the images, and
+ColorNormalize, which normalizes the images.
+Both functions are used to augment the images.
+
+The input to both functions is a sequence of images.
+"""
 import random

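To make the new docstring concrete, here is a hedged sketch of what HorizontalFlip and ColorNormalize typically look like in lipreading recipes; the flip probability and the normalization statistics are assumptions, not values taken from this file:

import random
import numpy as np

def HorizontalFlip(batch_img: np.ndarray, p: float = 0.5) -> np.ndarray:
    # Flip the whole sequence left-right with probability p, so all
    # frames of one sample are flipped consistently.
    if random.random() < p:
        batch_img = batch_img[:, :, ::-1, :]  # (T, H, W, C): flip the W axis
    return batch_img

def ColorNormalize(batch_img: np.ndarray) -> np.ndarray:
    # Scale pixels to [0, 1], then normalize with (assumed) dataset statistics.
    mean, std = 0.413621, 0.1700239
    return (batch_img / 255.0 - mean) / std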
@@ -15,7 +15,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+"""
+This script loads the audio data in GRID.
+The class dataset_audio makes each audio batch have the same shape.
+"""
 import kaldifeat
 import numpy as np
 import os
@@ -28,14 +31,31 @@ from torch.utils.data import Dataset
 class dataset_audio(Dataset):
     def __init__(
         self,
-        video_path,
-        anno_path,
-        file_list,
-        aud_padding,
-        sample_rate,
-        feature_dim,
-        phase,
-    ):
+        video_path: str,
+        anno_path: str,
+        file_list: str,
+        aud_padding: int,
+        sample_rate: int,
+        feature_dim: int,
+        phase: str,
+    ) -> None:
+        """
+        Args:
+          video_path:
+            The directory containing the visual data.
+          anno_path:
+            The directory containing the text data.
+          file_list:
+            The file listing all samples for training or testing.
+          aud_padding:
+            The padding length for each audio sample.
+          sample_rate:
+            The sample rate for extracting fbank features.
+          feature_dim:
+            The dimension of the fbank features.
+          phase:
+            "train" or "test".
+        """
         self.anno_path = anno_path
         self.aud_padding = aud_padding
         self.sample_rate = sample_rate
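Given the annotated signature, constructing the dataset might look as follows; the paths and most argument values are placeholders, and wrapping it in a DataLoader is an assumption about how the recipe consumes it:

from torch.utils.data import DataLoader

# Placeholder paths; substitute the real GRID locations.
dataset = dataset_audio(
    video_path="/path/to/GRID/video",
    anno_path="/path/to/GRID/align",
    file_list="train_list.txt",
    aud_padding=480,     # matches the aud_padding comment above
    sample_rate=16000,   # assumed sampling rate
    feature_dim=80,      # assumed fbank dimension
    phase="train",
)
loader = DataLoader(dataset, batch_size=8, shuffle=True)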
@@ -15,7 +15,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+"""
+This script loads the paired audio-visual data in GRID.
+The class dataset_av makes each audio-visual batch have the same shape.
+"""
 import cv2
 import kaldifeat
 import numpy as np
@@ -40,6 +43,25 @@ class dataset_av(Dataset):
         sample_rate,
         phase,
     ):
+        """
+        Args:
+          video_path:
+            The directory containing the visual data.
+          anno_path:
+            The directory containing the text data.
+          file_list:
+            The file listing all samples for training or testing.
+          feature_dim:
+            The dimension of the fbank features.
+          vid_padding:
+            The padding length for each visual sample.
+          aud_padding:
+            The padding length for each audio sample.
+          sample_rate:
+            The sample rate for extracting fbank features.
+          phase:
+            "train" or "test".
+        """
         self.anno_path = anno_path
         self.vid_pading = vid_pading
         self.aud_pading = aud_pading
@@ -15,7 +15,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+"""
+This script loads the visual data in GRID.
+The class dataset_visual makes each visual batch have the same shape.
+"""
 import cv2
 import os
 import numpy as np
@@ -27,12 +30,25 @@ from .cvtransforms import HorizontalFlip, ColorNormalize
 class dataset_visual(Dataset):
     def __init__(
         self,
-        video_path,
-        anno_path,
-        file_list,
-        vid_padding,
-        phase,
+        video_path: str,
+        anno_path: str,
+        file_list: str,
+        vid_padding: int,
+        phase: str,
     ):
+        """
+        Args:
+          video_path:
+            The directory containing the visual data.
+          anno_path:
+            The directory containing the text data.
+          file_list:
+            The file listing all samples for training or testing.
+          vid_padding:
+            The padding length for each visual sample.
+          phase:
+            "train" or "test".
+        """
         self.anno_path = anno_path
         self.vid_padding = vid_padding
         self.phase = phase
|
@ -116,7 +116,12 @@ class ResNet(nn.Module):
|
|||||||
|
|
||||||
class VisualNet2(nn.Module):
|
class VisualNet2(nn.Module):
|
||||||
def __init__(self, num_classes):
|
def __init__(self, num_classes):
|
||||||
super(VisualNet2, self).__init__()
|
"""
|
||||||
|
Args:
|
||||||
|
num_classes:
|
||||||
|
The output dimension of the visualnet2 model.
|
||||||
|
"""
|
||||||
|
super().__init__()
|
||||||
self.num_classes = num_classes
|
self.num_classes = num_classes
|
||||||
self.inputDim = 512
|
self.inputDim = 512
|
||||||
self.conv3d = nn.Conv3d(
|
self.conv3d = nn.Conv3d(
|
||||||
|
@@ -180,6 +180,7 @@ def main():
         vid.append(array)

     _, H, W, C = vid[0].shape
+    # vid_padding = 75
     features = torch.zeros(len(vid), 75, H, W, C).to(device)
     for i in range(len(vid)):
         length = vid[i].shape[0]
@@ -21,8 +21,13 @@ import torch.nn as nn


 class VisualNet(torch.nn.Module):
-    def __init__(self, num_classes, dropout_p=0.1):
-        super(VisualNet, self).__init__()
+    def __init__(self, num_classes: int) -> None:
+        """
+        Args:
+          num_classes:
+            The output dimension of the VisualNet model.
+        """
+        super().__init__()
         self.num_classes = num_classes
         self.conv1 = nn.Conv3d(3, 32, (3, 5, 5), (1, 2, 2), (1, 2, 2))
         self.pool1 = nn.MaxPool3d((1, 2, 2), (1, 2, 2))
@@ -37,7 +42,7 @@ class VisualNet(torch.nn.Module):
         self.gru2 = nn.GRU(512, 256, 1, bidirectional=True)

         self.FC = nn.Linear(512, self.num_classes)
-        self.dropout_p = dropout_p
+        self.dropout_p = 0.5

         self.relu = nn.ReLU(inplace=True)
         self.dropout = nn.Dropout(self.dropout_p)
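Hard-coding self.dropout_p = 0.5 removes the old constructor knob, but the rate only matters in training mode, since nn.Dropout is the identity under model.eval(). A small self-contained illustration (not from the recipe):

import torch
import torch.nn as nn

drop = nn.Dropout(0.5)
x = torch.ones(4)

drop.train()
print(drop(x))  # roughly half the entries zeroed, the rest scaled by 2.0

drop.eval()
print(drop(x))  # identity: tensor([1., 1., 1., 1.])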