diff --git a/egs/himia/wuw/ctc_tdnn/tdnn.py b/egs/himia/wuw/ctc_tdnn/tdnn.py
new file mode 100644
index 000000000..0f685b6c2
--- /dev/null
+++ b/egs/himia/wuw/ctc_tdnn/tdnn.py
@@ -0,0 +1,108 @@
+#!/usr/bin/env python3
+# Copyright 2023 Xiaomi Corp. (authors: Liyong Guo)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from torch import nn, Tensor
+
+
+class Tdnn(nn.Module):
+    """
+    Args:
+        num_features (int): Number of input features
+        num_classes (int): Number of output classes
+    """
+
+    def __init__(self, num_features: int, num_classes: int) -> None:
+        super().__init__()
+        self.num_features = num_features
+        self.num_classes = num_classes
+        self.tdnn = nn.Sequential(
+            nn.Conv1d(
+                in_channels=num_features,
+                out_channels=240,
+                kernel_size=3,
+                stride=1,
+                padding=1,
+            ),
+            nn.ReLU(inplace=True),
+            nn.BatchNorm1d(num_features=240, affine=False),
+            nn.Conv1d(
+                in_channels=240, out_channels=240, kernel_size=3, stride=1, padding=1
+            ),
+            nn.ReLU(inplace=True),
+            nn.BatchNorm1d(num_features=240, affine=False),
+            nn.Conv1d(
+                in_channels=240, out_channels=240, kernel_size=3, stride=1, padding=1
+            ),
+            nn.ReLU(inplace=True),
+            nn.BatchNorm1d(num_features=240, affine=False),
+            nn.Conv1d(
+                in_channels=240, out_channels=240, kernel_size=3, stride=1, padding=1
+            ),
+            nn.ReLU(inplace=True),
+            nn.BatchNorm1d(num_features=240, affine=False),
+            nn.Conv1d(
+                in_channels=240, out_channels=240, kernel_size=3, stride=1, padding=1
+            ),
+            nn.ReLU(inplace=True),
+            nn.BatchNorm1d(num_features=240, affine=False),
+            nn.Conv1d(
+                in_channels=240, out_channels=240, kernel_size=3, stride=1, padding=1
+            ),
+            nn.ReLU(inplace=True),
+            nn.BatchNorm1d(num_features=240, affine=False),
+            nn.Conv1d(
+                in_channels=240, out_channels=240, kernel_size=3, stride=1, padding=1
+            ),
+            nn.ReLU(inplace=True),
+            nn.BatchNorm1d(num_features=240, affine=False),
+            nn.Conv1d(
+                in_channels=240, out_channels=240, kernel_size=3, stride=1, padding=1
+            ),
+            nn.ReLU(inplace=True),
+            nn.BatchNorm1d(num_features=240, affine=False),
+            nn.Conv1d(
+                in_channels=240, out_channels=240, kernel_size=3, stride=1, padding=1
+            ),
+            nn.ReLU(inplace=True),
+            nn.BatchNorm1d(num_features=240, affine=False),
+            nn.Conv1d(
+                in_channels=240, out_channels=240, kernel_size=1, stride=1, padding=0
+            ),
+            nn.ReLU(inplace=True),
+            nn.BatchNorm1d(num_features=240, affine=False),
+            nn.Conv1d(
+                in_channels=240,
+                out_channels=num_classes,
+                kernel_size=1,
+                stride=1,
+                padding=0,
+            ),
+            nn.LogSoftmax(1),
+        )
+
+    def forward(self, x: Tensor) -> Tensor:
+        r"""
+        Args:
+            x (torch.Tensor): Tensor of dimension (N, T, C), where C == num_features.
+        Returns:
+            Tensor: Output tensor of dimension (N, T, num_classes).
+        """
+
+        x = x.transpose(1, 2)
+        x = self.tdnn(x)
+        x = x.transpose(1, 2)
+        return x
diff --git a/egs/himia/wuw/shared b/egs/himia/wuw/shared
new file mode 120000
index 000000000..4cbd91a7e
--- /dev/null
+++ b/egs/himia/wuw/shared
@@ -0,0 +1 @@
+../../../icefall/shared
\ No newline at end of file
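
The patch adds a purely convolutional model: nine Conv1d layers with kernel_size=3 (each followed by ReLU and a non-affine BatchNorm1d) preserve the frame rate via padding=1, and two kernel_size=1 layers project to the class dimension, ending in LogSoftmax over classes. Below is a minimal usage sketch, not part of the patch; the feature dimension (80), number of classes, batch size, and frame count are arbitrary values chosen for illustration:

    import torch
    from tdnn import Tdnn  # the file added above, assuming it is importable

    model = Tdnn(num_features=80, num_classes=5)
    model.eval()  # use BatchNorm running statistics for a deterministic check
    x = torch.randn(4, 100, 80)  # (N, T, C) = (batch, frames, features)
    with torch.no_grad():
        y = model(x)
    print(y.shape)  # torch.Size([4, 100, 5]): per-frame class log-probabilities

Note that nn.LogSoftmax(1) runs while the tensor is still in (N, C, T) layout inside forward(), so it normalizes over the class dimension; the final transpose then returns the (N, T, num_classes) shape documented in the docstring.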