mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-08 09:32:20 +00:00
* modify preparation * small refactor * add tedlium3 conformer_ctc2 * modify decode * filter unk in decode * add scaling converter * address comments * fix lambda function lhotse * add implicit manifest shuffle * refactor ctc_greedy_search * import model arguments from train.py * style fix * fix ci test and last style issues * update RESULTS * fix RESULTS numbers * fix label smoothing loss * update model parameters number in RESULTS
121 lines
4.0 KiB
Python
121 lines
4.0 KiB
Python
#!/usr/bin/env python3
|
|
# Copyright (c) 2021 University of Chinese Academy of Sciences (author: Han Zhu)
|
|
# 2022 Xiaomi Corporation (author: Quandong Wang)
|
|
#
|
|
# See ../../../../LICENSE for clarification regarding multiple authors
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
import torch
|
|
from scaling import (
|
|
ActivationBalancer,
|
|
BasicNorm,
|
|
DoubleSwish,
|
|
ScaledConv2d,
|
|
ScaledLinear,
|
|
)
|
|
|
|
|
|
class Conv2dSubsampling(torch.nn.Module):
    """Convolutional 2D subsampling (to 1/4 length).

    Convert an input of shape (N, T, idim) to an output
    with shape (N, T', odim), where
    T' = ((T-1)//2 - 1)//2, which approximates T' == T//4

    It is based on
    https://github.com/espnet/espnet/blob/master/espnet/nets/pytorch_backend/transformer/subsampling.py  # noqa
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        layer1_channels: int = 8,
        layer2_channels: int = 32,
        layer3_channels: int = 128,
    ) -> None:
        """
        Args:
          in_channels:
            Number of channels in. The input shape is (N, T, in_channels).
            Caution: It requires: T >= 7, in_channels >= 7
          out_channels:
            Output dim. The output shape is
            (N, ((T-1)//2 - 1)//2, out_channels)
          layer1_channels:
            Number of output channels of the first convolution.
          layer2_channels:
            Number of output channels of the second convolution.
          layer3_channels:
            Number of output channels of the third convolution.
        """
        assert in_channels >= 7, f"in_channels must be >= 7, got {in_channels}"
        super().__init__()

        # The first convolution (kernel 3, padding 1, default stride) keeps
        # the time/feature sizes; each of the two following convolutions
        # (kernel 3, stride 2, no padding) maps a size L to (L - 1) // 2.
        self.conv = torch.nn.Sequential(
            ScaledConv2d(
                in_channels=1,
                out_channels=layer1_channels,
                kernel_size=3,
                padding=1,
            ),
            ActivationBalancer(channel_dim=1),
            DoubleSwish(),
            ScaledConv2d(
                in_channels=layer1_channels,
                out_channels=layer2_channels,
                kernel_size=3,
                stride=2,
            ),
            ActivationBalancer(channel_dim=1),
            DoubleSwish(),
            ScaledConv2d(
                in_channels=layer2_channels,
                out_channels=layer3_channels,
                kernel_size=3,
                stride=2,
            ),
            ActivationBalancer(channel_dim=1),
            DoubleSwish(),
        )

        # Size of the feature axis after the two stride-2 convolutions; the
        # linear projection consumes channels and features flattened together.
        subsampled_dim = ((in_channels - 1) // 2 - 1) // 2
        self.out = ScaledLinear(layer3_channels * subsampled_dim, out_channels)

        # set learn_eps=False because out_norm is preceded by `out`, and `out`
        # itself has learned scale, so the extra degree of freedom is not
        # needed.
        self.out_norm = BasicNorm(out_channels, learn_eps=False)

        # constrain median of output to be close to zero.
        self.out_balancer = ActivationBalancer(
            channel_dim=-1, min_positive=0.45, max_positive=0.55
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Subsample x.

        Args:
          x:
            Its shape is (N, T, idim).

        Returns:
          Return a tensor of shape (N, ((T-1)//2 - 1)//2, odim)
        """
        # On entry, x is (N, T, idim)
        x = x.unsqueeze(1)  # (N, T, idim) -> (N, 1, T, idim) i.e., (N, C, H, W)
        x = self.conv(x)
        # Now x is of shape
        # (N, layer3_channels, ((T-1)//2 - 1)//2, ((idim-1)//2 - 1)//2)
        b, c, t, f = x.size()
        # Move time ahead of channels, then flatten (channels, features)
        # into a single axis for the linear projection.
        x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f))
        # Now x is of shape (N, ((T-1)//2 - 1)//2, odim)
        x = self.out_norm(x)
        x = self.out_balancer(x)
        return x
|