diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7/scaling.py b/egs/librispeech/ASR/pruned_transducer_stateless7/scaling.py
index 9b2c7a19d..e93f41718 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless7/scaling.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7/scaling.py
@@ -168,151 +168,6 @@ class BasicNorm(torch.nn.Module):
         return x * scales
 
 
-class StructuredLinear(torch.nn.Module):
-    """
-    This module mostly behaves like nn.Linear, but the in_features and out_features
-    (the number of input and output channels) are specified as tuples; the
-    actual numbers of channels are products over these tuples.
-    E.g. (2, 256) means 512, with the slowest-varying/largest-stride dims first
-    in terms of the layout.
-    For purposes of the forward() function it will behave the same as if the dim
-    was 512, but the parameter tensors have this structure, which makes
-    a difference if you are using the NeutralGradient optimizer and perhaps
-    certain other optimizers.
-
-    Args:
-        in_features: The number of input channels, specified as
-           a tuple of ints (the number of input channels will be their
-           product).  The only difference this makes is that the
-           nn.Parameter tensor will be shaped differently, which may
-           affect some optimizers.
-        out_features: The number of output channels, specified as
-           a tuple of ints.
-        initial_scale: The default initial parameter scale will be
-           multiplied by this.
-        bias: If true, include the bias term.
-    """
-    def __init__(self,
-                 in_features: Tuple[int],
-                 out_features: Tuple[int],
-                 bias: bool = True,
-                 initial_scale: float = 1.0) -> None:
-        super(StructuredLinear, self).__init__()
-        self.in_features = in_features
-        self.out_features = out_features
-        in_size = reduce((lambda i,j: i*j), in_features)
-        out_size = reduce((lambda i,j: i*j), out_features)
-        self.weight_shape = (out_size, in_size)
-        self.weight = nn.Parameter(torch.Tensor(*out_features, *in_features))
-
-        if bias:
-            self.bias = nn.Parameter(torch.Tensor(*out_features))
-        else:
-            self.register_parameter('bias', None)
-        self.reset_parameters(initial_scale)
-
-
-    def reset_parameters(self, initial_scale: float = 1.0) -> None:
-        nn.init.kaiming_uniform_(self.weight.reshape(*self.weight_shape), a=(5 ** 0.5))
-        with torch.no_grad():
-            self.weight *= initial_scale
-        nn.init.uniform_(self.bias,
-                         -0.1 * initial_scale,
-                         0.1 * initial_scale)
-
-    def get_weight(self) -> Tensor:
-        return self.weight.reshape(*self.weight_shape)
-
-    def get_bias(self) -> Optional[Tensor]:
-        return (None if self.bias is None else
-                self.bias.reshape(self.weight_shape[0]))
-
-    def forward(self, input: Tensor) -> Tensor:
-        return F.linear(input, self.get_weight(), self.get_bias())
-
-    def extra_repr(self) -> str:
-        return 'in_features={}, out_features={}, bias={}'.format(
-            self.in_features, self.out_features, self.bias is not None
-        )
-
-
-class StructuredConv1d(nn.Conv1d):
-    """
-    This module mostly behaves like nn.Conv1d, but the
-    in_channels and out_channels are specified as tuples. For example,
-    512 channels might be specified as
-    (2, 256), with slowest-varying/largest-stride dims first in terms of the layout.
-    For purposes of the forward() function it will behave the same as if the dim
-    was 512, but the parameter tensors have this structure, which makes
-    a difference if you are using the NeutralGradient optimizer.
-
-
-    Args:
-        in_channels: The number of input channels, specified as
-           a tuple of ints (the number of input channels will be their
-           product).  The only difference this makes is that the
-           nn.Parameter tensor will be shaped differently, which may
-           affect some optimizers.
-        out_channels: The number of output channels, specified as
-           a tuple of ints.
-        initial_scale: The default initial parameter scale will be
-           multiplied by this.
-        bias: If true, include the bias term.
-    """
-    def __init__(
-        self,
-        in_channels: Tuple[int],
-        out_channels: Tuple[int],
-        *args,
-        initial_scale: float = 1.0,
-        **kwargs
-    ):
-        super(StructuredConv1d, self).__init__(
-            reduce((lambda i,j: i*j), in_channels),
-            reduce((lambda i,j: i*j), out_channels),
-            *args, **kwargs)
-
-        assert self.groups == 1, "Groups not supported as yet"
-
-        self.in_channels = in_channels
-        self.out_channels = out_channels
-
-        if self.transposed:
-            in_channels, out_channels = out_channels, in_channels
-
-        self.weight_shape = self.weight.shape
-        self.weight = nn.Parameter(self.weight.detach().reshape(
-            *out_channels, *in_channels, *self.weight.shape[2:]))
-
-        self.bias_shape = self.bias.shape
-        if self.bias is not None:
-            self.bias = nn.Parameter(self.bias.detach().reshape(
-                *out_channels))
-
-        # These changes in the initialization are the same as for class ScaledConv1d.
-        with torch.no_grad():
-            self.weight[:] *= initial_scale
-            if self.bias is not None:
-                torch.nn.init.uniform_(self.bias,
-                                       -0.1 * initial_scale,
-                                       0.1 * initial_scale)
-
-    def get_weight(self) -> Tensor:
-        return self.weight.reshape(*self.weight_shape)
-    def get_bias(self) -> Optional[Tensor]:
-        return (None if self.bias is None else
-                self.bias.reshape(*self.bias_shape))
-
-    def forward(self, input: Tensor) -> Tensor:
-        if self.padding_mode != 'zeros':
-            return F.conv1d(F.pad(input, self._reversed_padding_repeated_twice, mode=self.padding_mode),
-                            self.get_weight(), self.get_bias(), self.stride,
-                            _single(0), self.dilation, self.groups)
-        return F.conv1d(input, self.get_weight(), self.get_bias(), self.stride,
-                        self.padding, self.dilation, self.groups)
-
-
-
 def ScaledLinear(*args, initial_scale: float = 1.0,