mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-12-11 06:55:27 +00:00
Use symlinks whenever possible
Signed-off-by: Xinyuan Li <xli257@b17.clsp.jhu.edu>
This commit is contained in:
parent
d725bad4fd
commit
8dc1ca194d
File diff suppressed because it is too large
Load Diff
1
egs/fluent_speech_commands/SLU/transducer/conformer.py
Symbolic link
1
egs/fluent_speech_commands/SLU/transducer/conformer.py
Symbolic link
@ -0,0 +1 @@
|
|||||||
|
../../../librispeech/ASR/transducer_stateless/conformer.py
|
||||||
@ -25,7 +25,6 @@ import torch.nn as nn
|
|||||||
from transducer.slu_datamodule import SluDataModule
|
from transducer.slu_datamodule import SluDataModule
|
||||||
from transducer.beam_search import greedy_search
|
from transducer.beam_search import greedy_search
|
||||||
from transducer.decoder import Decoder
|
from transducer.decoder import Decoder
|
||||||
from transducer.encoder import Tdnn
|
|
||||||
from transducer.conformer import Conformer
|
from transducer.conformer import Conformer
|
||||||
from transducer.joiner import Joiner
|
from transducer.joiner import Joiner
|
||||||
from transducer.model import Transducer
|
from transducer.model import Transducer
|
||||||
|
|||||||
@ -1,92 +0,0 @@
|
|||||||
# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang)
|
|
||||||
#
|
|
||||||
# See ../../../../LICENSE for clarification regarding multiple authors
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
|
|
||||||
from typing import Optional, Tuple
|
|
||||||
|
|
||||||
import torch
|
|
||||||
import torch.nn as nn
|
|
||||||
|
|
||||||
|
|
||||||
class Decoder(nn.Module):
    """Prediction network: embedding -> dropout -> LSTM -> linear projection."""

    def __init__(
        self,
        vocab_size: int,
        embedding_dim: int,
        blank_id: int,
        num_layers: int,
        hidden_dim: int,
        embedding_dropout: float = 0.0,
        rnn_dropout: float = 0.0,
    ):
        """
        Args:
          vocab_size:
            Number of tokens of the modeling unit.
          embedding_dim:
            Dimension of the input embedding.
          blank_id:
            The ID of the blank symbol. It is also used as the padding
            index of the embedding layer.
          num_layers:
            Number of RNN layers.
          hidden_dim:
            Hidden dimension of RNN layers.
          embedding_dropout:
            Dropout rate for the embedding layer.
          rnn_dropout:
            Dropout for RNN layers.
        """
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
            padding_idx=blank_id,
        )
        self.embedding_dropout = nn.Dropout(embedding_dropout)
        self.rnn = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=rnn_dropout,
        )
        self.blank_id = blank_id
        self.output_linear = nn.Linear(hidden_dim, hidden_dim)

    def forward(
        self,
        y: torch.Tensor,
        states: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
    ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        """
        Args:
          y:
            A 2-D tensor of shape (N, U).
          states:
            A tuple of two tensors containing the states information of
            RNN layers in this decoder. If None, the LSTM starts from its
            default initial state.
        Returns:
          Return a tuple containing:

            - out, a tensor of shape (N, U, C): the LSTM output after
              the final linear projection
            - (h, c), which contain the state information for RNN layers.
              Both are of shape (num_layers, N, C)
        """
        embedding_out = self.embedding(y)
        embedding_out = self.embedding_dropout(embedding_out)
        rnn_out, (h, c) = self.rnn(embedding_out, states)
        out = self.output_linear(rnn_out)

        return out, (h, c)
|
|
||||||
1
egs/fluent_speech_commands/SLU/transducer/decoder.py
Symbolic link
1
egs/fluent_speech_commands/SLU/transducer/decoder.py
Symbolic link
@ -0,0 +1 @@
|
|||||||
|
../../../yesno/ASR/transducer/decoder.py
|
||||||
@ -1,87 +0,0 @@
|
|||||||
# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang)
|
|
||||||
#
|
|
||||||
# See ../../../../LICENSE for clarification regarding multiple authors
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
|
|
||||||
from typing import Tuple

import torch
import torch.nn as nn
|
|
||||||
|
|
||||||
|
|
||||||
# We use a TDNN model as encoder, as it works very well with CTC training
|
|
||||||
# for this tiny dataset.
|
|
||||||
class Tdnn(nn.Module):
    """TDNN encoder: three unpadded, dilated 1-D convolutions plus a linear layer."""

    def __init__(self, num_features: int, output_dim: int):
        """
        Args:
          num_features:
            Model input dimension.
          output_dim:
            Model output dimension
        """
        super().__init__()

        # Note: We don't use paddings inside conv layers
        self.tdnn = nn.Sequential(
            nn.Conv1d(
                in_channels=num_features,
                out_channels=32,
                kernel_size=3,
            ),
            nn.ReLU(inplace=True),
            nn.BatchNorm1d(num_features=32, affine=False),
            nn.Conv1d(
                in_channels=32,
                out_channels=32,
                kernel_size=5,
                dilation=2,
            ),
            nn.ReLU(inplace=True),
            nn.BatchNorm1d(num_features=32, affine=False),
            nn.Conv1d(
                in_channels=32,
                out_channels=32,
                kernel_size=5,
                dilation=4,
            ),
            nn.ReLU(inplace=True),
            nn.BatchNorm1d(num_features=32, affine=False),
        )
        self.output_linear = nn.Linear(in_features=32, out_features=output_dim)

    def forward(
        self, x: torch.Tensor, x_lens: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Args:
          x:
            The input tensor with shape (N, T, C)
          x_lens:
            It contains the number of frames in each utterance in x
            before padding.

        Returns:
          Return a tuple with 2 tensors:

            - logits, a tensor of shape (N, T - 26, output_dim)
            - logit_lens, a tensor of shape (N,), equal to x_lens - 26
        """
        x = x.permute(0, 2, 1)  # (N, T, C) -> (N, C, T)
        x = self.tdnn(x)
        x = x.permute(0, 2, 1)  # (N, C, T) -> (N, T, C)
        logits = self.output_linear(x)

        # The convolutions are unpadded, so each one shortens T:
        #   the first conv layer reduces T by 3-1 frames
        #   the second conv layer reduces T by (5-1)*2 frames
        #   the third conv layer reduces T by (5-1)*4 frames
        # Total number of frames removed is 2 + 4*2 + 4*4 = 2 + 8 + 16 = 26
        x_lens = x_lens - 26
        return logits, x_lens
|
|
||||||
@ -1,43 +0,0 @@
|
|||||||
# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang)
|
|
||||||
#
|
|
||||||
# See ../../../../LICENSE for clarification regarding multiple authors
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
|
|
||||||
from typing import Tuple
|
|
||||||
|
|
||||||
import torch
|
|
||||||
import torch.nn as nn
|
|
||||||
|
|
||||||
|
|
||||||
class EncoderInterface(nn.Module):
    """Abstract encoder interface; subclasses must override `forward`."""

    def forward(
        self, x: torch.Tensor, x_lens: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Args:
          x:
            A tensor of shape (batch_size, input_seq_len, num_features)
            containing the input features.
          x_lens:
            A tensor of shape (batch_size,) containing the number of frames
            in `x` before padding.
        Returns:
          Return a tuple containing two tensors:
            - encoder_out, a tensor of (batch_size, out_seq_len, output_dim)
              containing unnormalized probabilities, i.e., the output of a
              linear layer.
            - encoder_out_lens, a tensor of shape (batch_size,) containing
              the number of frames in `encoder_out` before padding.
        Raises:
          NotImplementedError: always, in this base class.
        """
        raise NotImplementedError("Please implement it in a subclass")
|
|
||||||
1
egs/fluent_speech_commands/SLU/transducer/encoder_interface.py
Symbolic link
1
egs/fluent_speech_commands/SLU/transducer/encoder_interface.py
Symbolic link
@ -0,0 +1 @@
|
|||||||
|
../../../librispeech/ASR/transducer_stateless/encoder_interface.py
|
||||||
@ -1,55 +0,0 @@
|
|||||||
# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang)
|
|
||||||
#
|
|
||||||
# See ../../../../LICENSE for clarification regarding multiple authors
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
|
|
||||||
import torch
|
|
||||||
import torch.nn as nn
|
|
||||||
import torch.nn.functional as F
|
|
||||||
|
|
||||||
|
|
||||||
class Joiner(nn.Module):
    """Joint network: broadcast-add encoder and decoder outputs, ReLU, linear."""

    def __init__(self, input_dim: int, output_dim: int):
        """
        Args:
          input_dim:
            Input size of the output linear layer; the last dimension of
            the tensors given to `forward` must match it.
          output_dim:
            Output size of the linear layer.
        """
        super().__init__()

        self.output_linear = nn.Linear(input_dim, output_dim)

    def forward(
        self, encoder_out: torch.Tensor, decoder_out: torch.Tensor
    ) -> torch.Tensor:
        """
        Args:
          encoder_out:
            Output from the encoder. Its shape is (N, T, C).
          decoder_out:
            Output from the decoder. Its shape is (N, U, C).
        Returns:
          Return a tensor of shape (N, T, U, output_dim).
        """
        assert encoder_out.ndim == decoder_out.ndim == 3
        assert encoder_out.size(0) == decoder_out.size(0)
        assert encoder_out.size(2) == decoder_out.size(2)

        encoder_out = encoder_out.unsqueeze(2)
        # Now encoder_out is (N, T, 1, C)

        decoder_out = decoder_out.unsqueeze(1)
        # Now decoder_out is (N, 1, U, C)

        # Broadcasting over the singleton axes yields (N, T, U, C)
        logit = encoder_out + decoder_out
        logit = F.relu(logit)

        output = self.output_linear(logit)

        return output
|
|
||||||
1
egs/fluent_speech_commands/SLU/transducer/joiner.py
Symbolic link
1
egs/fluent_speech_commands/SLU/transducer/joiner.py
Symbolic link
@ -0,0 +1 @@
|
|||||||
|
../../../librispeech/ASR/transducer/joiner.py
|
||||||
@ -1,120 +0,0 @@
|
|||||||
# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang)
|
|
||||||
#
|
|
||||||
# See ../../../../LICENSE for clarification regarding multiple authors
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
|
|
||||||
"""
|
|
||||||
Note we use `rnnt_loss` from torchaudio, which exists only in
|
|
||||||
torchaudio >= v0.10.0. It also means you have to use torch >= v1.10.0
|
|
||||||
"""
|
|
||||||
import k2
|
|
||||||
import torch
|
|
||||||
import torch.nn as nn
|
|
||||||
import torchaudio
|
|
||||||
import torchaudio.functional
|
|
||||||
|
|
||||||
from icefall.utils import add_sos
|
|
||||||
|
|
||||||
# Fail at import time, with the installed version in the message, when
# torchaudio does not provide `rnnt_loss`.
assert hasattr(torchaudio.functional, "rnnt_loss"), (
    f"Current torchaudio version: {torchaudio.__version__}\n"
    "Please install a version >= 0.10.0"
)
|
|
||||||
|
|
||||||
|
|
||||||
class Transducer(nn.Module):
    """It implements https://arxiv.org/pdf/1211.3711.pdf
    "Sequence Transduction with Recurrent Neural Networks"
    """

    def __init__(
        self,
        encoder: nn.Module,
        decoder: nn.Module,
        joiner: nn.Module,
    ):
        """
        Args:
          encoder:
            It is the transcription network in the paper. It accepts
            two inputs: `x` of (N, T, C) and `x_lens` of shape (N,).
            It returns two tensors: `logits` of shape (N, T, C) and
            `logit_lens` of shape (N,).
          decoder:
            It is the prediction network in the paper. Its input shape
            is (N, U) and its output shape is (N, U, C). It should contain
            one attribute: `blank_id`.
          joiner:
            It has two inputs with shapes: (N, T, C) and (N, U, C). Its
            output shape is (N, T, U, C). Note that its output contains
            unnormalized probs, i.e., not processed by log-softmax.
        """
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.joiner = joiner

    def forward(
        self,
        x: torch.Tensor,
        x_lens: torch.Tensor,
        y: k2.RaggedTensor,
    ) -> torch.Tensor:
        """
        Args:
          x:
            A 3-D tensor of shape (N, T, C).
          x_lens:
            A 1-D tensor of shape (N,). It contains the number of frames in `x`
            before padding.
          y:
            A ragged tensor with 2 axes [utt][label]. It contains labels of each
            utterance.
        Returns:
          Return the transducer loss.
        """
        assert x.ndim == 3, x.shape
        assert x_lens.ndim == 1, x_lens.shape
        assert y.num_axes == 2, y.num_axes

        assert x.size(0) == x_lens.size(0) == y.dim0

        # From here on, x_lens holds the encoder output lengths.
        encoder_out, x_lens = self.encoder(x, x_lens)
        assert torch.all(x_lens > 0)

        # Now for the decoder, i.e., the prediction network
        row_splits = y.shape.row_splits(1)
        y_lens = row_splits[1:] - row_splits[:-1]

        # The blank symbol is used both as the start-of-sequence token and
        # as the padding value for the decoder input.
        blank_id = self.decoder.blank_id
        sos_y = add_sos(y, sos_id=blank_id)

        sos_y_padded = sos_y.pad(mode="constant", padding_value=blank_id)

        decoder_out, _ = self.decoder(sos_y_padded)

        logits = self.joiner(encoder_out, decoder_out)

        # rnnt_loss requires 0 padded targets
        y_padded = y.pad(mode="constant", padding_value=0)

        loss = torchaudio.functional.rnnt_loss(
            logits=logits,
            targets=y_padded,
            logit_lengths=x_lens,
            target_lengths=y_lens,
            blank=blank_id,
            reduction="mean",
        )

        return loss
|
|
||||||
1
egs/fluent_speech_commands/SLU/transducer/model.py
Symbolic link
1
egs/fluent_speech_commands/SLU/transducer/model.py
Symbolic link
@ -0,0 +1 @@
|
|||||||
|
../../../librispeech/ASR/transducer/model.py
|
||||||
@ -1,153 +0,0 @@
|
|||||||
# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang)
|
|
||||||
#
|
|
||||||
# See ../../../../LICENSE for clarification regarding multiple authors
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
|
|
||||||
|
|
||||||
import torch
|
|
||||||
import torch.nn as nn
|
|
||||||
|
|
||||||
|
|
||||||
class Conv2dSubsampling(nn.Module):
    """Convolutional 2D subsampling (to 1/4 length).

    Convert an input of shape (N, T, idim) to an output
    with shape (N, T', odim), where
    T' = ((T-1)//2 - 1)//2, which approximates T' == T//4

    It is based on
    https://github.com/espnet/espnet/blob/master/espnet/nets/pytorch_backend/transformer/subsampling.py  # noqa
    """

    def __init__(self, idim: int, odim: int) -> None:
        """
        Args:
          idim:
            Input dim. The input shape is (N, T, idim).
            Caution: It requires: T >=7, idim >=7
          odim:
            Output dim. The output shape is (N, ((T-1)//2 - 1)//2, odim)
        """
        assert idim >= 7
        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=odim, kernel_size=3, stride=2),
            nn.ReLU(),
            nn.Conv2d(in_channels=odim, out_channels=odim, kernel_size=3, stride=2),
            nn.ReLU(),
        )
        # The frequency axis shrinks from idim to ((idim-1)//2 - 1)//2 and is
        # flattened together with the odim channels before this projection.
        self.out = nn.Linear(odim * (((idim - 1) // 2 - 1) // 2), odim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Subsample x.

        Args:
          x:
            Its shape is (N, T, idim).

        Returns:
          Return a tensor of shape (N, ((T-1)//2 - 1)//2, odim)
        """
        # On entry, x is (N, T, idim)
        x = x.unsqueeze(1)  # (N, T, idim) -> (N, 1, T, idim) i.e., (N, C, H, W)
        x = self.conv(x)
        # Now x is of shape (N, odim, ((T-1)//2 - 1)//2, ((idim-1)//2 - 1)//2)
        b, c, t, f = x.size()
        x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f))
        # Now x is of shape (N, ((T-1)//2 - 1)//2, odim)
        return x
|
|
||||||
|
|
||||||
|
|
||||||
class VggSubsampling(nn.Module):
    """Trying to follow the setup described in the following paper:
    https://arxiv.org/pdf/1910.09799.pdf

    This paper is not 100% explicit so I am guessing to some extent,
    and trying to compare with other VGG implementations.

    Convert an input of shape (N, T, idim) to an output
    with shape (N, T', odim), where
    T' = ((T-1)//2 - 1)//2, which approximates T' = T//4
    """

    def __init__(self, idim: int, odim: int) -> None:
        """Construct a VggSubsampling object.

        This uses 2 VGG blocks with 2 Conv2d layers each,
        subsampling its input by a factor of 4 in the time dimensions.

        Args:
          idim:
            Input dim. The input shape is (N, T, idim).
            Caution: It requires: T >=7, idim >=7
          odim:
            Output dim. The output shape is (N, ((T-1)//2 - 1)//2, odim)
        """
        super().__init__()

        cur_channels = 1
        layers = []
        block_dims = [32, 64]

        # The decision to use padding=1 for the 1st convolution, then padding=0
        # for the 2nd and for the max-pooling, and ceil_mode=True, was driven by
        # a back-compatibility concern so that the number of frames at the
        # output would be equal to:
        #  (((T-1)//2)-1)//2.
        # We can consider changing this by using padding=1 on the
        # 2nd convolution, so the num-frames at the output would be T//4.
        for block_dim in block_dims:
            layers.append(
                torch.nn.Conv2d(
                    in_channels=cur_channels,
                    out_channels=block_dim,
                    kernel_size=3,
                    padding=1,
                    stride=1,
                )
            )
            layers.append(torch.nn.ReLU())
            # Note: the 2nd convolution of each block feeds the max-pooling
            # directly; there is no ReLU in between.
            layers.append(
                torch.nn.Conv2d(
                    in_channels=block_dim,
                    out_channels=block_dim,
                    kernel_size=3,
                    padding=0,
                    stride=1,
                )
            )
            layers.append(
                torch.nn.MaxPool2d(kernel_size=2, stride=2, padding=0, ceil_mode=True)
            )
            cur_channels = block_dim

        self.layers = nn.Sequential(*layers)

        self.out = nn.Linear(block_dims[-1] * (((idim - 1) // 2 - 1) // 2), odim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Subsample x.

        Args:
          x:
            Its shape is (N, T, idim).

        Returns:
          Return a tensor of shape (N, ((T-1)//2 - 1)//2, odim)
        """
        x = x.unsqueeze(1)  # (N, T, idim) -> (N, 1, T, idim)
        x = self.layers(x)
        b, c, t, f = x.size()
        # Merge the channel and frequency axes before the linear projection.
        x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f))
        return x
|
|
||||||
1
egs/fluent_speech_commands/SLU/transducer/subsampling.py
Symbolic link
1
egs/fluent_speech_commands/SLU/transducer/subsampling.py
Symbolic link
@ -0,0 +1 @@
|
|||||||
|
../../../librispeech/ASR/transducer_stateless/subsampling.py
|
||||||
1
egs/fluent_speech_commands/SLU/transducer/test_conformer.py
Symbolic link
1
egs/fluent_speech_commands/SLU/transducer/test_conformer.py
Symbolic link
@ -0,0 +1 @@
|
|||||||
|
../../../librispeech/ASR/transducer/test_conformer.py
|
||||||
@ -1,65 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang)
|
|
||||||
#
|
|
||||||
# See ../../../../LICENSE for clarification regarding multiple authors
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
|
|
||||||
"""
|
|
||||||
To run this file, do:
|
|
||||||
|
|
||||||
cd icefall/egs/yesno/ASR
|
|
||||||
python ./transducer/test_decoder.py
|
|
||||||
"""
|
|
||||||
|
|
||||||
import torch
|
|
||||||
from transducer.decoder import Decoder
|
|
||||||
|
|
||||||
|
|
||||||
def test_decoder():
    """Check output and state shapes of Decoder, with and without input states."""
    vocab_size = 3
    blank_id = 0
    embedding_dim = 128
    num_layers = 2
    hidden_dim = 6
    N = 3
    U = 5

    decoder = Decoder(
        vocab_size=vocab_size,
        embedding_dim=embedding_dim,
        blank_id=blank_id,
        num_layers=num_layers,
        hidden_dim=hidden_dim,
        embedding_dropout=0.0,
        rnn_dropout=0.0,
    )
    x = torch.randint(1, vocab_size, (N, U))
    rnn_out, (h, c) = decoder(x)

    assert rnn_out.shape == (N, U, hidden_dim)
    assert h.shape == (num_layers, N, hidden_dim)
    assert c.shape == (num_layers, N, hidden_dim)

    # Feed the returned states back in; shapes must not change.
    rnn_out, (h, c) = decoder(x, (h, c))
    assert rnn_out.shape == (N, U, hidden_dim)
    assert h.shape == (num_layers, N, hidden_dim)
    assert c.shape == (num_layers, N, hidden_dim)
|
|
||||||
|
|
||||||
|
|
||||||
def main():
    """Run the decoder test."""
    test_decoder()


if __name__ == "__main__":
    main()
|
|
||||||
1
egs/fluent_speech_commands/SLU/transducer/test_decoder.py
Symbolic link
1
egs/fluent_speech_commands/SLU/transducer/test_decoder.py
Symbolic link
@ -0,0 +1 @@
|
|||||||
|
../../../yesno/ASR/transducer/test_decoder.py
|
||||||
@ -1,47 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang)
|
|
||||||
#
|
|
||||||
# See ../../../../LICENSE for clarification regarding multiple authors
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
|
|
||||||
"""
|
|
||||||
To run this file, do:
|
|
||||||
|
|
||||||
cd icefall/egs/yesno/ASR
|
|
||||||
python ./transducer/test_encoder.py
|
|
||||||
"""
|
|
||||||
|
|
||||||
import torch
|
|
||||||
from transducer.encoder import Tdnn
|
|
||||||
|
|
||||||
|
|
||||||
def test_encoder():
    """Check that Tdnn shortens both the logits and the lengths by 26 frames."""
    input_dim = 10
    output_dim = 20
    encoder = Tdnn(input_dim, output_dim)
    N = 10
    T = 85
    x = torch.rand(N, T, input_dim)
    x_lens = torch.randint(low=30, high=T, size=(N,), dtype=torch.int32)
    logits, logit_lens = encoder(x, x_lens)
    assert logits.shape == (N, T - 26, output_dim)
    assert torch.all(torch.eq(x_lens - 26, logit_lens))
|
|
||||||
|
|
||||||
|
|
||||||
def main():
    """Run the encoder test."""
    test_encoder()


if __name__ == "__main__":
    main()
|
|
||||||
@ -1,50 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang)
|
|
||||||
#
|
|
||||||
# See ../../../../LICENSE for clarification regarding multiple authors
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
|
|
||||||
"""
|
|
||||||
To run this file, do:
|
|
||||||
|
|
||||||
cd icefall/egs/yesno/ASR
|
|
||||||
python ./transducer/test_joiner.py
|
|
||||||
"""
|
|
||||||
|
|
||||||
|
|
||||||
import torch
|
|
||||||
from transducer.joiner import Joiner
|
|
||||||
|
|
||||||
|
|
||||||
def test_joiner():
    """Check that Joiner returns a tensor of shape (N, T, U, output_dim)."""
    N = 2
    T = 3
    C = 4
    U = 5

    joiner = Joiner(C, 10)

    encoder_out = torch.rand(N, T, C)
    decoder_out = torch.rand(N, U, C)

    joint = joiner(encoder_out, decoder_out)
    assert joint.shape == (N, T, U, 10)
|
|
||||||
|
|
||||||
|
|
||||||
def main():
    """Run the joiner test."""
    test_joiner()


if __name__ == "__main__":
    main()
|
|
||||||
1
egs/fluent_speech_commands/SLU/transducer/test_joiner.py
Symbolic link
1
egs/fluent_speech_commands/SLU/transducer/test_joiner.py
Symbolic link
@ -0,0 +1 @@
|
|||||||
|
../../../librispeech/ASR/transducer/test_joiner.py
|
||||||
@ -1,77 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang)
|
|
||||||
#
|
|
||||||
# See ../../../../LICENSE for clarification regarding multiple authors
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
|
|
||||||
"""
|
|
||||||
To run this file, do:
|
|
||||||
|
|
||||||
cd icefall/egs/yesno/ASR
|
|
||||||
python ./transducer/test_transducer.py
|
|
||||||
"""
|
|
||||||
|
|
||||||
|
|
||||||
import k2
|
|
||||||
import torch
|
|
||||||
from transducer.decoder import Decoder
|
|
||||||
from transducer.encoder import Tdnn
|
|
||||||
from transducer.joiner import Joiner
|
|
||||||
from transducer.model import Transducer
|
|
||||||
|
|
||||||
|
|
||||||
def test_transducer():
    """Build a small Tdnn/Decoder/Joiner transducer and print its loss."""
    # encoder params
    input_dim = 10
    output_dim = 20

    # decoder params
    vocab_size = 3
    blank_id = 0
    embedding_dim = 128
    num_layers = 2

    encoder = Tdnn(input_dim, output_dim)

    decoder = Decoder(
        vocab_size=vocab_size,
        embedding_dim=embedding_dim,
        blank_id=blank_id,
        num_layers=num_layers,
        hidden_dim=output_dim,
        embedding_dropout=0.0,
        rnn_dropout=0.0,
    )

    joiner = Joiner(output_dim, vocab_size)
    transducer = Transducer(encoder=encoder, decoder=decoder, joiner=joiner)

    y = k2.RaggedTensor([[1, 2, 1], [1, 1, 1, 2, 1]])
    N = y.dim0
    T = 50

    x = torch.rand(N, T, input_dim)
    x_lens = torch.randint(low=30, high=T, size=(N,), dtype=torch.int32)
    # Make the first utterance span the full, unpadded length T.
    x_lens[0] = T

    loss = transducer(x, x_lens, y)
    print(loss)
|
|
||||||
|
|
||||||
|
|
||||||
def main():
    """Script entry point: run the transducer smoke test once."""
    test_transducer()


# Only run the test when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()
|
|
||||||
1
egs/fluent_speech_commands/SLU/transducer/test_transducer.py
Symbolic link
1
egs/fluent_speech_commands/SLU/transducer/test_transducer.py
Symbolic link
@ -0,0 +1 @@
|
|||||||
|
../../../librispeech/ASR/transducer/test_transducer.py
|
||||||
@ -33,7 +33,6 @@ from torch.nn.parallel import DistributedDataParallel as DDP
|
|||||||
from torch.nn.utils import clip_grad_norm_
|
from torch.nn.utils import clip_grad_norm_
|
||||||
# from torch.utils.tensorboard import SummaryWriter
|
# from torch.utils.tensorboard import SummaryWriter
|
||||||
from transducer.decoder import Decoder
|
from transducer.decoder import Decoder
|
||||||
from transducer.encoder import Tdnn
|
|
||||||
from transducer.conformer import Conformer
|
from transducer.conformer import Conformer
|
||||||
from transducer.joiner import Joiner
|
from transducer.joiner import Joiner
|
||||||
from transducer.model import Transducer
|
from transducer.model import Transducer
|
||||||
@ -492,10 +491,6 @@ def train_one_epoch(
|
|||||||
|
|
||||||
|
|
||||||
def get_transducer_model(params: AttributeDict):
|
def get_transducer_model(params: AttributeDict):
|
||||||
# encoder = Tdnn(
|
|
||||||
# num_features=params.feature_dim,
|
|
||||||
# output_dim=params.hidden_dim,
|
|
||||||
# )
|
|
||||||
encoder = Conformer(
|
encoder = Conformer(
|
||||||
num_features=params.feature_dim,
|
num_features=params.feature_dim,
|
||||||
output_dim=params.hidden_dim,
|
output_dim=params.hidden_dim,
|
||||||
|
|||||||
@ -1,416 +0,0 @@
|
|||||||
# Copyright 2021 University of Chinese Academy of Sciences (author: Han Zhu)
|
|
||||||
#
|
|
||||||
# See ../../../../LICENSE for clarification regarding multiple authors
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
|
|
||||||
|
|
||||||
import math
|
|
||||||
from typing import Optional, Tuple
|
|
||||||
|
|
||||||
import torch
|
|
||||||
import torch.nn as nn
|
|
||||||
from transducer.encoder_interface import EncoderInterface
|
|
||||||
from transducer.subsampling import Conv2dSubsampling, VggSubsampling
|
|
||||||
|
|
||||||
from icefall.utils import make_pad_mask
|
|
||||||
|
|
||||||
|
|
||||||
class Transformer(EncoderInterface):
    """Transformer encoder with a convolutional subsampling frontend.

    The input features are subsampled in time and embedded to ``d_model``,
    positional encodings are added, the sequence is passed through a stack
    of ``TransformerEncoderLayer`` modules, and finally a dropout + linear
    layer projects every frame to ``output_dim``.
    """

    def __init__(
        self,
        num_features: int,
        output_dim: int,
        subsampling_factor: int = 4,
        d_model: int = 256,
        nhead: int = 4,
        dim_feedforward: int = 2048,
        num_encoder_layers: int = 12,
        dropout: float = 0.1,
        normalize_before: bool = True,
        vgg_frontend: bool = False,
    ) -> None:
        """
        Args:
          num_features:
            Dimension of the input features.
          output_dim:
            Dimension of the output of this module.
          subsampling_factor:
            Time reduction factor of the frontend. Only 4 is supported;
            any other value raises NotImplementedError.
          d_model:
            Attention (embedding) dimension.
          nhead:
            Number of attention heads; d_model must be divisible by nhead.
          dim_feedforward:
            Hidden dimension of the feedforward sub-layer of each encoder
            layer.
          num_encoder_layers:
            Number of encoder layers.
          dropout:
            Dropout probability used in the encoder.
          normalize_before:
            True selects pre-layer-norm (and adds a final LayerNorm after
            the stack); False selects post-layer-norm.
          vgg_frontend:
            True to subsample with VggSubsampling, False to use
            Conv2dSubsampling.
        """
        super().__init__()

        if subsampling_factor != 4:
            raise NotImplementedError("Support only 'subsampling_factor=4'.")

        self.num_features = num_features
        self.output_dim = output_dim
        self.subsampling_factor = subsampling_factor

        # The frontend maps (N, T, num_features) to (N, T', d_model): it
        # shortens the time axis and embeds the features in one step.
        frontend_cls = VggSubsampling if vgg_frontend else Conv2dSubsampling
        self.encoder_embed = frontend_cls(num_features, d_model)

        self.encoder_pos = PositionalEncoding(d_model, dropout)

        layer = TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            normalize_before=normalize_before,
        )

        # With pre-norm layers the stack output is un-normalized, so a
        # closing LayerNorm is appended; post-norm layers do not need one.
        final_norm = nn.LayerNorm(d_model) if normalize_before else None

        self.encoder = nn.TransformerEncoder(
            encoder_layer=layer,
            num_layers=num_encoder_layers,
            norm=final_norm,
        )

        # TODO(fangjun): remove dropout
        self.encoder_output_layer = nn.Sequential(
            nn.Dropout(p=dropout),
            nn.Linear(d_model, output_dim),
        )

    def forward(
        self, x: torch.Tensor, x_lens: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Encode a batch of padded feature sequences.

        Args:
          x:
            Input features of shape (batch_size, seq_len, feature_dim).
          x_lens:
            Tensor of shape (batch_size,) with the number of valid frames
            of each sequence in `x` before padding.
        Returns:
          A tuple of two tensors:
            - logits, of shape (batch_size, output_seq_len, output_dim)
            - logit_lens, of shape (batch_size,), the number of valid
              frames of each sequence in `logits` before padding.
        """
        embedded = self.encoder_pos(self.encoder_embed(x))
        embedded = embedded.permute(1, 0, 2)  # (N, T, C) -> (T, N, C)

        # Caution: this length formula assumes a subsampling factor of 4
        # (two successive reductions by roughly a factor of 2).
        lengths = ((x_lens - 1) // 2 - 1) // 2
        assert embedded.size(0) == lengths.max().item()

        padding_mask = make_pad_mask(lengths)
        encoded = self.encoder(
            embedded, src_key_padding_mask=padding_mask
        )  # (T, N, C)

        projected = self.encoder_output_layer(encoded)
        logits = projected.permute(1, 0, 2)  # (T, N, C) -> (N, T, C)

        return logits, lengths
|
|
||||||
|
|
||||||
|
|
||||||
class TransformerEncoderLayer(nn.Module):
    """
    A single transformer encoder layer, adapted from
    torch.nn.TransformerEncoderLayer with an extra `normalize_before`
    switch that selects between pre-layer-norm and post-layer-norm.

    Args:
      d_model:
        number of features of the input (required).
      nhead:
        number of heads of the multi-head attention (required).
      dim_feedforward:
        hidden dimension of the feedforward network (default=2048).
      dropout:
        dropout probability applied in the feedforward network and on
        both residual branches (default=0.1). The attention module itself
        is created with dropout 0.0.
      activation:
        activation of the feedforward hidden layer, "relu" or "gelu"
        (default="relu").
      normalize_before:
        True applies layer norm before each sub-block, False after it.

    Examples::
        >>> encoder_layer = TransformerEncoderLayer(d_model=512, nhead=8)
        >>> src = torch.rand(10, 32, 512)
        >>> out = encoder_layer(src)
    """

    def __init__(
        self,
        d_model: int,
        nhead: int,
        dim_feedforward: int = 2048,
        dropout: float = 0.1,
        activation: str = "relu",
        normalize_before: bool = True,
    ) -> None:
        super().__init__()

        # Self-attention sub-block.
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=0.0)

        # Position-wise feedforward sub-block.
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        # One layer norm and one residual dropout per sub-block.
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

        self.activation = _get_activation_fn(activation)
        self.normalize_before = normalize_before

    def __setstate__(self, state):
        # States pickled without an "activation" entry fall back to relu.
        state.setdefault("activation", nn.functional.relu)
        super().__setstate__(state)

    def forward(
        self,
        src: torch.Tensor,
        src_mask: Optional[torch.Tensor] = None,
        src_key_padding_mask: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """
        Run the layer on one batch.

        Args:
          src: input sequence of the layer (required).
          src_mask: attention mask for the sequence (optional).
          src_key_padding_mask: per-batch padding mask for the keys
            (optional).

        Shape:
          src: (S, N, E).
          src_mask: (S, S).
          src_key_padding_mask: (N, S).
          S is the sequence length, N the batch size and E the feature
          dimension. The returned tensor has the same shape as src.
        """
        pre_norm = self.normalize_before

        # --- self-attention with residual connection ---
        attn_in = self.norm1(src) if pre_norm else src
        attn_out = self.self_attn(
            attn_in,
            attn_in,
            attn_in,
            attn_mask=src_mask,
            key_padding_mask=src_key_padding_mask,
        )[0]
        src = src + self.dropout1(attn_out)
        if not pre_norm:
            src = self.norm1(src)

        # --- feedforward with residual connection ---
        ff_in = self.norm2(src) if pre_norm else src
        hidden = self.activation(self.linear1(ff_in))
        ff_out = self.linear2(self.dropout(hidden))
        src = src + self.dropout2(ff_out)
        if not pre_norm:
            src = self.norm2(src)

        return src
|
|
||||||
|
|
||||||
|
|
||||||
def _get_activation_fn(activation: str):
    """Map an activation name to the matching torch functional.

    Args:
      activation:
        Either "relu" or "gelu".
    Returns:
      nn.functional.relu or nn.functional.gelu.
    Raises:
      RuntimeError: if `activation` is neither "relu" nor "gelu".
    """
    if activation == "relu":
        return nn.functional.relu

    if activation == "gelu":
        return nn.functional.gelu

    message = "activation should be relu/gelu, not {}".format(activation)
    raise RuntimeError(message)
|
|
||||||
|
|
||||||
|
|
||||||
class PositionalEncoding(nn.Module):
    """This class implements the positional encoding
    proposed in the following paper:

    - Attention Is All You Need: https://arxiv.org/pdf/1706.03762.pdf

        PE(pos, 2i) = sin(pos / (10000^(2i/d_model)))
        PE(pos, 2i+1) = cos(pos / (10000^(2i/d_model)))

    Note::

        1 / (10000^(2i/d_model)) = exp(-log(10000^(2i/d_model)))
                                 = exp(-1 * 2i / d_model * log(10000))
                                 = exp(2i * -(log(10000) / d_model))
    """

    def __init__(self, d_model: int, dropout: float = 0.1) -> None:
        """
        Args:
          d_model:
            Embedding dimension. Both even and odd values are supported.
          dropout:
            Dropout probability to be applied to the output of this module.
        """
        super().__init__()
        self.d_model = d_model
        self.xscale = math.sqrt(self.d_model)
        self.dropout = nn.Dropout(p=dropout)
        # not doing: self.pe = None because of errors thrown by torchscript
        # Start with an empty table (zero time steps); it is grown lazily
        # by extend_pe() on the first forward call.
        self.pe = torch.zeros(1, 0, self.d_model, dtype=torch.float32)

    def extend_pe(self, x: torch.Tensor) -> None:
        """Extend the time axis of the positional encoding if required.

        The shape of `self.pe` is (1, T1, d_model). The shape of the input x
        is (N, T, d_model). If T > T1, `self.pe` is recomputed with shape
        (1, T, d_model). Otherwise the existing table is kept. In both cases
        `self.pe` ends up with the dtype and device of `x`.

        Args:
          x:
            It is a tensor of shape (N, T, C).
        Returns:
          Return None.
        """
        num_frames = x.size(1)
        if self.pe is not None and self.pe.size(1) >= num_frames:
            # The cached table is long enough; only follow x's dtype/device.
            self.pe = self.pe.to(dtype=x.dtype, device=x.device)
            return

        pe = torch.zeros(num_frames, self.d_model, dtype=torch.float32)
        position = torch.arange(0, num_frames, dtype=torch.float32).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, self.d_model, 2, dtype=torch.float32)
            * -(math.log(10000.0) / self.d_model)
        )
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer cosine column than sine column,
        # so drop the surplus frequency. For even d_model the slice is a
        # no-op and the table is identical to the unsliced computation.
        pe[:, 1::2] = torch.cos(angles[:, : self.d_model // 2])
        pe = pe.unsqueeze(0)
        # Now pe is of shape (1, T, d_model), where T is x.size(1)
        self.pe = pe.to(device=x.device, dtype=x.dtype)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Scale the input by sqrt(d_model), add the positional encoding and
        apply dropout.

        Args:
          x:
            Its shape is (N, T, C)

        Returns:
          Return a tensor of shape (N, T, C)
        """
        self.extend_pe(x)
        # The cached table may be longer than x; use only the first T rows.
        x = x * self.xscale + self.pe[:, : x.size(1), :]
        return self.dropout(x)
|
|
||||||
|
|
||||||
|
|
||||||
class Noam(object):
    """
    Implements Noam optimizer: Adam driven by a warmup-then-decay learning
    rate schedule.

    Proposed in
    "Attention Is All You Need", https://arxiv.org/pdf/1706.03762.pdf

    Modified from
    https://github.com/espnet/espnet/blob/master/espnet/nets/pytorch_backend/transformer/optimizer.py  # noqa

    The learning rate at step `s` is::

        factor * model_size^(-0.5) * min(s^(-0.5), s * warm_step^(-1.5))

    Args:
      params:
        iterable of parameters to optimize or dicts defining parameter groups
      model_size:
        attention dimension of the transformer model
      factor:
        learning rate factor
      warm_step:
        warmup steps
      weight_decay:
        weight decay passed on to the wrapped Adam optimizer
    """

    def __init__(
        self,
        params,
        model_size: int = 256,
        factor: float = 10.0,
        warm_step: int = 25000,
        weight_decay=0,
    ) -> None:
        """Construct an Noam object."""
        # lr starts at 0 and is overwritten on every call to step().
        self.optimizer = torch.optim.Adam(
            params, lr=0, betas=(0.9, 0.98), eps=1e-9, weight_decay=weight_decay
        )
        self._step = 0
        self.warmup = warm_step
        self.factor = factor
        self.model_size = model_size
        self._rate = 0

    @property
    def param_groups(self):
        """Return param_groups."""
        return self.optimizer.param_groups

    def step(self):
        """Update parameters and rate."""
        self._step += 1
        rate = self.rate()
        for group in self.optimizer.param_groups:
            group["lr"] = rate
        self._rate = rate
        self.optimizer.step()

    def rate(self, step=None):
        """Return the learning rate of the schedule at `step`.

        Args:
          step:
            Step to evaluate; defaults to the number of step() calls made
            so far.
        Returns:
          The learning rate given by the class-level formula, or 0.0 for a
          non-positive step (i.e. before the first update), where the
          formula is undefined.
        """
        if step is None:
            step = self._step
        if step <= 0:
            # step ** (-0.5) would raise ZeroDivisionError at step 0.
            return 0.0
        return (
            self.factor
            * self.model_size ** (-0.5)
            * min(step ** (-0.5), step * self.warmup ** (-1.5))
        )

    def zero_grad(self):
        """Reset gradient."""
        self.optimizer.zero_grad()

    def state_dict(self):
        """Return state_dict."""
        return {
            "_step": self._step,
            "warmup": self.warmup,
            "factor": self.factor,
            "model_size": self.model_size,
            "_rate": self._rate,
            "optimizer": self.optimizer.state_dict(),
        }

    def load_state_dict(self, state_dict):
        """Load state_dict."""
        for key, value in state_dict.items():
            if key == "optimizer":
                # The wrapped Adam restores its own state.
                self.optimizer.load_state_dict(value)
            else:
                # Every other entry is a plain attribute of this wrapper.
                setattr(self, key, value)
|
|
||||||
1
egs/fluent_speech_commands/SLU/transducer/transformer.py
Symbolic link
1
egs/fluent_speech_commands/SLU/transducer/transformer.py
Symbolic link
@ -0,0 +1 @@
|
|||||||
|
../../../librispeech/ASR/transducer_stateless/transformer.py
|
||||||
Loading…
x
Reference in New Issue
Block a user