Use symlinks whenever possible

Signed-off-by: Xinyuan Li <xli257@b17.clsp.jhu.edu>

Author: Xinyuan Li
Date: 2024-01-23 23:21:37 -05:00
Commit: 8dc1ca194d (parent: d725bad4fd)
15 changed files with 11 additions and 2656 deletions
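
The commit replaces per-recipe copies of shared modules with relative symlinks into the librispeech and yesno recipes. As an illustrative sketch (hypothetical paths; the actual links are recorded directly in git), such a link can be created like this:

import os

# Hypothetical example: replace a copied conformer.py with a relative
# symlink. The target is stored relative to the link's own directory,
# which keeps the checkout relocatable.
target = "../../../librispeech/ASR/transducer_stateless/conformer.py"
os.symlink(target, "conformer.py")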

File diff suppressed because it is too large.

@@ -0,0 +1 @@
../../../librispeech/ASR/transducer_stateless/conformer.py

@@ -25,7 +25,6 @@ import torch.nn as nn
 from transducer.slu_datamodule import SluDataModule
 from transducer.beam_search import greedy_search
 from transducer.decoder import Decoder
-from transducer.encoder import Tdnn
 from transducer.conformer import Conformer
 from transducer.joiner import Joiner
 from transducer.model import Transducer

@@ -1,92 +0,0 @@
# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Optional, Tuple
import torch
import torch.nn as nn
class Decoder(nn.Module):
def __init__(
self,
vocab_size: int,
embedding_dim: int,
blank_id: int,
num_layers: int,
hidden_dim: int,
embedding_dropout: float = 0.0,
rnn_dropout: float = 0.0,
):
"""
Args:
vocab_size:
Number of tokens of the modeling unit.
embedding_dim:
Dimension of the input embedding.
blank_id:
The ID of the blank symbol.
num_layers:
Number of RNN layers.
hidden_dim:
Hidden dimension of RNN layers.
embedding_dropout:
Dropout rate for the embedding layer.
rnn_dropout:
Dropout for RNN layers.
"""
super().__init__()
self.embedding = nn.Embedding(
num_embeddings=vocab_size,
embedding_dim=embedding_dim,
padding_idx=blank_id,
)
self.embedding_dropout = nn.Dropout(embedding_dropout)
self.rnn = nn.LSTM(
input_size=embedding_dim,
hidden_size=hidden_dim,
num_layers=num_layers,
batch_first=True,
dropout=rnn_dropout,
)
self.blank_id = blank_id
self.output_linear = nn.Linear(hidden_dim, hidden_dim)
def forward(
self,
y: torch.Tensor,
states: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
"""
Args:
y:
A 2-D tensor of shape (N, U).
states:
A tuple of two tensors containing the states information of
RNN layers in this decoder.
Returns:
Return a tuple containing:
- rnn_output, a tensor of shape (N, U, C)
- (h, c), which contain the state information for RNN layers.
Both are of shape (num_layers, N, C)
"""
embedding_out = self.embedding(y)
embedding_out = self.embedding_dropout(embedding_out)
rnn_out, (h, c) = self.rnn(embedding_out, states)
out = self.output_linear(rnn_out)
return out, (h, c)

@@ -0,0 +1 @@
../../../yesno/ASR/transducer/decoder.py

@@ -1,87 +0,0 @@
# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Tuple
import torch
import torch.nn as nn
# We use a TDNN model as encoder, as it works very well with CTC training
# for this tiny dataset.
class Tdnn(nn.Module):
def __init__(self, num_features: int, output_dim: int):
"""
Args:
num_features:
Model input dimension.
output_dim:
Model output dimension.
"""
super().__init__()
# Note: We don't use paddings inside conv layers
self.tdnn = nn.Sequential(
nn.Conv1d(
in_channels=num_features,
out_channels=32,
kernel_size=3,
),
nn.ReLU(inplace=True),
nn.BatchNorm1d(num_features=32, affine=False),
nn.Conv1d(
in_channels=32,
out_channels=32,
kernel_size=5,
dilation=2,
),
nn.ReLU(inplace=True),
nn.BatchNorm1d(num_features=32, affine=False),
nn.Conv1d(
in_channels=32,
out_channels=32,
kernel_size=5,
dilation=4,
),
nn.ReLU(inplace=True),
nn.BatchNorm1d(num_features=32, affine=False),
)
self.output_linear = nn.Linear(in_features=32, out_features=output_dim)
def forward(
self, x: torch.Tensor, x_lens: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor]:
"""
Args:
x:
The input tensor with shape (N, T, C)
x_lens:
It contains the number of frames in each utterance in x
before padding.
Returns:
Return a tuple with 2 tensors:
- logits, a tensor of shape (N, T - 26, output_dim)
- logit_lens, a tensor of shape (N,)
"""
x = x.permute(0, 2, 1) # (N, T, C) -> (N, C, T)
x = self.tdnn(x)
x = x.permute(0, 2, 1) # (N, C, T) -> (N, T, C)
logits = self.output_linear(x)
# the first conv layer reduces T by (3-1)*1 frames
# the second conv layer reduces T by (5-1)*2 frames
# the third conv layer reduces T by (5-1)*4 frames
# Total reduction in T is 2 + 4*2 + 4*4 = 2 + 8 + 16 = 26 frames
x_lens = x_lens - 26
return logits, x_lens
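
# Editorial sanity check (not part of the deleted file): for a stride-1
# Conv1d without padding, the output length is T - dilation * (kernel_size - 1),
# so the three layers above shorten T by:
reductions = [(3 - 1) * 1, (5 - 1) * 2, (5 - 1) * 4]
assert sum(reductions) == 26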

@@ -1,43 +0,0 @@
# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Tuple
import torch
import torch.nn as nn
class EncoderInterface(nn.Module):
def forward(
self, x: torch.Tensor, x_lens: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor]:
"""
Args:
x:
A tensor of shape (batch_size, input_seq_len, num_features)
containing the input features.
x_lens:
A tensor of shape (batch_size,) containing the number of frames
in `x` before padding.
Returns:
Return a tuple containing two tensors:
- encoder_out, a tensor of (batch_size, out_seq_len, output_dim)
containing unnormalized probabilities, i.e., the output of a
linear layer.
- encoder_out_lens, a tensor of shape (batch_size,) containing
the number of frames in `encoder_out` before padding.
"""
raise NotImplementedError("Please implement it in a subclass")

@@ -0,0 +1 @@
../../../librispeech/ASR/transducer_stateless/encoder_interface.py

@@ -1,55 +0,0 @@
# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import torch.nn as nn
import torch.nn.functional as F
class Joiner(nn.Module):
def __init__(self, input_dim: int, output_dim: int):
super().__init__()
self.output_linear = nn.Linear(input_dim, output_dim)
def forward(
self, encoder_out: torch.Tensor, decoder_out: torch.Tensor
) -> torch.Tensor:
"""
Args:
encoder_out:
Output from the encoder. Its shape is (N, T, C).
decoder_out:
Output from the decoder. Its shape is (N, U, C).
Returns:
Return a tensor of shape (N, T, U, C).
"""
assert encoder_out.ndim == decoder_out.ndim == 3
assert encoder_out.size(0) == decoder_out.size(0)
assert encoder_out.size(2) == decoder_out.size(2)
encoder_out = encoder_out.unsqueeze(2)
# Now encoder_out is (N, T, 1, C)
decoder_out = decoder_out.unsqueeze(1)
# Now decoder_out is (N, 1, U, C)
logit = encoder_out + decoder_out
logit = F.relu(logit)
output = self.output_linear(logit)
return output
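
# Editorial sketch (not part of the deleted file): the unsqueeze-and-add
# in forward() relies on broadcasting to produce the (N, T, U, C) lattice.
import torch
N, T, U, C = 2, 3, 5, 4
e = torch.rand(N, T, C).unsqueeze(2)  # (N, T, 1, C)
d = torch.rand(N, U, C).unsqueeze(1)  # (N, 1, U, C)
assert (e + d).shape == (N, T, U, C)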

@@ -0,0 +1 @@
../../../librispeech/ASR/transducer/joiner.py

@@ -1,120 +0,0 @@
# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Note we use `rnnt_loss` from torchaudio, which exists only in
torchaudio >= v0.10.0. It also means you have to use torch >= v1.10.0
"""
import k2
import torch
import torch.nn as nn
import torchaudio
import torchaudio.functional
from icefall.utils import add_sos
assert hasattr(torchaudio.functional, "rnnt_loss"), (
f"Current torchaudio version: {torchaudio.__version__}\n"
"Please install a version >= 0.10.0"
)
class Transducer(nn.Module):
"""It implements https://arxiv.org/pdf/1211.3711.pdf
"Sequence Transduction with Recurrent Neural Networks"
"""
def __init__(
self,
encoder: nn.Module,
decoder: nn.Module,
joiner: nn.Module,
):
"""
Args:
encoder:
It is the transcription network in the paper. It accepts
two inputs: `x` of shape (N, T, C) and `x_lens` of shape (N,).
It returns two tensors: `logits` of shape (N, T, C) and
`logit_lens` of shape (N,).
decoder:
It is the prediction network in the paper. Its input shape
is (N, U) and its output shape is (N, U, C). It should contain
one attribute: `blank_id`.
joiner:
It has two inputs with shapes: (N, T, C) and (N, U, C). Its
output shape is (N, T, U, C). Note that its output contains
unnormalized probs, i.e., not processed by log-softmax.
"""
super().__init__()
self.encoder = encoder
self.decoder = decoder
self.joiner = joiner
def forward(
self,
x: torch.Tensor,
x_lens: torch.Tensor,
y: k2.RaggedTensor,
) -> torch.Tensor:
"""
Args:
x:
A 3-D tensor of shape (N, T, C).
x_lens:
A 1-D tensor of shape (N,). It contains the number of frames in `x`
before padding.
y:
A ragged tensor with 2 axes [utt][label]. It contains labels of each
utterance.
Returns:
Return the transducer loss.
"""
assert x.ndim == 3, x.shape
assert x_lens.ndim == 1, x_lens.shape
assert y.num_axes == 2, y.num_axes
assert x.size(0) == x_lens.size(0) == y.dim0
encoder_out, x_lens = self.encoder(x, x_lens)
assert torch.all(x_lens > 0)
# Now for the decoder, i.e., the prediction network
row_splits = y.shape.row_splits(1)
y_lens = row_splits[1:] - row_splits[:-1]
blank_id = self.decoder.blank_id
sos_y = add_sos(y, sos_id=blank_id)
sos_y_padded = sos_y.pad(mode="constant", padding_value=blank_id)
decoder_out, _ = self.decoder(sos_y_padded)
logits = self.joiner(encoder_out, decoder_out)
# rnnt_loss requires 0 padded targets
y_padded = y.pad(mode="constant", padding_value=0)
loss = torchaudio.functional.rnnt_loss(
logits=logits,
targets=y_padded,
logit_lengths=x_lens,
target_lengths=y_lens,
blank=blank_id,
reduction="mean",
)
return loss
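
# Editorial note (not part of the deleted file): per the torchaudio >= 0.10
# documentation, the shapes passed to torchaudio.functional.rnnt_loss are:
#   logits:         (N, T, U + 1, vocab_size) -- U + 1 because the decoder
#                   consumes the sos-prefixed label sequence
#   targets:        (N, U), zero-padded
#   logit_lengths:  (N,), valid frames per utterance
#   target_lengths: (N,), valid labels per utterance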

@@ -0,0 +1 @@
../../../librispeech/ASR/transducer/model.py

@@ -1,153 +0,0 @@
# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import torch.nn as nn
class Conv2dSubsampling(nn.Module):
"""Convolutional 2D subsampling (to 1/4 length).
Convert an input of shape (N, T, idim) to an output
with shape (N, T', odim), where
T' = ((T-1)//2 - 1)//2, which approximates T' == T//4
It is based on
https://github.com/espnet/espnet/blob/master/espnet/nets/pytorch_backend/transformer/subsampling.py # noqa
"""
def __init__(self, idim: int, odim: int) -> None:
"""
Args:
idim:
Input dim. The input shape is (N, T, idim).
Caution: It requires: T >=7, idim >=7
odim:
Output dim. The output shape is (N, ((T-1)//2 - 1)//2, odim)
"""
assert idim >= 7
super().__init__()
self.conv = nn.Sequential(
nn.Conv2d(in_channels=1, out_channels=odim, kernel_size=3, stride=2),
nn.ReLU(),
nn.Conv2d(in_channels=odim, out_channels=odim, kernel_size=3, stride=2),
nn.ReLU(),
)
self.out = nn.Linear(odim * (((idim - 1) // 2 - 1) // 2), odim)
def forward(self, x: torch.Tensor) -> torch.Tensor:
"""Subsample x.
Args:
x:
Its shape is (N, T, idim).
Returns:
Return a tensor of shape (N, ((T-1)//2 - 1)//2, odim)
"""
# On entry, x is (N, T, idim)
x = x.unsqueeze(1) # (N, T, idim) -> (N, 1, T, idim) i.e., (N, C, H, W)
x = self.conv(x)
# Now x is of shape (N, odim, ((T-1)//2 - 1)//2, ((idim-1)//2 - 1)//2)
b, c, t, f = x.size()
x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f))
# Now x is of shape (N, ((T-1)//2 - 1)//2, odim)
return x
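# Editorial note (not part of the deleted file): each Conv2d above uses
# kernel_size=3, stride=2, padding=0, so a length L maps to
# (L - 3) // 2 + 1 == (L - 1) // 2; applying it twice gives the
# docstring's T' = ((T - 1) // 2 - 1) // 2, e.g.:
#     for T in range(7, 100):
#         t1 = (T - 3) // 2 + 1
#         t2 = (t1 - 3) // 2 + 1
#         assert t1 == (T - 1) // 2
#         assert t2 == ((T - 1) // 2 - 1) // 2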
class VggSubsampling(nn.Module):
"""Trying to follow the setup described in the following paper:
https://arxiv.org/pdf/1910.09799.pdf
This paper is not 100% explicit so I am guessing to some extent,
and trying to compare with other VGG implementations.
Convert an input of shape (N, T, idim) to an output
with shape (N, T', odim), where
T' = ((T-1)//2 - 1)//2, which approximates T' = T//4
"""
def __init__(self, idim: int, odim: int) -> None:
"""Construct a VggSubsampling object.
This uses 2 VGG blocks with 2 Conv2d layers each,
subsampling its input by a factor of 4 in the time dimension.
Args:
idim:
Input dim. The input shape is (N, T, idim).
Caution: It requires: T >=7, idim >=7
odim:
Output dim. The output shape is (N, ((T-1)//2 - 1)//2, odim)
"""
super().__init__()
cur_channels = 1
layers = []
block_dims = [32, 64]
# The decision to use padding=1 for the 1st convolution, then padding=0
# for the 2nd and for the max-pooling, and ceil_mode=True, was driven by
# a back-compatibility concern so that the number of frames at the
# output would be equal to:
# (((T-1)//2)-1)//2.
# We can consider changing this by using padding=1 on the
# 2nd convolution, so the num-frames at the output would be T//4.
for block_dim in block_dims:
layers.append(
torch.nn.Conv2d(
in_channels=cur_channels,
out_channels=block_dim,
kernel_size=3,
padding=1,
stride=1,
)
)
layers.append(torch.nn.ReLU())
layers.append(
torch.nn.Conv2d(
in_channels=block_dim,
out_channels=block_dim,
kernel_size=3,
padding=0,
stride=1,
)
)
layers.append(
torch.nn.MaxPool2d(kernel_size=2, stride=2, padding=0, ceil_mode=True)
)
cur_channels = block_dim
self.layers = nn.Sequential(*layers)
self.out = nn.Linear(block_dims[-1] * (((idim - 1) // 2 - 1) // 2), odim)
def forward(self, x: torch.Tensor) -> torch.Tensor:
"""Subsample x.
Args:
x:
Its shape is (N, T, idim).
Returns:
Return a tensor of shape (N, ((T-1)//2 - 1)//2, odim)
"""
x = x.unsqueeze(1)
x = self.layers(x)
b, c, t, f = x.size()
x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f))
return x

@@ -0,0 +1 @@
../../../librispeech/ASR/transducer_stateless/subsampling.py

@@ -0,0 +1 @@
../../../librispeech/ASR/transducer/test_conformer.py

@@ -1,65 +0,0 @@
#!/usr/bin/env python3
# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
To run this file, do:
cd icefall/egs/yesno/ASR
python ./transducer/test_decoder.py
"""
import torch
from transducer.decoder import Decoder
def test_decoder():
vocab_size = 3
blank_id = 0
embedding_dim = 128
num_layers = 2
hidden_dim = 6
N = 3
U = 5
decoder = Decoder(
vocab_size=vocab_size,
embedding_dim=embedding_dim,
blank_id=blank_id,
num_layers=num_layers,
hidden_dim=hidden_dim,
embedding_dropout=0.0,
rnn_dropout=0.0,
)
x = torch.randint(1, vocab_size, (N, U))
rnn_out, (h, c) = decoder(x)
assert rnn_out.shape == (N, U, hidden_dim)
assert h.shape == (num_layers, N, hidden_dim)
assert c.shape == (num_layers, N, hidden_dim)
rnn_out, (h, c) = decoder(x, (h, c))
assert rnn_out.shape == (N, U, hidden_dim)
assert h.shape == (num_layers, N, hidden_dim)
assert c.shape == (num_layers, N, hidden_dim)
def main():
test_decoder()
if __name__ == "__main__":
main()

@@ -0,0 +1 @@
../../../yesno/ASR/transducer/test_decoder.py

@@ -1,47 +0,0 @@
#!/usr/bin/env python3
# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
To run this file, do:
cd icefall/egs/yesno/ASR
python ./transducer/test_encoder.py
"""
import torch
from transducer.encoder import Tdnn
def test_encoder():
input_dim = 10
output_dim = 20
encoder = Tdnn(input_dim, output_dim)
N = 10
T = 85
x = torch.rand(N, T, input_dim)
x_lens = torch.randint(low=30, high=T, size=(N,), dtype=torch.int32)
logits, logit_lens = encoder(x, x_lens)
assert logits.shape == (N, T - 26, output_dim)
assert torch.all(torch.eq(x_lens - 26, logit_lens))
def main():
test_encoder()
if __name__ == "__main__":
main()

@@ -1,50 +0,0 @@
#!/usr/bin/env python3
# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
To run this file, do:
cd icefall/egs/yesno/ASR
python ./transducer/test_joiner.py
"""
import torch
from transducer.joiner import Joiner
def test_joiner():
N = 2
T = 3
C = 4
U = 5
joiner = Joiner(C, 10)
encoder_out = torch.rand(N, T, C)
decoder_out = torch.rand(N, U, C)
joint = joiner(encoder_out, decoder_out)
assert joint.shape == (N, T, U, 10)
def main():
test_joiner()
if __name__ == "__main__":
main()

@@ -0,0 +1 @@
../../../librispeech/ASR/transducer/test_joiner.py

@@ -1,77 +0,0 @@
#!/usr/bin/env python3
# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
To run this file, do:
cd icefall/egs/yesno/ASR
python ./transducer/test_transducer.py
"""
import k2
import torch
from transducer.decoder import Decoder
from transducer.encoder import Tdnn
from transducer.joiner import Joiner
from transducer.model import Transducer
def test_transducer():
# encoder params
input_dim = 10
output_dim = 20
# decoder params
vocab_size = 3
blank_id = 0
embedding_dim = 128
num_layers = 2
encoder = Tdnn(input_dim, output_dim)
decoder = Decoder(
vocab_size=vocab_size,
embedding_dim=embedding_dim,
blank_id=blank_id,
num_layers=num_layers,
hidden_dim=output_dim,
embedding_dropout=0.0,
rnn_dropout=0.0,
)
joiner = Joiner(output_dim, vocab_size)
transducer = Transducer(encoder=encoder, decoder=decoder, joiner=joiner)
y = k2.RaggedTensor([[1, 2, 1], [1, 1, 1, 2, 1]])
N = y.dim0
T = 50
x = torch.rand(N, T, input_dim)
x_lens = torch.randint(low=30, high=T, size=(N,), dtype=torch.int32)
x_lens[0] = T
loss = transducer(x, x_lens, y)
print(loss)
def main():
test_transducer()
if __name__ == "__main__":
main()

@@ -0,0 +1 @@
../../../librispeech/ASR/transducer/test_transducer.py

@@ -33,7 +33,6 @@ from torch.nn.parallel import DistributedDataParallel as DDP
 from torch.nn.utils import clip_grad_norm_
 # from torch.utils.tensorboard import SummaryWriter
 from transducer.decoder import Decoder
-from transducer.encoder import Tdnn
 from transducer.conformer import Conformer
 from transducer.joiner import Joiner
 from transducer.model import Transducer
@@ -492,10 +491,6 @@ def train_one_epoch(
 def get_transducer_model(params: AttributeDict):
-    # encoder = Tdnn(
-    #     num_features=params.feature_dim,
-    #     output_dim=params.hidden_dim,
-    # )
     encoder = Conformer(
         num_features=params.feature_dim,
         output_dim=params.hidden_dim,

@@ -1,416 +0,0 @@
# Copyright 2021 University of Chinese Academy of Sciences (author: Han Zhu)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
from typing import Optional, Tuple
import torch
import torch.nn as nn
from transducer.encoder_interface import EncoderInterface
from transducer.subsampling import Conv2dSubsampling, VggSubsampling
from icefall.utils import make_pad_mask
class Transformer(EncoderInterface):
def __init__(
self,
num_features: int,
output_dim: int,
subsampling_factor: int = 4,
d_model: int = 256,
nhead: int = 4,
dim_feedforward: int = 2048,
num_encoder_layers: int = 12,
dropout: float = 0.1,
normalize_before: bool = True,
vgg_frontend: bool = False,
) -> None:
"""
Args:
num_features:
The input dimension of the model.
output_dim:
The output dimension of the model.
subsampling_factor:
Number of output frames is num_in_frames // subsampling_factor.
Currently, subsampling_factor MUST be 4.
d_model:
Attention dimension.
nhead:
Number of heads in multi-head attention.
Must satisfy d_model % nhead == 0.
dim_feedforward:
The output dimension of the feedforward layers in encoder.
num_encoder_layers:
Number of encoder layers.
dropout:
Dropout in encoder.
normalize_before:
If True, use pre-layer norm; False to use post-layer norm.
vgg_frontend:
True to use vgg style frontend for subsampling.
"""
super().__init__()
self.num_features = num_features
self.output_dim = output_dim
self.subsampling_factor = subsampling_factor
if subsampling_factor != 4:
raise NotImplementedError("Support only 'subsampling_factor=4'.")
# self.encoder_embed converts the input of shape (N, T, num_features)
# to the shape (N, T//subsampling_factor, d_model).
# That is, it does two things simultaneously:
# (1) subsampling: T -> T//subsampling_factor
# (2) embedding: num_features -> d_model
if vgg_frontend:
self.encoder_embed = VggSubsampling(num_features, d_model)
else:
self.encoder_embed = Conv2dSubsampling(num_features, d_model)
self.encoder_pos = PositionalEncoding(d_model, dropout)
encoder_layer = TransformerEncoderLayer(
d_model=d_model,
nhead=nhead,
dim_feedforward=dim_feedforward,
dropout=dropout,
normalize_before=normalize_before,
)
if normalize_before:
encoder_norm = nn.LayerNorm(d_model)
else:
encoder_norm = None
self.encoder = nn.TransformerEncoder(
encoder_layer=encoder_layer,
num_layers=num_encoder_layers,
norm=encoder_norm,
)
# TODO(fangjun): remove dropout
self.encoder_output_layer = nn.Sequential(
nn.Dropout(p=dropout), nn.Linear(d_model, output_dim)
)
def forward(
self, x: torch.Tensor, x_lens: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor]:
"""
Args:
x:
The input tensor. Its shape is (batch_size, seq_len, feature_dim).
x_lens:
A tensor of shape (batch_size,) containing the number of frames in
`x` before padding.
Returns:
Return a tuple containing 2 tensors:
- logits, its shape is (batch_size, output_seq_len, output_dim)
- logit_lens, a tensor of shape (batch_size,) containing the number
of frames in `logits` before padding.
"""
x = self.encoder_embed(x)
x = self.encoder_pos(x)
x = x.permute(1, 0, 2) # (N, T, C) -> (T, N, C)
# Caution: We assume the subsampling factor is 4!
lengths = ((x_lens - 1) // 2 - 1) // 2
assert x.size(0) == lengths.max().item()
mask = make_pad_mask(lengths)
x = self.encoder(x, src_key_padding_mask=mask) # (T, N, C)
logits = self.encoder_output_layer(x)
logits = logits.permute(1, 0, 2)  # (T, N, C) -> (N, T, C)
return logits, lengths
class TransformerEncoderLayer(nn.Module):
"""
Modified from torch.nn.TransformerEncoderLayer.
Add support of normalize_before,
i.e., use layer_norm before the first block.
Args:
d_model:
the number of expected features in the input (required).
nhead:
the number of heads in the multiheadattention models (required).
dim_feedforward:
the dimension of the feedforward network model (default=2048).
dropout:
the dropout value (default=0.1).
activation:
the activation function of intermediate layer, relu or
gelu (default=relu).
normalize_before:
whether to use layer_norm before the first block.
Examples::
>>> encoder_layer = TransformerEncoderLayer(d_model=512, nhead=8)
>>> src = torch.rand(10, 32, 512)
>>> out = encoder_layer(src)
"""
def __init__(
self,
d_model: int,
nhead: int,
dim_feedforward: int = 2048,
dropout: float = 0.1,
activation: str = "relu",
normalize_before: bool = True,
) -> None:
super(TransformerEncoderLayer, self).__init__()
self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=0.0)
# Implementation of Feedforward model
self.linear1 = nn.Linear(d_model, dim_feedforward)
self.dropout = nn.Dropout(dropout)
self.linear2 = nn.Linear(dim_feedforward, d_model)
self.norm1 = nn.LayerNorm(d_model)
self.norm2 = nn.LayerNorm(d_model)
self.dropout1 = nn.Dropout(dropout)
self.dropout2 = nn.Dropout(dropout)
self.activation = _get_activation_fn(activation)
self.normalize_before = normalize_before
def __setstate__(self, state):
if "activation" not in state:
state["activation"] = nn.functional.relu
super(TransformerEncoderLayer, self).__setstate__(state)
def forward(
self,
src: torch.Tensor,
src_mask: Optional[torch.Tensor] = None,
src_key_padding_mask: Optional[torch.Tensor] = None,
) -> torch.Tensor:
"""
Pass the input through the encoder layer.
Args:
src: the sequence to the encoder layer (required).
src_mask: the mask for the src sequence (optional).
src_key_padding_mask: the mask for the src keys per batch (optional)
Shape:
src: (S, N, E).
src_mask: (S, S).
src_key_padding_mask: (N, S).
S is the source sequence length, T is the target sequence length,
N is the batch size, E is the feature number
"""
residual = src
if self.normalize_before:
src = self.norm1(src)
src2 = self.self_attn(
src,
src,
src,
attn_mask=src_mask,
key_padding_mask=src_key_padding_mask,
)[0]
src = residual + self.dropout1(src2)
if not self.normalize_before:
src = self.norm1(src)
residual = src
if self.normalize_before:
src = self.norm2(src)
src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))
src = residual + self.dropout2(src2)
if not self.normalize_before:
src = self.norm2(src)
return src
def _get_activation_fn(activation: str):
if activation == "relu":
return nn.functional.relu
elif activation == "gelu":
return nn.functional.gelu
raise RuntimeError("activation should be relu/gelu, not {}".format(activation))
class PositionalEncoding(nn.Module):
"""This class implements the positional encoding
proposed in the following paper:
- Attention Is All You Need: https://arxiv.org/pdf/1706.03762.pdf
PE(pos, 2i) = sin(pos / (10000^(2i/d_model)))
PE(pos, 2i+1) = cos(pos / (10000^(2i/d_model)))
Note::
1 / (10000^(2i/d_model)) = exp(-log(10000^(2i/d_model)))
= exp(-(2i / d_model) * log(10000))
= exp(2i * -(log(10000) / d_model))
"""
def __init__(self, d_model: int, dropout: float = 0.1) -> None:
"""
Args:
d_model:
Embedding dimension.
dropout:
Dropout probability to be applied to the output of this module.
"""
super().__init__()
self.d_model = d_model
self.xscale = math.sqrt(self.d_model)
self.dropout = nn.Dropout(p=dropout)
# not doing: self.pe = None because of errors thrown by torchscript
self.pe = torch.zeros(1, 0, self.d_model, dtype=torch.float32)
def extend_pe(self, x: torch.Tensor) -> None:
"""Extend the time t in the positional encoding if required.
The shape of `self.pe` is (1, T1, d_model). The shape of the input x
is (N, T, d_model). If T > T1, then we change the shape of self.pe
to (1, T, d_model). Otherwise, nothing is done.
Args:
x:
It is a tensor of shape (N, T, C).
Returns:
Return None.
"""
if self.pe is not None:
if self.pe.size(1) >= x.size(1):
self.pe = self.pe.to(dtype=x.dtype, device=x.device)
return
pe = torch.zeros(x.size(1), self.d_model, dtype=torch.float32)
position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1)
div_term = torch.exp(
torch.arange(0, self.d_model, 2, dtype=torch.float32)
* -(math.log(10000.0) / self.d_model)
)
pe[:, 0::2] = torch.sin(position * div_term)
pe[:, 1::2] = torch.cos(position * div_term)
pe = pe.unsqueeze(0)
# Now pe is of shape (1, T, d_model), where T is x.size(1)
self.pe = pe.to(device=x.device, dtype=x.dtype)
def forward(self, x: torch.Tensor) -> torch.Tensor:
"""
Add positional encoding.
Args:
x:
Its shape is (N, T, C)
Returns:
Return a tensor of shape (N, T, C)
"""
self.extend_pe(x)
x = x * self.xscale + self.pe[:, : x.size(1), :]
return self.dropout(x)
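# Editorial check (not part of the deleted file) of the identity in the
# PositionalEncoding docstring, which is exactly how div_term is built
# in extend_pe above:
#     import math
#     d_model, two_i = 8, 4
#     a = 1.0 / (10000.0 ** (two_i / d_model))
#     b = math.exp(two_i * -(math.log(10000.0) / d_model))
#     assert abs(a - b) < 1e-12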
class Noam(object):
"""
Implements Noam optimizer.
Proposed in
"Attention Is All You Need", https://arxiv.org/pdf/1706.03762.pdf
Modified from
https://github.com/espnet/espnet/blob/master/espnet/nets/pytorch_backend/transformer/optimizer.py # noqa
Args:
params:
iterable of parameters to optimize or dicts defining parameter groups
model_size:
attention dimension of the transformer model
factor:
learning rate factor
warm_step:
warmup steps
"""
def __init__(
self,
params,
model_size: int = 256,
factor: float = 10.0,
warm_step: int = 25000,
weight_decay=0,
) -> None:
"""Construct a Noam object."""
self.optimizer = torch.optim.Adam(
params, lr=0, betas=(0.9, 0.98), eps=1e-9, weight_decay=weight_decay
)
self._step = 0
self.warmup = warm_step
self.factor = factor
self.model_size = model_size
self._rate = 0
@property
def param_groups(self):
"""Return param_groups."""
return self.optimizer.param_groups
def step(self):
"""Update parameters and rate."""
self._step += 1
rate = self.rate()
for p in self.optimizer.param_groups:
p["lr"] = rate
self._rate = rate
self.optimizer.step()
def rate(self, step=None):
"""Implement `lrate` above."""
if step is None:
step = self._step
return (
self.factor
* self.model_size ** (-0.5)
* min(step ** (-0.5), step * self.warmup ** (-1.5))
)
def zero_grad(self):
"""Reset gradient."""
self.optimizer.zero_grad()
def state_dict(self):
"""Return state_dict."""
return {
"_step": self._step,
"warmup": self.warmup,
"factor": self.factor,
"model_size": self.model_size,
"_rate": self._rate,
"optimizer": self.optimizer.state_dict(),
}
def load_state_dict(self, state_dict):
"""Load state_dict."""
for key, value in state_dict.items():
if key == "optimizer":
self.optimizer.load_state_dict(state_dict["optimizer"])
else:
setattr(self, key, value)
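
# Editorial sketch (not part of the deleted file): the Noam schedule rises
# linearly for warm_step steps, then decays as step**-0.5; the two branches
# of the min() meet at step == warm_step, where the rate peaks.
factor, model_size, warm_step = 10.0, 256, 25000

def noam_rate(step: int) -> float:
    return factor * model_size ** (-0.5) * min(step ** (-0.5), step * warm_step ** (-1.5))

assert noam_rate(warm_step) >= noam_rate(warm_step - 1)
assert noam_rate(warm_step) >= noam_rate(warm_step + 1)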

@@ -0,0 +1 @@
../../../librispeech/ASR/transducer_stateless/transformer.py