Use symlinks whenever possible

Signed-off-by: Xinyuan Li <xli257@b17.clsp.jhu.edu>
2024-01-23 23:21:37 -05:00 · 2024-01-23 23:21:37 -05:00 · 8dc1ca194d
commit 8dc1ca194d
parent d725bad4fd
15 changed files with 11 additions and 2656 deletions
--- a/egs/fluent_speech_commands/SLU/transducer/conformer.py
+++ b/egs/fluent_speech_commands/SLU/transducer/conformer.py
--- a/egs/fluent_speech_commands/SLU/transducer/conformer.py
+++ b/egs/fluent_speech_commands/SLU/transducer/conformer.py
@@ -0,0 +1 @@
+../../../librispeech/ASR/transducer_stateless/conformer.py
--- a/egs/fluent_speech_commands/SLU/transducer/decode.py
+++ b/egs/fluent_speech_commands/SLU/transducer/decode.py
@@ -25,7 +25,6 @@ import torch.nn as nn
 from transducer.slu_datamodule import SluDataModule
 from transducer.beam_search import greedy_search
 from transducer.decoder import Decoder
-from transducer.encoder import Tdnn
 from transducer.conformer import Conformer
 from transducer.joiner import Joiner
 from transducer.model import Transducer
--- a/egs/fluent_speech_commands/SLU/transducer/decoder.py
+++ b/egs/fluent_speech_commands/SLU/transducer/decoder.py
@@ -1,92 +0,0 @@
-# Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from typing import Optional, Tuple
-
-import torch
-import torch.nn as nn
-
-
-class Decoder(nn.Module):
-    def __init__(
-        self,
-        vocab_size: int,
-        embedding_dim: int,
-        blank_id: int,
-        num_layers: int,
-        hidden_dim: int,
-        embedding_dropout: float = 0.0,
-        rnn_dropout: float = 0.0,
-    ):
-        """
-        Args:
-          vocab_size:
-            Number of tokens of the modeling unit.
-          embedding_dim:
-            Dimension of the input embedding.
-          blank_id:
-            The ID of the blank symbol.
-          num_layers:
-            Number of RNN layers.
-          hidden_dim:
-            Hidden dimension of RNN layers.
-          embedding_dropout:
-            Dropout rate for the embedding layer.
-          rnn_dropout:
-            Dropout for RNN layers.
-        """
-        super().__init__()
-        self.embedding = nn.Embedding(
-            num_embeddings=vocab_size,
-            embedding_dim=embedding_dim,
-            padding_idx=blank_id,
-        )
-        self.embedding_dropout = nn.Dropout(embedding_dropout)
-        self.rnn = nn.LSTM(
-            input_size=embedding_dim,
-            hidden_size=hidden_dim,
-            num_layers=num_layers,
-            batch_first=True,
-            dropout=rnn_dropout,
-        )
-        self.blank_id = blank_id
-        self.output_linear = nn.Linear(hidden_dim, hidden_dim)
-
-    def forward(
-        self,
-        y: torch.Tensor,
-        states: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
-    ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
-        """
-        Args:
-          y:
-            A 2-D tensor of shape (N, U).
-          states:
-            A tuple of two tensors containing the states information of
-            RNN layers in this decoder.
-        Returns:
-          Return a tuple containing:
-
-            - rnn_output, a tensor of shape (N, U, C)
-            - (h, c), which contain the state information for RNN layers.
-              Both are of shape (num_layers, N, C)
-        """
-        embedding_out = self.embedding(y)
-        embedding_out = self.embedding_dropout(embedding_out)
-        rnn_out, (h, c) = self.rnn(embedding_out, states)
-        out = self.output_linear(rnn_out)
-
-        return out, (h, c)
--- a/egs/fluent_speech_commands/SLU/transducer/decoder.py
+++ b/egs/fluent_speech_commands/SLU/transducer/decoder.py
@@ -0,0 +1 @@
+../../../yesno/ASR/transducer/decoder.py
--- a/egs/fluent_speech_commands/SLU/transducer/encoder.py
+++ b/egs/fluent_speech_commands/SLU/transducer/encoder.py
@@ -1,87 +0,0 @@
-# Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import torch.nn as nn
-
-
-# We use a TDNN model as encoder, as it works very well with CTC training
-# for this tiny dataset.
-class Tdnn(nn.Module):
-    def __init__(self, num_features: int, output_dim: int):
-        """
-        Args:
-          num_features:
-            Model input dimension.
-          ouput_dim:
-            Model output dimension
-        """
-        super().__init__()
-
-        # Note: We don't use paddings inside conv layers
-        self.tdnn = nn.Sequential(
-            nn.Conv1d(
-                in_channels=num_features,
-                out_channels=32,
-                kernel_size=3,
-            ),
-            nn.ReLU(inplace=True),
-            nn.BatchNorm1d(num_features=32, affine=False),
-            nn.Conv1d(
-                in_channels=32,
-                out_channels=32,
-                kernel_size=5,
-                dilation=2,
-            ),
-            nn.ReLU(inplace=True),
-            nn.BatchNorm1d(num_features=32, affine=False),
-            nn.Conv1d(
-                in_channels=32,
-                out_channels=32,
-                kernel_size=5,
-                dilation=4,
-            ),
-            nn.ReLU(inplace=True),
-            nn.BatchNorm1d(num_features=32, affine=False),
-        )
-        self.output_linear = nn.Linear(in_features=32, out_features=output_dim)
-
-    def forward(self, x: torch.Tensor, x_lens: torch.Tensor) -> torch.Tensor:
-        """
-        Args:
-          x:
-            The input tensor with shape (N, T, C)
-          x_lens:
-            It contains the number of frames in each utterance in x
-            before padding.
-
-        Returns:
-          Return a tuple with 2 tensors:
-
-            - logits, a tensor of shape (N, T, C)
-            - logit_lens, a tensor of shape (N,)
-        """
-        x = x.permute(0, 2, 1)  # (N, T, C) -> (N, C, T)
-        x = self.tdnn(x)
-        x = x.permute(0, 2, 1)  # (N, C, T) -> (N, T, C)
-        logits = self.output_linear(x)
-
-        # the first conv layer reduces T by 3-1 frames
-        # the second layer reduces T by (5-1)*2 frames
-        # the second layer reduces T by (5-1)*4 frames
-        # Number of output frames is 2 + 4*2 + 4*4 = 2 + 8 + 16 = 26
-        x_lens = x_lens - 26
-        return logits, x_lens
--- a/egs/fluent_speech_commands/SLU/transducer/encoder_interface.py
+++ b/egs/fluent_speech_commands/SLU/transducer/encoder_interface.py
@@ -1,43 +0,0 @@
-# Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from typing import Tuple
-
-import torch
-import torch.nn as nn
-
-
-class EncoderInterface(nn.Module):
-    def forward(
-        self, x: torch.Tensor, x_lens: torch.Tensor
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        """
-        Args:
-          x:
-            A tensor of shape (batch_size, input_seq_len, num_features)
-            containing the input features.
-          x_lens:
-            A tensor of shape (batch_size,) containing the number of frames
-            in `x` before padding.
-        Returns:
-          Return a tuple containing two tensors:
-            - encoder_out, a tensor of (batch_size, out_seq_len, output_dim)
-              containing unnormalized probabilities, i.e., the output of a
-              linear layer.
-            - encoder_out_lens, a tensor of shape (batch_size,) containing
-              the number of frames in `encoder_out` before padding.
-        """
-        raise NotImplementedError("Please implement it in a subclass")
--- a/egs/fluent_speech_commands/SLU/transducer/encoder_interface.py
+++ b/egs/fluent_speech_commands/SLU/transducer/encoder_interface.py
@@ -0,0 +1 @@
+../../../librispeech/ASR/transducer_stateless/encoder_interface.py
--- a/egs/fluent_speech_commands/SLU/transducer/joiner.py
+++ b/egs/fluent_speech_commands/SLU/transducer/joiner.py
@@ -1,55 +0,0 @@
-# Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-
-class Joiner(nn.Module):
-    def __init__(self, input_dim: int, output_dim: int):
-        super().__init__()
-
-        self.output_linear = nn.Linear(input_dim, output_dim)
-
-    def forward(
-        self, encoder_out: torch.Tensor, decoder_out: torch.Tensor
-    ) -> torch.Tensor:
-        """
-        Args:
-          encoder_out:
-            Output from the encoder. Its shape is (N, T, C).
-          decoder_out:
-            Output from the decoder. Its shape is (N, U, C).
-        Returns:
-          Return a tensor of shape (N, T, U, C).
-        """
-        assert encoder_out.ndim == decoder_out.ndim == 3
-        assert encoder_out.size(0) == decoder_out.size(0)
-        assert encoder_out.size(2) == decoder_out.size(2)
-
-        encoder_out = encoder_out.unsqueeze(2)
-        # Now encoder_out is (N, T, 1, C)
-
-        decoder_out = decoder_out.unsqueeze(1)
-        # Now decoder_out is (N, 1, U, C)
-
-        logit = encoder_out + decoder_out
-        logit = F.relu(logit)
-
-        output = self.output_linear(logit)
-
-        return output
--- a/egs/fluent_speech_commands/SLU/transducer/joiner.py
+++ b/egs/fluent_speech_commands/SLU/transducer/joiner.py
@@ -0,0 +1 @@
+../../../librispeech/ASR/transducer/joiner.py
--- a/egs/fluent_speech_commands/SLU/transducer/model.py
+++ b/egs/fluent_speech_commands/SLU/transducer/model.py
@@ -1,120 +0,0 @@
-# Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""
-Note we use `rnnt_loss` from torchaudio, which exists only in
-torchaudio >= v0.10.0. It also means you have to use torch >= v1.10.0
-"""
-import k2
-import torch
-import torch.nn as nn
-import torchaudio
-import torchaudio.functional
-
-from icefall.utils import add_sos
-
-assert hasattr(torchaudio.functional, "rnnt_loss"), (
-    f"Current torchaudio version: {torchaudio.__version__}\n"
-    "Please install a version >= 0.10.0"
-)
-
-
-class Transducer(nn.Module):
-    """It implements https://arxiv.org/pdf/1211.3711.pdf
-    "Sequence Transduction with Recurrent Neural Networks"
-    """
-
-    def __init__(
-        self,
-        encoder: nn.Module,
-        decoder: nn.Module,
-        joiner: nn.Module,
-    ):
-        """
-        Args:
-          encoder:
-            It is the transcription network in the paper. Its accepts
-            two inputs: `x` of (N, T, C) and `x_lens` of shape (N,).
-            It returns two tensors: `logits` of shape (N, T, C) and
-            `logit_lens` of shape (N,).
-          decoder:
-            It is the prediction network in the paper. Its input shape
-            is (N, U) and its output shape is (N, U, C). It should contain
-            one attribute: `blank_id`.
-          joiner:
-            It has two inputs with shapes: (N, T, C) and (N, U, C). Its
-            output shape is (N, T, U, C). Note that its output contains
-            unnormalized probs, i.e., not processed by log-softmax.
-        """
-        super().__init__()
-        self.encoder = encoder
-        self.decoder = decoder
-        self.joiner = joiner
-
-    def forward(
-        self,
-        x: torch.Tensor,
-        x_lens: torch.Tensor,
-        y: k2.RaggedTensor,
-    ) -> torch.Tensor:
-        """
-        Args:
-          x:
-            A 3-D tensor of shape (N, T, C).
-          x_lens:
-            A 1-D tensor of shape (N,). It contains the number of frames in `x`
-            before padding.
-          y:
-            A ragged tensor with 2 axes [utt][label]. It contains labels of each
-            utterance.
-        Returns:
-          Return the transducer loss.
-        """
-        assert x.ndim == 3, x.shape
-        assert x_lens.ndim == 1, x_lens.shape
-        assert y.num_axes == 2, y.num_axes
-
-        assert x.size(0) == x_lens.size(0) == y.dim0
-
-        encoder_out, x_lens = self.encoder(x, x_lens)
-        assert torch.all(x_lens > 0)
-
-        # Now for the decoder, i.e., the prediction network
-        row_splits = y.shape.row_splits(1)
-        y_lens = row_splits[1:] - row_splits[:-1]
-
-        blank_id = self.decoder.blank_id
-        sos_y = add_sos(y, sos_id=blank_id)
-
-        sos_y_padded = sos_y.pad(mode="constant", padding_value=blank_id)
-
-        decoder_out, _ = self.decoder(sos_y_padded)
-
-        logits = self.joiner(encoder_out, decoder_out)
-
-        # rnnt_loss requires 0 padded targets
-        y_padded = y.pad(mode="constant", padding_value=0)
-
-        loss = torchaudio.functional.rnnt_loss(
-            logits=logits,
-            targets=y_padded,
-            logit_lengths=x_lens,
-            target_lengths=y_lens,
-            blank=blank_id,
-            reduction="mean",
-        )
-
-        return loss
--- a/egs/fluent_speech_commands/SLU/transducer/model.py
+++ b/egs/fluent_speech_commands/SLU/transducer/model.py
@@ -0,0 +1 @@
+../../../librispeech/ASR/transducer/model.py
--- a/egs/fluent_speech_commands/SLU/transducer/subsampling.py
+++ b/egs/fluent_speech_commands/SLU/transducer/subsampling.py
@@ -1,153 +0,0 @@
-# Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import torch
-import torch.nn as nn
-
-
-class Conv2dSubsampling(nn.Module):
-    """Convolutional 2D subsampling (to 1/4 length).
-
-    Convert an input of shape (N, T, idim) to an output
-    with shape (N, T', odim), where
-    T' = ((T-1)//2 - 1)//2, which approximates T' == T//4
-
-    It is based on
-    https://github.com/espnet/espnet/blob/master/espnet/nets/pytorch_backend/transformer/subsampling.py  # noqa
-    """
-
-    def __init__(self, idim: int, odim: int) -> None:
-        """
-        Args:
-          idim:
-            Input dim. The input shape is (N, T, idim).
-            Caution: It requires: T >=7, idim >=7
-          odim:
-            Output dim. The output shape is (N, ((T-1)//2 - 1)//2, odim)
-        """
-        assert idim >= 7
-        super().__init__()
-        self.conv = nn.Sequential(
-            nn.Conv2d(in_channels=1, out_channels=odim, kernel_size=3, stride=2),
-            nn.ReLU(),
-            nn.Conv2d(in_channels=odim, out_channels=odim, kernel_size=3, stride=2),
-            nn.ReLU(),
-        )
-        self.out = nn.Linear(odim * (((idim - 1) // 2 - 1) // 2), odim)
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        """Subsample x.
-
-        Args:
-          x:
-            Its shape is (N, T, idim).
-
-        Returns:
-          Return a tensor of shape (N, ((T-1)//2 - 1)//2, odim)
-        """
-        # On entry, x is (N, T, idim)
-        x = x.unsqueeze(1)  # (N, T, idim) -> (N, 1, T, idim) i.e., (N, C, H, W)
-        x = self.conv(x)
-        # Now x is of shape (N, odim, ((T-1)//2 - 1)//2, ((idim-1)//2 - 1)//2)
-        b, c, t, f = x.size()
-        x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f))
-        # Now x is of shape (N, ((T-1)//2 - 1))//2, odim)
-        return x
-
-
-class VggSubsampling(nn.Module):
-    """Trying to follow the setup described in the following paper:
-    https://arxiv.org/pdf/1910.09799.pdf
-
-    This paper is not 100% explicit so I am guessing to some extent,
-    and trying to compare with other VGG implementations.
-
-    Convert an input of shape (N, T, idim) to an output
-    with shape (N, T', odim), where
-    T' = ((T-1)//2 - 1)//2, which approximates T' = T//4
-    """
-
-    def __init__(self, idim: int, odim: int) -> None:
-        """Construct a VggSubsampling object.
-
-        This uses 2 VGG blocks with 2 Conv2d layers each,
-        subsampling its input by a factor of 4 in the time dimensions.
-
-        Args:
-          idim:
-            Input dim. The input shape is (N, T, idim).
-            Caution: It requires: T >=7, idim >=7
-          odim:
-            Output dim. The output shape is (N, ((T-1)//2 - 1)//2, odim)
-        """
-        super().__init__()
-
-        cur_channels = 1
-        layers = []
-        block_dims = [32, 64]
-
-        # The decision to use padding=1 for the 1st convolution, then padding=0
-        # for the 2nd and for the max-pooling, and ceil_mode=True, was driven by
-        # a back-compatibility concern so that the number of frames at the
-        # output would be equal to:
-        #  (((T-1)//2)-1)//2.
-        # We can consider changing this by using padding=1 on the
-        # 2nd convolution, so the num-frames at the output would be T//4.
-        for block_dim in block_dims:
-            layers.append(
-                torch.nn.Conv2d(
-                    in_channels=cur_channels,
-                    out_channels=block_dim,
-                    kernel_size=3,
-                    padding=1,
-                    stride=1,
-                )
-            )
-            layers.append(torch.nn.ReLU())
-            layers.append(
-                torch.nn.Conv2d(
-                    in_channels=block_dim,
-                    out_channels=block_dim,
-                    kernel_size=3,
-                    padding=0,
-                    stride=1,
-                )
-            )
-            layers.append(
-                torch.nn.MaxPool2d(kernel_size=2, stride=2, padding=0, ceil_mode=True)
-            )
-            cur_channels = block_dim
-
-        self.layers = nn.Sequential(*layers)
-
-        self.out = nn.Linear(block_dims[-1] * (((idim - 1) // 2 - 1) // 2), odim)
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        """Subsample x.
-
-        Args:
-          x:
-            Its shape is (N, T, idim).
-
-        Returns:
-          Return a tensor of shape (N, ((T-1)//2 - 1)//2, odim)
-        """
-        x = x.unsqueeze(1)
-        x = self.layers(x)
-        b, c, t, f = x.size()
-        x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f))
-        return x
--- a/egs/fluent_speech_commands/SLU/transducer/subsampling.py
+++ b/egs/fluent_speech_commands/SLU/transducer/subsampling.py
@@ -0,0 +1 @@
+../../../librispeech/ASR/transducer_stateless/subsampling.py
--- a/egs/fluent_speech_commands/SLU/transducer/test_conformer.py
+++ b/egs/fluent_speech_commands/SLU/transducer/test_conformer.py
@@ -0,0 +1 @@
+../../../librispeech/ASR/transducer/test_conformer.py
--- a/egs/fluent_speech_commands/SLU/transducer/test_decoder.py
+++ b/egs/fluent_speech_commands/SLU/transducer/test_decoder.py
@@ -1,65 +0,0 @@
-#!/usr/bin/env python3
-# Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""
-To run this file, do:
-
-    cd icefall/egs/yesno/ASR
-    python ./transducer/test_decoder.py
-"""
-
-import torch
-from transducer.decoder import Decoder
-
-
-def test_decoder():
-    vocab_size = 3
-    blank_id = 0
-    embedding_dim = 128
-    num_layers = 2
-    hidden_dim = 6
-    N = 3
-    U = 5
-
-    decoder = Decoder(
-        vocab_size=vocab_size,
-        embedding_dim=embedding_dim,
-        blank_id=blank_id,
-        num_layers=num_layers,
-        hidden_dim=hidden_dim,
-        embedding_dropout=0.0,
-        rnn_dropout=0.0,
-    )
-    x = torch.randint(1, vocab_size, (N, U))
-    rnn_out, (h, c) = decoder(x)
-
-    assert rnn_out.shape == (N, U, hidden_dim)
-    assert h.shape == (num_layers, N, hidden_dim)
-    assert c.shape == (num_layers, N, hidden_dim)
-
-    rnn_out, (h, c) = decoder(x, (h, c))
-    assert rnn_out.shape == (N, U, hidden_dim)
-    assert h.shape == (num_layers, N, hidden_dim)
-    assert c.shape == (num_layers, N, hidden_dim)
-
-
-def main():
-    test_decoder()
-
-
-if __name__ == "__main__":
-    main()
--- a/egs/fluent_speech_commands/SLU/transducer/test_decoder.py
+++ b/egs/fluent_speech_commands/SLU/transducer/test_decoder.py
@@ -0,0 +1 @@
+../../../yesno/ASR/transducer/test_decoder.py
--- a/egs/fluent_speech_commands/SLU/transducer/test_encoder.py
+++ b/egs/fluent_speech_commands/SLU/transducer/test_encoder.py
@@ -1,47 +0,0 @@
-#!/usr/bin/env python3
-# Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""
-To run this file, do:
-
-    cd icefall/egs/yesno/ASR
-    python ./transducer/test_encoder.py
-"""
-
-import torch
-from transducer.encoder import Tdnn
-
-
-def test_encoder():
-    input_dim = 10
-    output_dim = 20
-    encoder = Tdnn(input_dim, output_dim)
-    N = 10
-    T = 85
-    x = torch.rand(N, T, input_dim)
-    x_lens = torch.randint(low=30, high=T, size=(N,), dtype=torch.int32)
-    logits, logit_lens = encoder(x, x_lens)
-    assert logits.shape == (N, T - 26, output_dim)
-    assert torch.all(torch.eq(x_lens - 26, logit_lens))
-
-
-def main():
-    test_encoder()
-
-
-if __name__ == "__main__":
-    main()
--- a/egs/fluent_speech_commands/SLU/transducer/test_joiner.py
+++ b/egs/fluent_speech_commands/SLU/transducer/test_joiner.py
@@ -1,50 +0,0 @@
-#!/usr/bin/env python3
-# Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""
-To run this file, do:
-
-    cd icefall/egs/yesno/ASR
-    python ./transducer/test_joiner.py
-"""
-
-
-import torch
-from transducer.joiner import Joiner
-
-
-def test_joiner():
-    N = 2
-    T = 3
-    C = 4
-    U = 5
-
-    joiner = Joiner(C, 10)
-
-    encoder_out = torch.rand(N, T, C)
-    decoder_out = torch.rand(N, U, C)
-
-    joint = joiner(encoder_out, decoder_out)
-    assert joint.shape == (N, T, U, 10)
-
-
-def main():
-    test_joiner()
-
-
-if __name__ == "__main__":
-    main()
--- a/egs/fluent_speech_commands/SLU/transducer/test_joiner.py
+++ b/egs/fluent_speech_commands/SLU/transducer/test_joiner.py
@@ -0,0 +1 @@
+../../../librispeech/ASR/transducer/test_joiner.py
--- a/egs/fluent_speech_commands/SLU/transducer/test_transducer.py
+++ b/egs/fluent_speech_commands/SLU/transducer/test_transducer.py
@@ -1,77 +0,0 @@
-#!/usr/bin/env python3
-# Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""
-To run this file, do:
-
-    cd icefall/egs/yesno/ASR
-    python ./transducer/test_transducer.py
-"""
-
-
-import k2
-import torch
-from transducer.decoder import Decoder
-from transducer.encoder import Tdnn
-from transducer.joiner import Joiner
-from transducer.model import Transducer
-
-
-def test_transducer():
-    # encoder params
-    input_dim = 10
-    output_dim = 20
-
-    # decoder params
-    vocab_size = 3
-    blank_id = 0
-    embedding_dim = 128
-    num_layers = 2
-
-    encoder = Tdnn(input_dim, output_dim)
-
-    decoder = Decoder(
-        vocab_size=vocab_size,
-        embedding_dim=embedding_dim,
-        blank_id=blank_id,
-        num_layers=num_layers,
-        hidden_dim=output_dim,
-        embedding_dropout=0.0,
-        rnn_dropout=0.0,
-    )
-
-    joiner = Joiner(output_dim, vocab_size)
-    transducer = Transducer(encoder=encoder, decoder=decoder, joiner=joiner)
-
-    y = k2.RaggedTensor([[1, 2, 1], [1, 1, 1, 2, 1]])
-    N = y.dim0
-    T = 50
-
-    x = torch.rand(N, T, input_dim)
-    x_lens = torch.randint(low=30, high=T, size=(N,), dtype=torch.int32)
-    x_lens[0] = T
-
-    loss = transducer(x, x_lens, y)
-    print(loss)
-
-
-def main():
-    test_transducer()
-
-
-if __name__ == "__main__":
-    main()
--- a/egs/fluent_speech_commands/SLU/transducer/test_transducer.py
+++ b/egs/fluent_speech_commands/SLU/transducer/test_transducer.py
@@ -0,0 +1 @@
+../../../librispeech/ASR/transducer/test_transducer.py
--- a/egs/fluent_speech_commands/SLU/transducer/train.py
+++ b/egs/fluent_speech_commands/SLU/transducer/train.py
@@ -33,7 +33,6 @@ from torch.nn.parallel import DistributedDataParallel as DDP
 from torch.nn.utils import clip_grad_norm_
 # from torch.utils.tensorboard import SummaryWriter
 from transducer.decoder import Decoder
-from transducer.encoder import Tdnn
 from transducer.conformer import Conformer
 from transducer.joiner import Joiner
 from transducer.model import Transducer
@@ -492,10 +491,6 @@ def train_one_epoch(


 def get_transducer_model(params: AttributeDict):
-    # encoder = Tdnn(
-    #     num_features=params.feature_dim,
-    #     output_dim=params.hidden_dim,
-    # )
    encoder = Conformer(
        num_features=params.feature_dim,
        output_dim=params.hidden_dim,
--- a/egs/fluent_speech_commands/SLU/transducer/transformer.py
+++ b/egs/fluent_speech_commands/SLU/transducer/transformer.py
@ -1,416 +0,0 @@
-# Copyright    2021 University of Chinese Academy of Sciences (author: Han Zhu)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import math
-from typing import Optional, Tuple
-
-import torch
-import torch.nn as nn
-from transducer.encoder_interface import EncoderInterface
-from transducer.subsampling import Conv2dSubsampling, VggSubsampling
-
-from icefall.utils import make_pad_mask
-
-
-class Transformer(EncoderInterface):
-    def __init__(
-        self,
-        num_features: int,
-        output_dim: int,
-        subsampling_factor: int = 4,
-        d_model: int = 256,
-        nhead: int = 4,
-        dim_feedforward: int = 2048,
-        num_encoder_layers: int = 12,
-        dropout: float = 0.1,
-        normalize_before: bool = True,
-        vgg_frontend: bool = False,
-    ) -> None:
-        """
-        Args:
-          num_features:
-            The input dimension of the model.
-          output_dim:
-            The output dimension of the model.
-          subsampling_factor:
-            Number of output frames is num_in_frames // subsampling_factor.
-            Currently, subsampling_factor MUST be 4.
-          d_model:
-            Attention dimension.
-          nhead:
-            Number of heads in multi-head attention.
-            Must satisfy d_model // nhead == 0.
-          dim_feedforward:
-            The output dimension of the feedforward layers in encoder.
-          num_encoder_layers:
-            Number of encoder layers.
-          dropout:
-            Dropout in encoder.
-          normalize_before:
-            If True, use pre-layer norm; False to use post-layer norm.
-          vgg_frontend:
-            True to use vgg style frontend for subsampling.
-        """
-        super().__init__()
-
-        self.num_features = num_features
-        self.output_dim = output_dim
-        self.subsampling_factor = subsampling_factor
-        if subsampling_factor != 4:
-            raise NotImplementedError("Support only 'subsampling_factor=4'.")
-
-        # self.encoder_embed converts the input of shape (N, T, num_features)
-        # to the shape (N, T//subsampling_factor, d_model).
-        # That is, it does two things simultaneously:
-        #   (1) subsampling: T -> T//subsampling_factor
-        #   (2) embedding: num_features -> d_model
-        if vgg_frontend:
-            self.encoder_embed = VggSubsampling(num_features, d_model)
-        else:
-            self.encoder_embed = Conv2dSubsampling(num_features, d_model)
-
-        self.encoder_pos = PositionalEncoding(d_model, dropout)
-
-        encoder_layer = TransformerEncoderLayer(
-            d_model=d_model,
-            nhead=nhead,
-            dim_feedforward=dim_feedforward,
-            dropout=dropout,
-            normalize_before=normalize_before,
-        )
-
-        if normalize_before:
-            encoder_norm = nn.LayerNorm(d_model)
-        else:
-            encoder_norm = None
-
-        self.encoder = nn.TransformerEncoder(
-            encoder_layer=encoder_layer,
-            num_layers=num_encoder_layers,
-            norm=encoder_norm,
-        )
-
-        # TODO(fangjun): remove dropout
-        self.encoder_output_layer = nn.Sequential(
-            nn.Dropout(p=dropout), nn.Linear(d_model, output_dim)
-        )
-
-    def forward(
-        self, x: torch.Tensor, x_lens: torch.Tensor
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        """
-        Args:
-          x:
-            The input tensor. Its shape is (batch_size, seq_len, feature_dim).
-          x_lens:
-            A tensor of shape (batch_size,) containing the number of frames in
-            `x` before padding.
-        Returns:
-          Return a tuple containing 2 tensors:
-            - logits, its shape is (batch_size, output_seq_len, output_dim)
-            - logit_lens, a tensor of shape (batch_size,) containing the number
-              of frames in `logits` before padding.
-        """
-        x = self.encoder_embed(x)
-        x = self.encoder_pos(x)
-        x = x.permute(1, 0, 2)  # (N, T, C) -> (T, N, C)
-
-        # Caution: We assume the subsampling factor is 4!
-        lengths = ((x_lens - 1) // 2 - 1) // 2
-        assert x.size(0) == lengths.max().item()
-
-        mask = make_pad_mask(lengths)
-        x = self.encoder(x, src_key_padding_mask=mask)  # (T, N, C)
-
-        logits = self.encoder_output_layer(x)
-        logits = logits.permute(1, 0, 2)  # (T, N, C) ->(N, T, C)
-
-        return logits, lengths
-
-
-class TransformerEncoderLayer(nn.Module):
-    """
-    Modified from torch.nn.TransformerEncoderLayer.
-    Add support of normalize_before,
-    i.e., use layer_norm before the first block.
-
-    Args:
-      d_model:
-        the number of expected features in the input (required).
-      nhead:
-        the number of heads in the multiheadattention models (required).
-      dim_feedforward:
-        the dimension of the feedforward network model (default=2048).
-      dropout:
-        the dropout value (default=0.1).
-      activation:
-        the activation function of intermediate layer, relu or
-        gelu (default=relu).
-      normalize_before:
-        whether to use layer_norm before the first block.
-
-    Examples::
-        >>> encoder_layer = TransformerEncoderLayer(d_model=512, nhead=8)
-        >>> src = torch.rand(10, 32, 512)
-        >>> out = encoder_layer(src)
-    """
-
-    def __init__(
-        self,
-        d_model: int,
-        nhead: int,
-        dim_feedforward: int = 2048,
-        dropout: float = 0.1,
-        activation: str = "relu",
-        normalize_before: bool = True,
-    ) -> None:
-        super(TransformerEncoderLayer, self).__init__()
-        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=0.0)
-        # Implementation of Feedforward model
-        self.linear1 = nn.Linear(d_model, dim_feedforward)
-        self.dropout = nn.Dropout(dropout)
-        self.linear2 = nn.Linear(dim_feedforward, d_model)
-
-        self.norm1 = nn.LayerNorm(d_model)
-        self.norm2 = nn.LayerNorm(d_model)
-        self.dropout1 = nn.Dropout(dropout)
-        self.dropout2 = nn.Dropout(dropout)
-
-        self.activation = _get_activation_fn(activation)
-
-        self.normalize_before = normalize_before
-
-    def __setstate__(self, state):
-        if "activation" not in state:
-            state["activation"] = nn.functional.relu
-        super(TransformerEncoderLayer, self).__setstate__(state)
-
-    def forward(
-        self,
-        src: torch.Tensor,
-        src_mask: Optional[torch.Tensor] = None,
-        src_key_padding_mask: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        """
-        Pass the input through the encoder layer.
-
-        Args:
-            src: the sequence to the encoder layer (required).
-            src_mask: the mask for the src sequence (optional).
-            src_key_padding_mask: the mask for the src keys per batch (optional)
-
-        Shape:
-            src: (S, N, E).
-            src_mask: (S, S).
-            src_key_padding_mask: (N, S).
-            S is the source sequence length, T is the target sequence length,
-            N is the batch size, E is the feature number
-        """
-        residual = src
-        if self.normalize_before:
-            src = self.norm1(src)
-        src2 = self.self_attn(
-            src,
-            src,
-            src,
-            attn_mask=src_mask,
-            key_padding_mask=src_key_padding_mask,
-        )[0]
-        src = residual + self.dropout1(src2)
-        if not self.normalize_before:
-            src = self.norm1(src)
-
-        residual = src
-        if self.normalize_before:
-            src = self.norm2(src)
-        src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))
-        src = residual + self.dropout2(src2)
-        if not self.normalize_before:
-            src = self.norm2(src)
-        return src
-
-
-def _get_activation_fn(activation: str):
-    if activation == "relu":
-        return nn.functional.relu
-    elif activation == "gelu":
-        return nn.functional.gelu
-
-    raise RuntimeError("activation should be relu/gelu, not {}".format(activation))
-
-
-class PositionalEncoding(nn.Module):
-    """This class implements the positional encoding
-    proposed in the following paper:
-
-    - Attention Is All You Need: https://arxiv.org/pdf/1706.03762.pdf
-
-        PE(pos, 2i) = sin(pos / (10000^(2i/d_modle))
-        PE(pos, 2i+1) = cos(pos / (10000^(2i/d_modle))
-
-    Note::
-
-      1 / (10000^(2i/d_model)) = exp(-log(10000^(2i/d_model)))
-                               = exp(-1* 2i / d_model * log(100000))
-                               = exp(2i * -(log(10000) / d_model))
-    """
-
-    def __init__(self, d_model: int, dropout: float = 0.1) -> None:
-        """
-        Args:
-          d_model:
-            Embedding dimension.
-          dropout:
-            Dropout probability to be applied to the output of this module.
-        """
-        super().__init__()
-        self.d_model = d_model
-        self.xscale = math.sqrt(self.d_model)
-        self.dropout = nn.Dropout(p=dropout)
-        # not doing: self.pe = None because of errors thrown by torchscript
-        self.pe = torch.zeros(1, 0, self.d_model, dtype=torch.float32)
-
-    def extend_pe(self, x: torch.Tensor) -> None:
-        """Extend the time t in the positional encoding if required.
-
-        The shape of `self.pe` is (1, T1, d_model). The shape of the input x
-        is (N, T, d_model). If T > T1, then we change the shape of self.pe
-        to (N, T, d_model). Otherwise, nothing is done.
-
-        Args:
-          x:
-            It is a tensor of shape (N, T, C).
-        Returns:
-          Return None.
-        """
-        if self.pe is not None:
-            if self.pe.size(1) >= x.size(1):
-                self.pe = self.pe.to(dtype=x.dtype, device=x.device)
-                return
-        pe = torch.zeros(x.size(1), self.d_model, dtype=torch.float32)
-        position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1)
-        div_term = torch.exp(
-            torch.arange(0, self.d_model, 2, dtype=torch.float32)
-            * -(math.log(10000.0) / self.d_model)
-        )
-        pe[:, 0::2] = torch.sin(position * div_term)
-        pe[:, 1::2] = torch.cos(position * div_term)
-        pe = pe.unsqueeze(0)
-        # Now pe is of shape (1, T, d_model), where T is x.size(1)
-        self.pe = pe.to(device=x.device, dtype=x.dtype)
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        """
-        Add positional encoding.
-
-        Args:
-          x:
-            Its shape is (N, T, C)
-
-        Returns:
-          Return a tensor of shape (N, T, C)
-        """
-        self.extend_pe(x)
-        x = x * self.xscale + self.pe[:, : x.size(1), :]
-        return self.dropout(x)
-
-
-class Noam(object):
-    """
-    Implements Noam optimizer.
-
-    Proposed in
-    "Attention Is All You Need", https://arxiv.org/pdf/1706.03762.pdf
-
-    Modified from
-    https://github.com/espnet/espnet/blob/master/espnet/nets/pytorch_backend/transformer/optimizer.py  # noqa
-
-    Args:
-      params:
-        iterable of parameters to optimize or dicts defining parameter groups
-      model_size:
-        attention dimension of the transformer model
-      factor:
-        learning rate factor
-      warm_step:
-        warmup steps
-    """
-
-    def __init__(
-        self,
-        params,
-        model_size: int = 256,
-        factor: float = 10.0,
-        warm_step: int = 25000,
-        weight_decay=0,
-    ) -> None:
-        """Construct an Noam object."""
-        self.optimizer = torch.optim.Adam(
-            params, lr=0, betas=(0.9, 0.98), eps=1e-9, weight_decay=weight_decay
-        )
-        self._step = 0
-        self.warmup = warm_step
-        self.factor = factor
-        self.model_size = model_size
-        self._rate = 0
-
-    @property
-    def param_groups(self):
-        """Return param_groups."""
-        return self.optimizer.param_groups
-
-    def step(self):
-        """Update parameters and rate."""
-        self._step += 1
-        rate = self.rate()
-        for p in self.optimizer.param_groups:
-            p["lr"] = rate
-        self._rate = rate
-        self.optimizer.step()
-
-    def rate(self, step=None):
-        """Implement `lrate` above."""
-        if step is None:
-            step = self._step
-        return (
-            self.factor
-            * self.model_size ** (-0.5)
-            * min(step ** (-0.5), step * self.warmup ** (-1.5))
-        )
-
-    def zero_grad(self):
-        """Reset gradient."""
-        self.optimizer.zero_grad()
-
-    def state_dict(self):
-        """Return state_dict."""
-        return {
-            "_step": self._step,
-            "warmup": self.warmup,
-            "factor": self.factor,
-            "model_size": self.model_size,
-            "_rate": self._rate,
-            "optimizer": self.optimizer.state_dict(),
-        }
-
-    def load_state_dict(self, state_dict):
-        """Load state_dict."""
-        for key, value in state_dict.items():
-            if key == "optimizer":
-                self.optimizer.load_state_dict(state_dict["optimizer"])
-            else:
-                setattr(self, key, value)
--- a/egs/fluent_speech_commands/SLU/transducer/transformer.py
+++ b/egs/fluent_speech_commands/SLU/transducer/transformer.py
@ -0,0 +1 @@
+../../../librispeech/ASR/transducer_stateless/transformer.py
				`@ -0,0 +1 @@`
				`../../../librispeech/ASR/transducer_stateless/conformer.py`
				`@ -0,0 +1 @@`
				`../../../librispeech/ASR/transducer/joiner.py`
				`@ -0,0 +1 @@`
				`../../../librispeech/ASR/transducer/model.py`
				`@ -0,0 +1 @@`
				`../../../librispeech/ASR/transducer/test_conformer.py`
				`@ -0,0 +1 @@`
				`../../../yesno/ASR/transducer/test_decoder.py`