Mirror of https://github.com/k2-fsa/icefall.git, synced 2025-12-11 06:55:27 +00:00 from local.

commit ab44c2d54a
parent 203cedb453
(Two binary files in this commit are not shown.)
@@ -31,7 +31,7 @@ from scaling import (
)
from subsampling import Conv2dSubsampling
from torch import Tensor, nn
from transformer import Supervisions, Transformer, encoder_padding_mask
from transformer import Supervisions, Transformer, encoder_padding_mask, TransformerEncoder


class Conformer(Transformer):
@@ -161,133 +161,6 @@ class Conformer(Transformer):
        return x, mask


class TransfEnc(Transformer):
    """
    Args:
        num_features (int): Number of input features
        num_classes (int): Number of output classes
        subsampling_factor (int): subsampling factor of encoder (the convolution layers before transformers)
        d_model (int): attention dimension, also the output dimension
        nhead (int): number of attention heads
        dim_feedforward (int): feedforward dimension
        num_encoder_layers (int): number of encoder layers
        num_decoder_layers (int): number of decoder layers
        dropout (float): dropout rate
        layer_dropout (float): layer-dropout rate.
        cnn_module_kernel (int): kernel size of convolution module
        group_num (int): number of groups into which the encoder layers are split;
            the last output of each group is combined with a learned sigmoid weight
            in run_encoder(). 0 means only the final layer output is used.
    """

    def __init__(
        self,
        num_features: int,
        num_classes: int,
        subsampling_factor: int = 4,
        d_model: int = 256,
        nhead: int = 4,
        dim_feedforward: int = 2048,
        num_encoder_layers: int = 12,
        num_decoder_layers: int = 6,
        dropout: float = 0.1,
        layer_dropout: float = 0.075,
        cnn_module_kernel: int = 31,
        group_num: int = 0,
    ) -> None:
        super(TransfEnc, self).__init__(
            num_features=num_features,
            num_classes=num_classes,
            subsampling_factor=subsampling_factor,
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dropout=dropout,
            layer_dropout=layer_dropout,
        )

        self.num_features = num_features
        self.subsampling_factor = subsampling_factor
        if subsampling_factor != 4:
            raise NotImplementedError("Support only 'subsampling_factor=4'.")

        # self.encoder_embed converts the input of shape (N, T, num_features)
        # to the shape (N, T//subsampling_factor, d_model).
        # That is, it does two things simultaneously:
        # (1) subsampling: T -> T//subsampling_factor
        # (2) embedding: num_features -> d_model
        self.encoder_embed = Conv2dSubsampling(num_features, d_model)
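        # For example (illustrative values, not taken from this recipe): with
        # num_features=80 and d_model=256, a (N, T, 80) batch comes out of this
        # module with shape roughly (N, T//4, 256).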

        self.encoder_pos = RelPositionalEncoding(d_model, dropout)

        encoder_layer = ConformerEncoderLayer(
            d_model,
            nhead,
            dim_feedforward,
            dropout,
            layer_dropout,
            cnn_module_kernel,
        )
        self.encoder = ConformerEncoder(encoder_layer, num_encoder_layers)

        self.group_num = group_num
        if self.group_num != 0:
            self.group_layer_num = int(num_encoder_layers // self.group_num)
            self.alpha = nn.Parameter(torch.rand(self.group_num))
            self.sigmoid = nn.Sigmoid()
            self.layer_norm = nn.LayerNorm(d_model)
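The constructor above wires together the same Conformer front-end plus an optional layer-group combination. A minimal sketch of how this class might be instantiated; the feature and class counts below are illustrative assumptions, not values from this recipe:

```python
# Hypothetical configuration: 80-dim fbank features, 500 output classes.
# With num_encoder_layers=12 and group_num=3, each group spans 4 layers,
# and the outputs of layers 4, 8 and 12 are combined in run_encoder().
model = TransfEnc(
    num_features=80,
    num_classes=500,
    d_model=256,
    nhead=4,
    num_encoder_layers=12,
    num_decoder_layers=6,
    group_num=3,
)
```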

    def run_encoder(
        self,
        x: torch.Tensor,
        supervisions: Optional[Supervisions] = None,
        warmup: float = 1.0,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        """
        Args:
          x:
            The input tensor. Its shape is (batch_size, seq_len, feature_dim).
          supervisions:
            Supervision in lhotse format.
            See https://github.com/lhotse-speech/lhotse/blob/master/lhotse/dataset/speech_recognition.py#L32  # noqa
            CAUTION: It contains length information, i.e., start and number of
            frames, before subsampling.
            It is read directly from the batch, without any sorting. It is used
            to compute the encoder padding mask, which is used as the memory key
            padding mask for the decoder.
          warmup:
            A floating point value that gradually increases from 0 throughout
            training; when it is >= 1.0 we are "fully warmed up". It is used
            to turn modules on sequentially.

        Returns:
          Tensor: Encoder output tensor of dimension (input_length, batch_size, d_model).
          Tensor: Mask tensor of dimension (batch_size, input_length).
        """
        x = self.encoder_embed(x)
        x, pos_emb = self.encoder_pos(x)
        x = x.permute(1, 0, 2)  # (N, T, C) -> (T, N, C)
        mask = encoder_padding_mask(x.size(0), supervisions)
        if mask is not None:
            mask = mask.to(x.device)

        # Caution: We assume the subsampling factor is 4!

        x, layer_outputs = self.encoder(
            x, pos_emb, src_key_padding_mask=mask, warmup=warmup
        )  # (T, N, C)

        if self.group_num != 0:
            x = 0
            for enum, alpha in enumerate(self.alpha):
                x += self.sigmoid(alpha) * layer_outputs[(enum + 1) * self.group_layer_num - 1]
            x = self.layer_norm(x / self.group_num)
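            # At this point x is a sigmoid-gated average of the last output of
            # each layer group, e.g. with 12 encoder layers and group_num=3:
            #   x = LayerNorm((sig(a0)*out_3 + sig(a1)*out_7 + sig(a2)*out_11) / 3)
            # where out_k is the output of the k-th (0-based) encoder layer.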
        # x = x.permute(1, 0, 2)  # (T, N, C) -> (N, T, C)

        # return x, lengths
        return x, mask
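For orientation, a small sketch of how run_encoder transforms shapes, assuming the model built in the earlier sketch; the batch size and sequence length are illustrative values, not from this recipe:

```python
import torch

# model as constructed above (num_features=80, d_model=256, group_num=3)
x = torch.randn(2, 100, 80)       # (batch_size, seq_len, feature_dim)
out, mask = model.run_encoder(x)  # no supervisions, so no padding mask is computed
print(out.shape)                  # roughly (seq_len // 4, 2, 256), i.e. (T', N, C)
print(mask)                       # None, since supervisions was not passed
```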


class ConformerEncoderLayer(nn.Module):
    """
    ConformerEncoderLayer is made up of self-attn, feedforward and convolution networks.