From 8ddc832e4faca17007d7becf7fde760804244380 Mon Sep 17 00:00:00 2001
From: dohe0342
Date: Mon, 9 Jan 2023 19:34:16 +0900
Subject: [PATCH] from local

---
 .../ASR/incremental_transf/conformer.py | 170 ++++++++++++++++++
 1 file changed, 170 insertions(+)

diff --git a/egs/librispeech/ASR/incremental_transf/conformer.py b/egs/librispeech/ASR/incremental_transf/conformer.py
index 822bc7a27..a2f56ff43 100644
--- a/egs/librispeech/ASR/incremental_transf/conformer.py
+++ b/egs/librispeech/ASR/incremental_transf/conformer.py
@@ -403,6 +403,176 @@ class Conformer(EncoderInterface):
 
         return x, lengths, states
 
+class Tempformer(EncoderInterface):
+    """
+    Args:
+        num_features (int): Number of input features
+        subsampling_factor (int): subsampling factor of the encoder (the
+            convolution layers before the transformer layers)
+        d_model (int): attention dimension, also the output dimension
+        nhead (int): number of attention heads
+        dim_feedforward (int): feedforward dimension
+        num_encoder_layers (int): number of encoder layers
+        dropout (float): dropout rate
+        layer_dropout (float): layer-dropout rate.
+        cnn_module_kernel (int): kernel size of the convolution module.
+        aux_layer_period (int): period for selecting the auxiliary layers
+            passed to ConformerEncoder (see its ``aux_layers`` argument).
+        dynamic_chunk_training (bool): whether to use dynamic chunk training;
+            set this to True to train a streaming model. When True, a masking
+            strategy makes the attention see only limited left and right
+            context.
+        short_chunk_threshold (float): a threshold used to determine the
+            chunk size in masked training. If the randomly generated chunk
+            size is greater than ``max_len * short_chunk_threshold`` (max_len
+            is the max sequence length of the current batch), full context is
+            used in training (i.e. the chunk size equals max_len).
+            This is used only when dynamic_chunk_training is True.
+        short_chunk_size (int): see the docs above; if the randomly generated
+            chunk size is less than or equal to
+            ``max_len * short_chunk_threshold``, the chunk size is sampled
+            uniformly from 1 to short_chunk_size.
+            This is also used only when dynamic_chunk_training is True.
+        num_left_chunks (int): the left context (in chunks) the attention can
+            see; the chunk size is decided by short_chunk_threshold and
+            short_chunk_size. A negative value means seeing the full left
+            context. This is also used only when dynamic_chunk_training is
+            True.
+        causal (bool): whether to use causal convolution in the conformer
+            encoder layer. This MUST be True when dynamic_chunk_training is
+            True.
+    """
+
+    def __init__(
+        self,
+        num_features: int,
+        subsampling_factor: int = 4,
+        d_model: int = 256,
+        nhead: int = 4,
+        dim_feedforward: int = 2048,
+        num_encoder_layers: int = 12,
+        dropout: float = 0.1,
+        layer_dropout: float = 0.075,
+        cnn_module_kernel: int = 31,
+        aux_layer_period: int = 3,
+        dynamic_chunk_training: bool = False,
+        short_chunk_threshold: float = 0.75,
+        short_chunk_size: int = 25,
+        num_left_chunks: int = -1,
+        causal: bool = False,
+    ) -> None:
+        super(Tempformer, self).__init__()
+
+        self.num_features = num_features
+        self.subsampling_factor = subsampling_factor
+        if subsampling_factor != 4:
+            raise NotImplementedError("Support only 'subsampling_factor=4'.")
+
+        # self.encoder_embed converts the input of shape (N, T, num_features)
+        # to the shape (N, T//subsampling_factor, d_model).
+        # That is, it does two things simultaneously:
+        #   (1) subsampling: T -> T//subsampling_factor
+        #   (2) embedding: num_features -> d_model
+        self.encoder_embed = Conv2dSubsampling(num_features, d_model)
+
+        self.encoder_pos = RelPositionalEncoding(d_model, dropout)
+
+        self.encoder_layers = num_encoder_layers
+        self.d_model = d_model
+        self.cnn_module_kernel = cnn_module_kernel
+        self.causal = causal
+        self.dynamic_chunk_training = dynamic_chunk_training
+        self.short_chunk_threshold = short_chunk_threshold
+        self.short_chunk_size = short_chunk_size
+        self.num_left_chunks = num_left_chunks
+
+        encoder_layer = ConformerEncoderLayer(
+            d_model=d_model,
+            nhead=nhead,
+            dim_feedforward=dim_feedforward,
+            dropout=dropout,
+            layer_dropout=layer_dropout,
+            cnn_module_kernel=cnn_module_kernel,
+            causal=causal,
+        )
+        # Auxiliary layers start at 1/3 of the encoder depth and are then
+        # taken every aux_layer_period layers.
+        self.encoder = ConformerEncoder(
+            encoder_layer=encoder_layer,
+            num_layers=num_encoder_layers,
+            aux_layers=list(
+                range(
+                    num_encoder_layers // 3,
+                    num_encoder_layers - 1,
+                    aux_layer_period,
+                )
+            ),
+        )
+        self._init_state: List[torch.Tensor] = [torch.empty(0)]
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        x_lens: torch.Tensor,
+        warmup: float = 1.0,
+        get_layer_output: bool = False,
+    ) -> Tuple[torch.Tensor, torch.Tensor, Optional[List[torch.Tensor]]]:
+        """
+        Args:
+          x:
+            The input tensor. Its shape is (batch_size, seq_len, feature_dim).
+          x_lens:
+            A tensor of shape (batch_size,) containing the number of frames in
+            `x` before padding.
+          warmup:
+            A floating point value that gradually increases from 0 throughout
+            training; when it is >= 1.0 we are "fully warmed up".
+            It is used to turn modules on sequentially.
+          get_layer_output:
+            If True, also return the outputs of the individual encoder layers.
+        Returns:
+          Return a tuple containing 2 tensors (plus a list of tensors when
+          get_layer_output is True):
+            - embeddings: its shape is (batch_size, output_seq_len, d_model)
+            - lengths: a tensor of shape (batch_size,) containing the number
+              of frames in `embeddings` before padding.
+            - layer_outputs: a list of per-layer outputs, each of shape
+              (batch_size, output_seq_len, d_model); returned only when
+              get_layer_output is True.
+        """
+        x = self.encoder_embed(x)
+        x, pos_emb = self.encoder_pos(x)
+        x = x.permute(1, 0, 2)  # (N, T, C) -> (T, N, C)
+
+        # Conv2dSubsampling reduces T to ((T - 1) // 2 - 1) // 2.
+        lengths = (((x_lens - 1) >> 1) - 1) >> 1
+        assert x.size(0) == lengths.max().item()
+        src_key_padding_mask = make_pad_mask(lengths)
+
+        if self.dynamic_chunk_training:
+            assert (
+                self.causal
+            ), "Causal convolution is required for streaming conformer."
+            max_len = x.size(0)
+            # Sample a chunk size; with probability ~(1 - short_chunk_threshold)
+            # fall back to full context, otherwise use a short chunk in
+            # [1, short_chunk_size].
+            chunk_size = torch.randint(1, max_len, (1,)).item()
+            if chunk_size > (max_len * self.short_chunk_threshold):
+                chunk_size = max_len
+            else:
+                chunk_size = chunk_size % self.short_chunk_size + 1
+
+            mask = ~subsequent_chunk_mask(
+                size=x.size(0),
+                chunk_size=chunk_size,
+                num_left_chunks=self.num_left_chunks,
+                device=x.device,
+            )
+            x, layer_outputs = self.encoder(
+                x,
+                pos_emb,
+                mask=mask,
+                src_key_padding_mask=src_key_padding_mask,
+                warmup=warmup,
+            )  # (T, N, C)
+        else:
+            x, layer_outputs = self.encoder(
+                x,
+                pos_emb,
+                src_key_padding_mask=src_key_padding_mask,
+                warmup=warmup,
+            )  # (T, N, C)
+
+        x = x.permute(1, 0, 2)  # (T, N, C) -> (N, T, C)
+        layer_outputs = [y.permute(1, 0, 2) for y in layer_outputs]
+
+        if get_layer_output:
+            return x, lengths, layer_outputs
+        else:
+            return x, lengths
+
+
 class ConformerEncoderLayer(nn.Module):
     """
     ConformerEncoderLayer is made up of self-attn, feedforward and convolution networks.
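
Usage note: a minimal smoke-test sketch for the new Tempformer encoder,
assuming this file's dependencies (Conv2dSubsampling, ConformerEncoder,
make_pad_mask, subsequent_chunk_mask) resolve as they do for the existing
Conformer class. The import path and the number of per-layer outputs
collected by ConformerEncoder are assumptions, not verified against the
repository.

    import torch
    from conformer import Tempformer  # assumed import path

    encoder = Tempformer(num_features=80, d_model=256, nhead=4)
    encoder.eval()  # avoid layer-dropout randomness in a smoke test

    x = torch.randn(2, 100, 80)        # (batch, time, feature)
    x_lens = torch.tensor([100, 80])   # valid frames before padding

    # Default call: embeddings plus subsampled lengths.
    out, out_lens = encoder(x, x_lens)
    print(out.shape)    # (2, 24, 256): T=100 -> ((100 - 1) // 2 - 1) // 2
    print(out_lens)     # tensor([24, 19])

    # With get_layer_output=True, per-layer outputs are returned as well,
    # each of shape (batch, output_seq_len, d_model).
    out, out_lens, layer_outputs = encoder(x, x_lens, get_layer_output=True)
    print(len(layer_outputs))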