From 8ddc832e4faca17007d7becf7fde760804244380 Mon Sep 17 00:00:00 2001
From: dohe0342
Date: Mon, 9 Jan 2023 19:34:16 +0900
Subject: [PATCH] from local

---
 .../ASR/incremental_transf/conformer.py | 170 ++++++++++++++++++
 1 file changed, 170 insertions(+)

diff --git a/egs/librispeech/ASR/incremental_transf/conformer.py b/egs/librispeech/ASR/incremental_transf/conformer.py
index 822bc7a27..a2f56ff43 100644
--- a/egs/librispeech/ASR/incremental_transf/conformer.py
+++ b/egs/librispeech/ASR/incremental_transf/conformer.py
@@ -403,6 +403,176 @@ class Conformer(EncoderInterface):
 
         return x, lengths, states
 
+class Tempformer(EncoderInterface):
+    """
+    Args:
+        num_features (int): Number of input features
+        subsampling_factor (int): subsampling factor of the encoder (the
+            convolution layers before the transformer layers)
+        d_model (int): attention dimension, also the output dimension
+        nhead (int): number of attention heads
+        dim_feedforward (int): feedforward dimension
+        num_encoder_layers (int): number of encoder layers
+        dropout (float): dropout rate
+        layer_dropout (float): layer-dropout rate.
+        cnn_module_kernel (int): kernel size of the convolution module.
+        aux_layer_period (int): period for selecting the auxiliary layers
+            passed to ConformerEncoder (see its ``aux_layers`` argument).
+        dynamic_chunk_training (bool): whether to use dynamic chunk training;
+            set this to True to train a streaming model. When True, a masking
+            strategy makes the attention see only limited left and right
+            context.
+        short_chunk_threshold (float): a threshold used to determine the
+            chunk size in masked training. If the randomly generated chunk
+            size is greater than ``max_len * short_chunk_threshold`` (max_len
+            is the max sequence length of the current batch), full context is
+            used in training (i.e. the chunk size equals max_len).
+            This is used only when dynamic_chunk_training is True.
+        short_chunk_size (int): see the docs above; if the randomly generated
+            chunk size is less than or equal to
+            ``max_len * short_chunk_threshold``, the chunk size is sampled
+            uniformly from 1 to short_chunk_size.
+            This is also used only when dynamic_chunk_training is True.
+        num_left_chunks (int): the left context (in chunks) the attention can
+            see; the chunk size is decided by short_chunk_threshold and
+            short_chunk_size. A negative value means seeing the full left
+            context. This is also used only when dynamic_chunk_training is
+            True.
+        causal (bool): whether to use causal convolution in the conformer
+            encoder layer. This MUST be True when dynamic_chunk_training is
+            True.
+    """
+
+    def __init__(
+        self,
+        num_features: int,
+        subsampling_factor: int = 4,
+        d_model: int = 256,
+        nhead: int = 4,
+        dim_feedforward: int = 2048,
+        num_encoder_layers: int = 12,
+        dropout: float = 0.1,
+        layer_dropout: float = 0.075,
+        cnn_module_kernel: int = 31,
+        aux_layer_period: int = 3,
+        dynamic_chunk_training: bool = False,
+        short_chunk_threshold: float = 0.75,
+        short_chunk_size: int = 25,
+        num_left_chunks: int = -1,
+        causal: bool = False,
+    ) -> None:
+        super(Tempformer, self).__init__()
+
+        self.num_features = num_features
+        self.subsampling_factor = subsampling_factor
+        if subsampling_factor != 4:
+            raise NotImplementedError("Support only 'subsampling_factor=4'.")
+
+        # self.encoder_embed converts the input of shape (N, T, num_features)
+        # to the shape (N, T//subsampling_factor, d_model).
+        # That is, it does two things simultaneously:
+        #   (1) subsampling: T -> T//subsampling_factor
+        #   (2) embedding: num_features -> d_model
+        self.encoder_embed = Conv2dSubsampling(num_features, d_model)
+
+        self.encoder_pos = RelPositionalEncoding(d_model, dropout)
+
+        self.encoder_layers = num_encoder_layers
+        self.d_model = d_model
+        self.cnn_module_kernel = cnn_module_kernel
+        self.causal = causal
+        self.dynamic_chunk_training = dynamic_chunk_training
+        self.short_chunk_threshold = short_chunk_threshold
+        self.short_chunk_size = short_chunk_size
+        self.num_left_chunks = num_left_chunks
+
+        encoder_layer = ConformerEncoderLayer(
+            d_model=d_model,
+            nhead=nhead,
+            dim_feedforward=dim_feedforward,
+            dropout=dropout,
+            layer_dropout=layer_dropout,
+            cnn_module_kernel=cnn_module_kernel,
+            causal=causal,
+        )
+        # Auxiliary layers start at 1/3 of the encoder depth and are then
+        # taken every aux_layer_period layers.
+        self.encoder = ConformerEncoder(
+            encoder_layer=encoder_layer,
+            num_layers=num_encoder_layers,
+            aux_layers=list(
+                range(
+                    num_encoder_layers // 3,
+                    num_encoder_layers - 1,
+                    aux_layer_period,
+                )
+            ),
+        )
+        self._init_state: List[torch.Tensor] = [torch.empty(0)]
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        x_lens: torch.Tensor,
+        warmup: float = 1.0,
+        get_layer_output: bool = False,
+    ) -> Tuple[torch.Tensor, torch.Tensor, Optional[List[torch.Tensor]]]:
+        """
+        Args:
+          x:
+            The input tensor. Its shape is (batch_size, seq_len, feature_dim).
+          x_lens:
+            A tensor of shape (batch_size,) containing the number of frames in
+            `x` before padding.
+          warmup:
+            A floating point value that gradually increases from 0 throughout
+            training; when it is >= 1.0 we are "fully warmed up".
+            It is used to turn modules on sequentially.
+          get_layer_output:
+            If True, also return the outputs of the individual encoder layers.
+        Returns:
+          Return a tuple containing 2 tensors (plus a list of tensors when
+          get_layer_output is True):
+            - embeddings: its shape is (batch_size, output_seq_len, d_model)
+            - lengths: a tensor of shape (batch_size,) containing the number
+              of frames in `embeddings` before padding.
+            - layer_outputs: a list of per-layer outputs, each of shape
+              (batch_size, output_seq_len, d_model); returned only when
+              get_layer_output is True.
+        """
+        x = self.encoder_embed(x)
+        x, pos_emb = self.encoder_pos(x)
+        x = x.permute(1, 0, 2)  # (N, T, C) -> (T, N, C)
+
+        # Conv2dSubsampling reduces T to ((T - 1) // 2 - 1) // 2.
+        lengths = (((x_lens - 1) >> 1) - 1) >> 1
+        assert x.size(0) == lengths.max().item()
+        src_key_padding_mask = make_pad_mask(lengths)
+
+        if self.dynamic_chunk_training:
+            assert (
+                self.causal
+            ), "Causal convolution is required for streaming conformer."
+            max_len = x.size(0)
+            # Sample a chunk size; with probability ~(1 - short_chunk_threshold)
+            # fall back to full context, otherwise use a short chunk in
+            # [1, short_chunk_size].
+            chunk_size = torch.randint(1, max_len, (1,)).item()
+            if chunk_size > (max_len * self.short_chunk_threshold):
+                chunk_size = max_len
+            else:
+                chunk_size = chunk_size % self.short_chunk_size + 1
+
+            mask = ~subsequent_chunk_mask(
+                size=x.size(0),
+                chunk_size=chunk_size,
+                num_left_chunks=self.num_left_chunks,
+                device=x.device,
+            )
+            x, layer_outputs = self.encoder(
+                x,
+                pos_emb,
+                mask=mask,
+                src_key_padding_mask=src_key_padding_mask,
+                warmup=warmup,
+            )  # (T, N, C)
+        else:
+            x, layer_outputs = self.encoder(
+                x,
+                pos_emb,
+                src_key_padding_mask=src_key_padding_mask,
+                warmup=warmup,
+            )  # (T, N, C)
+
+        x = x.permute(1, 0, 2)  # (T, N, C) -> (N, T, C)
+        layer_outputs = [y.permute(1, 0, 2) for y in layer_outputs]
+
+        if get_layer_output:
+            return x, lengths, layer_outputs
+        else:
+            return x, lengths
+
+
 class ConformerEncoderLayer(nn.Module):
     """
     ConformerEncoderLayer is made up of self-attn, feedforward and convolution networks.
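
Usage note: a minimal smoke-test sketch for the new Tempformer encoder,
assuming this file's dependencies (Conv2dSubsampling, ConformerEncoder,
make_pad_mask, subsequent_chunk_mask) resolve as they do for the existing
Conformer class. The import path and the number of per-layer outputs
collected by ConformerEncoder are assumptions, not verified against the
repository.

    import torch
    from conformer import Tempformer  # assumed import path

    encoder = Tempformer(num_features=80, d_model=256, nhead=4)
    encoder.eval()  # avoid layer-dropout randomness in a smoke test

    x = torch.randn(2, 100, 80)        # (batch, time, feature)
    x_lens = torch.tensor([100, 80])   # valid frames before padding

    # Default call: embeddings plus subsampled lengths.
    out, out_lens = encoder(x, x_lens)
    print(out.shape)    # (2, 24, 256): T=100 -> ((100 - 1) // 2 - 1) // 2
    print(out_lens)     # tensor([24, 19])

    # With get_layer_output=True, per-layer outputs are returned as well,
    # each of shape (batch, output_seq_len, d_model).
    out, out_lens, layer_outputs = encoder(x, x_lens, get_layer_output=True)
    print(len(layer_outputs))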