From d61e27625f0d0af0b7a6b3062006057e368eddb7 Mon Sep 17 00:00:00 2001
From: dohe0342
Date: Mon, 9 Jan 2023 19:43:22 +0900
Subject: [PATCH] from local

---
 .../ASR/incremental_transf/.conformer.py.swp | Bin 114688 -> 114688 bytes
 .../ASR/incremental_transf/conformer.py      |  64 ++----
 2 files changed, 4 insertions(+), 60 deletions(-)

diff --git a/egs/librispeech/ASR/incremental_transf/.conformer.py.swp b/egs/librispeech/ASR/incremental_transf/.conformer.py.swp
index d8c62d0ccb038f352c3ef26127beaeb87cb0070a..c70f2b9218662a177e50d0ad37152e0df086fd7b 100644
GIT binary patch
[binary deltas for the vim swap file omitted]

diff --git a/egs/librispeech/ASR/incremental_transf/conformer.py b/egs/librispeech/ASR/incremental_transf/conformer.py
index 04f3b78ed..ca1137f31 100644
--- a/egs/librispeech/ASR/incremental_transf/conformer.py
+++ b/egs/librispeech/ASR/incremental_transf/conformer.py
@@ -510,67 +510,11 @@ class Tempformer(EncoderInterface):
     def forward(
         self, x
     ):
-        """
-        Args:
-          x:
-            The input tensor. Its shape is (batch_size, seq_len, feature_dim).
-          x_lens:
-            A tensor of shape (batch_size,) containing the number of frames in
-            `x` before padding.
-        Returns:
-          Return a tuple containing 2 tensors:
-            - embeddings: its shape is (batch_size, output_seq_len, d_model)
-            - lengths, a tensor of shape (batch_size,) containing the number
-              of frames in `embeddings` before padding.
-        """
-        x = self.encoder_embed(x)
-        x, pos_emb = self.encoder_pos(x)
-        x = x.permute(1, 0, 2)  # (N, T, C) -> (T, N, C)
-
-        lengths = (((x_lens - 1) >> 1) - 1) >> 1
-        assert x.size(0) == lengths.max().item()
-        src_key_padding_mask = make_pad_mask(lengths)
-
-        if self.dynamic_chunk_training:
-            assert (
-                self.causal
-            ), "Causal convolution is required for streaming conformer."
-            max_len = x.size(0)
-            chunk_size = torch.randint(1, max_len, (1,)).item()
-            if chunk_size > (max_len * self.short_chunk_threshold):
-                chunk_size = max_len
-            else:
-                chunk_size = chunk_size % self.short_chunk_size + 1
-
-            mask = ~subsequent_chunk_mask(
-                size=x.size(0),
-                chunk_size=chunk_size,
-                num_left_chunks=self.num_left_chunks,
-                device=x.device,
-            )
-            x = self.encoder(
-                x,
-                pos_emb,
-                mask=mask,
-                src_key_padding_mask=src_key_padding_mask,
-                warmup=warmup,
-            )  # (T, N, C)
-        else:
-            x, layer_outputs = self.encoder(
-                x,
-                pos_emb,
-                src_key_padding_mask=src_key_padding_mask,
-                warmup=warmup,
-            )  # (T, N, C)
-
-        x = x.permute(1, 0, 2)  # (T, N, C) ->(N, T, C)
-        layer_outputs = [x.permute(1, 0, 2) for x in layer_outputs]
-
-        if get_layer_output:
-            return x, lengths, layer_outputs
-        else:
-            return x, lengths
+        layer_outputs = []
+        for enum, encoder in enumerate(self.encoder_layers):
+            layer_outputs.append(encoder(x[enum]))
+        return layer_outputs
 
 
 class ConformerEncoderLayer(nn.Module):
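
Note on the new forward: the patched Tempformer.forward appears to assume that self.encoder_layers is an nn.ModuleList and that x holds one input tensor per layer (indexed by layer position), and it returns every layer's output as a list rather than the previous (embeddings, lengths) tuple. The sketch below illustrates that calling pattern in isolation; the ToyLayer stand-in, tensor shapes, and layer count are assumptions made for illustration and are not taken from the repository.

import torch
import torch.nn as nn


class ToyLayer(nn.Module):
    """Stand-in for a real encoder layer: any module mapping (T, N, C) -> (T, N, C)."""

    def __init__(self, d_model: int):
        super().__init__()
        self.proj = nn.Linear(d_model, d_model)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return torch.relu(self.proj(x))


class ToyTempformer(nn.Module):
    """Mirrors the patched forward(): one input per layer, one output per layer."""

    def __init__(self, num_layers: int, d_model: int):
        super().__init__()
        self.encoder_layers = nn.ModuleList(
            [ToyLayer(d_model) for _ in range(num_layers)]
        )

    def forward(self, x):
        # Same loop as the patch: feed the i-th input to the i-th layer
        # and collect every layer's output instead of a single tensor.
        layer_outputs = []
        for enum, encoder in enumerate(self.encoder_layers):
            layer_outputs.append(encoder(x[enum]))
        return layer_outputs


if __name__ == "__main__":
    num_layers, d_model = 3, 8  # illustrative sizes only
    model = ToyTempformer(num_layers, d_model)
    # One (T, N, C) tensor per layer; shapes are assumptions for this sketch.
    inputs = [torch.randn(10, 2, d_model) for _ in range(num_layers)]
    outputs = model(inputs)
    print([tuple(o.shape) for o in outputs])  # three tensors of shape (10, 2, 8)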