From ab44c2d54ab868c02b790e3d2eb2c3976b6c6638 Mon Sep 17 00:00:00 2001
From: dohe0342
Date: Thu, 2 Feb 2023 11:36:43 +0900
Subject: [PATCH] Remove the TransfEnc encoder variant from
 conformer_ctc2/conformer.py

---
 .../ASR/conformer_ctc2/.conformer.py.swp     | Bin 61440 -> 65536 bytes
 .../ASR/conformer_ctc2/.transformer.py.swp   | Bin 4096 -> 16384 bytes
 .../ASR/conformer_ctc2/conformer.py          | 129 +-----------------
 3 files changed, 1 insertion(+), 128 deletions(-)

diff --git a/egs/librispeech/ASR/conformer_ctc2/.conformer.py.swp b/egs/librispeech/ASR/conformer_ctc2/.conformer.py.swp
index c0db4e3ac593e3f33f881fc5cf25283995d59541..18cdf0b7417a439ee8eb4876cc866bea6420c4bd 100644
Binary files a/egs/librispeech/ASR/conformer_ctc2/.conformer.py.swp and b/egs/librispeech/ASR/conformer_ctc2/.conformer.py.swp differ
diff --git a/egs/librispeech/ASR/conformer_ctc2/.transformer.py.swp b/egs/librispeech/ASR/conformer_ctc2/.transformer.py.swp
index 067f2447ee4dea3da2918ce76b4e7f68ef469101..a2e8f45e2bb328795c8ddd1d400d698fe77a3f15 100644
Binary files a/egs/librispeech/ASR/conformer_ctc2/.transformer.py.swp and b/egs/librispeech/ASR/conformer_ctc2/.transformer.py.swp differ
z9U)~ZHmF1z(TB>hrG;dtr!k}!$uLUGv>1||V89;5&7O(Dg3w_@mt(6(YncRL&%%T_ zE*kC3jYA`+r5uo^OI#*f%qokEB07X9g;{D9rG={74ALD2Y7B>%8&hh6R82cGJbB4- zoHi&*Bm2(D6Y2WMMew(tC>1WL`%;2&C$0L1AUdJ#IurAA$d?G7K(`&6Qw#Aw-KTwj^mJjyj+&r zluh|RS>p1VmoVJSC`Cy@j5V0JH%2vpBR%w|Te{KZSRzPLsxZPeh~^W==M;-@k8|0U zS#2!yS)azzm)O`yO_vaiddUDAs)D6H+Z>qKVNmx?R8jeea^wwMRpXxa zI2BIU$ZP9R<~uQ`b)~c=CubwuprS`5;>cAMA;<0_Dbl@#LHkY+c-lM3 zPJo{=lB<7|`c#3b>o(=XWP3<7A@_|36Z}nFa#)9M5$VQQ)^gBD)?=!z49QwgsTxaF zO%oQez;|l&v54+ik=<`o$RbkccZIgc{a)19j$&H)w(aA6IQ=Heqx%M$UJ`N8A_EehtEX$LpL=`X~dZu7y9Y!r!s}4+Ua_ae#G+k;Lw!(8pKp2nm7&V+&ZpScJ7&(VL2;9 zlscHwz`Bn#Qj;A?7^!7R%TdbJgvlh8);K#{H>-F%bYY3-5~L9jjzaGQ`U(B(rIO30 UkiO}Mgj(jC3=vuuns&qZ55{kp8~^|S delta 7 OcmZo@U~EuWAOHXgJOWYx diff --git a/egs/librispeech/ASR/conformer_ctc2/conformer.py b/egs/librispeech/ASR/conformer_ctc2/conformer.py index 56f969bbf..f04af5ae4 100644 --- a/egs/librispeech/ASR/conformer_ctc2/conformer.py +++ b/egs/librispeech/ASR/conformer_ctc2/conformer.py @@ -31,7 +31,7 @@ from scaling import ( ) from subsampling import Conv2dSubsampling from torch import Tensor, nn -from transformer import Supervisions, Transformer, encoder_padding_mask +from transformer import Supervisions, Transformer, encoder_padding_mask, TransformerEncoder, TransformerEncoder class Conformer(Transformer): @@ -161,133 +161,6 @@ class Conformer(Transformer): return x, mask -class TransfEnc(Transformer): - """ - Args: - num_features (int): Number of input features - num_classes (int): Number of output classes - subsampling_factor (int): subsampling factor of encoder (the convolution layers before transformers) - d_model (int): attention dimension, also the output dimension - nhead (int): number of head - dim_feedforward (int): feedforward dimention - num_encoder_layers (int): number of encoder layers - num_decoder_layers (int): number of decoder layers - dropout (float): dropout rate - layer_dropout (float): layer-dropout rate. - cnn_module_kernel (int): Kernel size of convolution module - vgg_frontend (bool): whether to use vgg frontend. - """ - - def __init__( - self, - num_features: int, - num_classes: int, - subsampling_factor: int = 4, - d_model: int = 256, - nhead: int = 4, - dim_feedforward: int = 2048, - num_encoder_layers: int = 12, - num_decoder_layers: int = 6, - dropout: float = 0.1, - layer_dropout: float = 0.075, - cnn_module_kernel: int = 31, - group_num: int = 0, - ) -> None: - super(TransfEnc, self).__init__( - num_features=num_features, - num_classes=num_classes, - subsampling_factor=subsampling_factor, - d_model=d_model, - nhead=nhead, - dim_feedforward=dim_feedforward, - num_encoder_layers=num_encoder_layers, - num_decoder_layers=num_decoder_layers, - dropout=dropout, - layer_dropout=layer_dropout, - ) - - self.num_features = num_features - self.subsampling_factor = subsampling_factor - if subsampling_factor != 4: - raise NotImplementedError("Support only 'subsampling_factor=4'.") - - # self.encoder_embed converts the input of shape (N, T, num_features) - # to the shape (N, T//subsampling_factor, d_model). 
-        # That is, it does two things simultaneously:
-        #   (1) subsampling: T -> T//subsampling_factor
-        #   (2) embedding: num_features -> d_model
-        self.encoder_embed = Conv2dSubsampling(num_features, d_model)
-
-        self.encoder_pos = RelPositionalEncoding(d_model, dropout)
-
-        encoder_layer = ConformerEncoderLayer(
-            d_model,
-            nhead,
-            dim_feedforward,
-            dropout,
-            layer_dropout,
-            cnn_module_kernel,
-        )
-        self.encoder = ConformerEncoder(encoder_layer, num_encoder_layers)
-
-        self.group_num = group_num
-        if self.group_num != 0:
-            self.group_layer_num = int(num_encoder_layers // self.group_num)
-            self.alpha = nn.Parameter(torch.rand(self.group_num))
-            self.sigmoid = nn.Sigmoid()
-            self.layer_norm = nn.LayerNorm(d_model)
-
-    def run_encoder(
-        self,
-        x: torch.Tensor,
-        supervisions: Optional[Supervisions] = None,
-        warmup: float = 1.0,
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
-        """
-        Args:
-          x:
-            The input tensor. Its shape is (batch_size, seq_len, feature_dim).
-          supervisions:
-            Supervision in lhotse format.
-            See https://github.com/lhotse-speech/lhotse/blob/master/lhotse/dataset/speech_recognition.py#L32  # noqa
-            CAUTION: It contains length information, i.e., start and number of
-            frames, before subsampling
-            It is read directly from the batch, without any sorting. It is used
-            to compute encoder padding mask, which is used as memory key padding
-            mask for the decoder.
-          warmup:
-            A floating point value that gradually increases from 0 throughout
-            training; when it is >= 1.0 we are "fully warmed up". It is used
-            to turn modules on sequentially.
-        Returns:
-          Tensor: Predictor tensor of dimension (input_length, batch_size, d_model).
-          Tensor: Mask tensor of dimension (batch_size, input_length)
-        """
-        x = self.encoder_embed(x)
-        x, pos_emb = self.encoder_pos(x)
-        x = x.permute(1, 0, 2)  # (N, T, C) -> (T, N, C)
-        mask = encoder_padding_mask(x.size(0), supervisions)
-        if mask is not None:
-            mask = mask.to(x.device)
-
-        # Caution: We assume the subsampling factor is 4!
-
-        x, layer_outputs = self.encoder(
-            x, pos_emb, src_key_padding_mask=mask, warmup=warmup
-        )  # (T, N, C)
-
-        if self.group_num != 0:
-            x = 0
-            for enum, alpha in enumerate(self.alpha):
-                x += self.sigmoid(alpha) * layer_outputs[(enum+1)*self.group_layer_num-1]
-            x = self.layer_norm(x/self.group_num)
-        # x = x.permute(1, 0, 2)  # (T, N, C) -> (N, T, C)
-
-
-        # return x, lengths
-        return x, mask
-
-
 class ConformerEncoderLayer(nn.Module):
     """
     ConformerEncoderLayer is made up of self-attn, feedforward and convolution networks.
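
Note for readers of this patch: the one substantive idea in the deleted TransfEnc is the sigmoid-gated combination of layer-group outputs in run_encoder. The num_encoder_layers encoder layers are split into group_num equal groups; the output of the last layer in each group is scaled by a learned gate sigmoid(alpha[i]), the scaled outputs are summed, divided by group_num, and layer-normalized. Below is a minimal, self-contained sketch of that aggregation; the GroupAggregator module and its interface are illustrative assumptions for this note, not icefall code.

    import torch
    import torch.nn as nn

    class GroupAggregator(nn.Module):
        """Sigmoid-gated average of the last layer output of each encoder group.

        Illustrative stand-in for the aggregation removed with TransfEnc.
        """

        def __init__(self, d_model: int, num_layers: int, group_num: int) -> None:
            super().__init__()
            assert num_layers % group_num == 0, "layers must split evenly into groups"
            self.group_num = group_num
            self.group_layer_num = num_layers // group_num
            # One learnable gate per group, squashed into (0, 1) by sigmoid.
            self.alpha = nn.Parameter(torch.rand(group_num))
            self.layer_norm = nn.LayerNorm(d_model)

        def forward(self, layer_outputs: list) -> torch.Tensor:
            # layer_outputs[i] is the output of encoder layer i, shape (T, N, d_model).
            out = torch.zeros_like(layer_outputs[0])
            for i in range(self.group_num):
                # Output of the last layer in group i, scaled by its gate.
                out = out + torch.sigmoid(self.alpha[i]) * layer_outputs[
                    (i + 1) * self.group_layer_num - 1
                ]
            # Average over groups, then normalize the scale.
            return self.layer_norm(out / self.group_num)

    # Usage: 12 encoder layers split into 3 groups of 4.
    agg = GroupAggregator(d_model=256, num_layers=12, group_num=3)
    outs = [torch.randn(50, 2, 256) for _ in range(12)]
    y = agg(outs)  # shape (50, 2, 256)

Dividing by group_num before the LayerNorm keeps the summed activations at the magnitude of a single layer output regardless of how many groups are configured.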