diff --git a/egs/librispeech/ASR/conv_emformer_transducer_stateless/emformer.py b/egs/librispeech/ASR/conv_emformer_transducer_stateless/emformer.py
index 46993da48..327cba2d3 100644
--- a/egs/librispeech/ASR/conv_emformer_transducer_stateless/emformer.py
+++ b/egs/librispeech/ASR/conv_emformer_transducer_stateless/emformer.py
@@ -1141,8 +1141,8 @@ class EmformerEncoderLayer(nn.Module):
           - output utterance, with shape (U, B, D);
           - output right_context, with shape (R, B, D);
           - output memory, with shape (1, B, D) or (0, B, D).
-          - output state.
-          - updated conv_cache.
+          - updated attention cache.
+          - updated convolution cache.
         """
         R = right_context.size(0)
         src = torch.cat([right_context, utterance])
@@ -1525,7 +1525,6 @@ class EmformerEncoder(nn.Module):
             right_context at the end.
           states (List[torch.Tensor, List[List[torch.Tensor]], List[torch.Tensor]]:  # noqa
             Cached states containing:
-            - past_lens: number of past frames for each sample in batch
             - attn_caches: attention states from preceding chunk's computation,
               where each element corresponds to each emformer layer
             - conv_caches: left context for causal convolution, where each
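
Note for reviewers: the two-part streaming state documented above (per-layer attn_caches plus
per-layer conv_caches, with past_lens dropped) can be pictured with the minimal sketch below.
The helper name init_states and the concrete dimensions (memory_size, left_context_length,
conv_cache_size) are illustrative assumptions, not part of this patch; only the
attn_caches / conv_caches split comes from the docstring being edited.

    from typing import List, Tuple

    import torch


    def init_states(
        num_layers: int,
        memory_size: int,
        left_context_length: int,
        d_model: int,
        conv_cache_size: int,
        batch_size: int,
    ) -> Tuple[List[List[torch.Tensor]], List[torch.Tensor]]:
        """Build zero-filled streaming states for the first chunk (a sketch;
        shapes are assumed for illustration, only the two-part layout is
        taken from the docstring in this patch).

        - attn_caches: one entry per emformer layer, holding the attention
          states cached from the preceding chunk's computation.
        - conv_caches: one entry per layer, holding the left context that
          the causal convolution module consumes.
        """
        attn_caches = [
            [
                # cached memory from the preceding chunk (assumed shape)
                torch.zeros(memory_size, batch_size, d_model),
                # cached attention key of the left context (assumed shape)
                torch.zeros(left_context_length, batch_size, d_model),
                # cached attention value of the left context (assumed shape)
                torch.zeros(left_context_length, batch_size, d_model),
            ]
            for _ in range(num_layers)
        ]
        conv_caches = [
            # left context for the causal convolution (assumed shape)
            torch.zeros(batch_size, d_model, conv_cache_size)
            for _ in range(num_layers)
        ]
        return attn_caches, conv_caches

Under these assumptions, each streaming inference call would consume a structure of this form
and return the "updated attention cache" and "updated convolution cache" named in the revised
docstring, one entry per layer, to be fed into the next chunk.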