diff --git a/egs/librispeech/ASR/conformer_ctc/conformer.py b/egs/librispeech/ASR/conformer_ctc/conformer.py
index efe3570cb..b19b94db1 100644
--- a/egs/librispeech/ASR/conformer_ctc/conformer.py
+++ b/egs/librispeech/ASR/conformer_ctc/conformer.py
@@ -98,7 +98,7 @@ class Conformer(Transformer):
         """
         Args:
           x:
-            The model input. Its shape is [N, T, C].
+            The model input. Its shape is (N, T, C).
           supervisions:
             Supervision in lhotse format.
             See https://github.com/lhotse-speech/lhotse/blob/master/lhotse/dataset/speech_recognition.py#L32  # noqa
diff --git a/egs/librispeech/ASR/conformer_ctc/decode.py b/egs/librispeech/ASR/conformer_ctc/decode.py
index 6b9ba3ca7..b5b41c82e 100755
--- a/egs/librispeech/ASR/conformer_ctc/decode.py
+++ b/egs/librispeech/ASR/conformer_ctc/decode.py
@@ -213,12 +213,12 @@ def decode_one_batch(
     feature = batch["inputs"]
     assert feature.ndim == 3
     feature = feature.to(device)
-    # at entry, feature is [N, T, C]
+    # at entry, feature is (N, T, C)

     supervisions = batch["supervisions"]

     nnet_output, memory, memory_key_padding_mask = model(feature, supervisions)
-    # nnet_output is [N, T, C]
+    # nnet_output is (N, T, C)

     supervision_segments = torch.stack(
         (
diff --git a/egs/librispeech/ASR/conformer_ctc/subsampling.py b/egs/librispeech/ASR/conformer_ctc/subsampling.py
index 720ed6c22..542fb0364 100644
--- a/egs/librispeech/ASR/conformer_ctc/subsampling.py
+++ b/egs/librispeech/ASR/conformer_ctc/subsampling.py
@@ -22,8 +22,8 @@ import torch.nn as nn
 class Conv2dSubsampling(nn.Module):
     """Convolutional 2D subsampling (to 1/4 length).

-    Convert an input of shape [N, T, idim] to an output
-    with shape [N, T', odim], where
+    Convert an input of shape (N, T, idim) to an output
+    with shape (N, T', odim), where
     T' = ((T-1)//2 - 1)//2, which approximates T' == T//4

     It is based on
@@ -34,10 +34,10 @@ class Conv2dSubsampling(nn.Module):
         """
         Args:
           idim:
-            Input dim. The input shape is [N, T, idim].
+            Input dim. The input shape is (N, T, idim).
             Caution: It requires: T >=7, idim >=7
           odim:
-            Output dim. The output shape is [N, ((T-1)//2 - 1)//2, odim]
+            Output dim. The output shape is (N, ((T-1)//2 - 1)//2, odim)
         """
         assert idim >= 7
         super().__init__()
@@ -58,18 +58,18 @@ class Conv2dSubsampling(nn.Module):

         Args:
           x:
-            Its shape is [N, T, idim].
+            Its shape is (N, T, idim).

         Returns:
-          Return a tensor of shape [N, ((T-1)//2 - 1)//2, odim]
+          Return a tensor of shape (N, ((T-1)//2 - 1)//2, odim)
         """
-        # On entry, x is [N, T, idim]
-        x = x.unsqueeze(1)  # [N, T, idim] -> [N, 1, T, idim] i.e., [N, C, H, W]
+        # On entry, x is (N, T, idim)
+        x = x.unsqueeze(1)  # (N, T, idim) -> (N, 1, T, idim) i.e., (N, C, H, W)
         x = self.conv(x)
-        # Now x is of shape [N, odim, ((T-1)//2 - 1)//2, ((idim-1)//2 - 1)//2]
+        # Now x is of shape (N, odim, ((T-1)//2 - 1)//2, ((idim-1)//2 - 1)//2)
         b, c, t, f = x.size()
         x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f))
-        # Now x is of shape [N, ((T-1)//2 - 1))//2, odim]
+        # Now x is of shape (N, ((T-1)//2 - 1))//2, odim)
         return x
@@ -80,8 +80,8 @@ class VggSubsampling(nn.Module):
     This paper is not 100% explicit so I am guessing to some extent,
     and trying to compare with other VGG implementations.

-    Convert an input of shape [N, T, idim] to an output
-    with shape [N, T', odim], where
+    Convert an input of shape (N, T, idim) to an output
+    with shape (N, T', odim), where
     T' = ((T-1)//2 - 1)//2, which approximates T' = T//4
     """
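Editor's note on the T' = ((T-1)//2 - 1)//2 formula that the subsampling docstrings above and below keep repeating: it is what you get from halving the time axis twice. A minimal, self-contained sketch that checks the arithmetic with two kernel-3, stride-2 convolutions as described in the Conv2dSubsampling docstring (the channel count 8 is made up purely for illustration):

import torch
import torch.nn as nn

# Two kernel-3, stride-2 convolutions; each maps a length L to (L - 1) // 2.
# Only the time-axis arithmetic matters here, not the channel sizes.
conv = nn.Sequential(
    nn.Conv2d(in_channels=1, out_channels=8, kernel_size=3, stride=2),
    nn.ReLU(),
    nn.Conv2d(in_channels=8, out_channels=8, kernel_size=3, stride=2),
    nn.ReLU(),
)

N, T, idim = 2, 100, 80
x = torch.randn(N, T, idim).unsqueeze(1)  # (N, 1, T, idim), i.e. (N, C, H, W)
y = conv(x)
assert y.shape[2] == ((T - 1) // 2 - 1) // 2  # 24 when T == 100, roughly T // 4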
@@ -93,10 +93,10 @@ class VggSubsampling(nn.Module):

         Args:
           idim:
-            Input dim. The input shape is [N, T, idim].
+            Input dim. The input shape is (N, T, idim).
             Caution: It requires: T >=7, idim >=7
           odim:
-            Output dim. The output shape is [N, ((T-1)//2 - 1)//2, odim]
+            Output dim. The output shape is (N, ((T-1)//2 - 1)//2, odim)
         """
         super().__init__()
@@ -149,10 +149,10 @@ class VggSubsampling(nn.Module):

         Args:
           x:
-            Its shape is [N, T, idim].
+            Its shape is (N, T, idim).

         Returns:
-          Return a tensor of shape [N, ((T-1)//2 - 1)//2, odim]
+          Return a tensor of shape (N, ((T-1)//2 - 1)//2, odim)
         """
         x = x.unsqueeze(1)
         x = self.layers(x)
diff --git a/egs/librispeech/ASR/conformer_ctc/train.py b/egs/librispeech/ASR/conformer_ctc/train.py
index 298b74112..80b2d924a 100755
--- a/egs/librispeech/ASR/conformer_ctc/train.py
+++ b/egs/librispeech/ASR/conformer_ctc/train.py
@@ -310,14 +310,14 @@ def compute_loss(
     """
     device = graph_compiler.device
     feature = batch["inputs"]
-    # at entry, feature is [N, T, C]
+    # at entry, feature is (N, T, C)
    assert feature.ndim == 3
    feature = feature.to(device)

    supervisions = batch["supervisions"]
    with torch.set_grad_enabled(is_training):
        nnet_output, encoder_memory, memory_mask = model(feature, supervisions)
-        # nnet_output is [N, T, C]
+        # nnet_output is (N, T, C)

        # NOTE: We need `encode_supervisions` to sort sequences with
        # different duration in decreasing order, required by
diff --git a/egs/librispeech/ASR/conformer_ctc/transformer.py b/egs/librispeech/ASR/conformer_ctc/transformer.py
index 88b10b23d..68a4ff65c 100644
--- a/egs/librispeech/ASR/conformer_ctc/transformer.py
+++ b/egs/librispeech/ASR/conformer_ctc/transformer.py
@@ -83,8 +83,8 @@ class Transformer(nn.Module):
         if subsampling_factor != 4:
             raise NotImplementedError("Support only 'subsampling_factor=4'.")

-        # self.encoder_embed converts the input of shape [N, T, num_classes]
-        # to the shape [N, T//subsampling_factor, d_model].
+        # self.encoder_embed converts the input of shape (N, T, num_classes)
+        # to the shape (N, T//subsampling_factor, d_model).
         # That is, it does two things simultaneously:
         #   (1) subsampling: T -> T//subsampling_factor
         #   (2) embedding: num_classes -> d_model
@@ -162,7 +162,7 @@ class Transformer(nn.Module):
         """
         Args:
           x:
-            The input tensor. Its shape is [N, T, C].
+            The input tensor. Its shape is (N, T, C).
           supervision:
             Supervision in lhotse format.
             See https://github.com/lhotse-speech/lhotse/blob/master/lhotse/dataset/speech_recognition.py#L32  # noqa
@@ -171,17 +171,17 @@ class Transformer(nn.Module):

         Returns:
           Return a tuple containing 3 tensors:
-            - CTC output for ctc decoding. Its shape is [N, T, C]
-            - Encoder output with shape [T, N, C]. It can be used as key and
+            - CTC output for ctc decoding. Its shape is (N, T, C)
+            - Encoder output with shape (T, N, C). It can be used as key and
              value for the decoder.
            - Encoder output padding mask. It can be used as
-              memory_key_padding_mask for the decoder. Its shape is [N, T].
+              memory_key_padding_mask for the decoder. Its shape is (N, T).
              It is None if `supervision` is None.
        """
        if self.use_feat_batchnorm:
-            x = x.permute(0, 2, 1)  # [N, T, C] -> [N, C, T]
+            x = x.permute(0, 2, 1)  # (N, T, C) -> (N, C, T)
            x = self.feat_batchnorm(x)
-            x = x.permute(0, 2, 1)  # [N, C, T] -> [N, T, C]
+            x = x.permute(0, 2, 1)  # (N, C, T) -> (N, T, C)
        encoder_memory, memory_key_padding_mask = self.run_encoder(
            x, supervision
        )
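Editor's note on the permute round-trip in the hunk above: assuming self.feat_batchnorm is an nn.BatchNorm1d (its definition is not part of this patch), the permutes are needed because BatchNorm1d normalizes over the channel dimension and expects (N, C, T) input. A hedged, self-contained sketch with made-up sizes:

import torch
import torch.nn as nn

# BatchNorm1d wants (N, C, T); the features are (N, T, C), hence the permutes.
feat_batchnorm = nn.BatchNorm1d(num_features=80)

x = torch.randn(4, 100, 80)   # (N, T, C)
x = x.permute(0, 2, 1)        # (N, T, C) -> (N, C, T)
x = feat_batchnorm(x)
x = x.permute(0, 2, 1)        # (N, C, T) -> (N, T, C)
assert x.shape == (4, 100, 80)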
@@ -195,7 +195,7 @@ class Transformer(nn.Module):

         Args:
           x:
-            The model input. Its shape is [N, T, C].
+            The model input. Its shape is (N, T, C).
           supervisions:
             Supervision in lhotse format.
             See https://github.com/lhotse-speech/lhotse/blob/master/lhotse/dataset/speech_recognition.py#L32  # noqa
@@ -206,8 +206,8 @@ class Transformer(nn.Module):
             padding mask for the decoder.
         Returns:
           Return a tuple with two tensors:
-            - The encoder output, with shape [T, N, C]
-            - encoder padding mask, with shape [N, T].
+            - The encoder output, with shape (T, N, C)
+            - encoder padding mask, with shape (N, T).
              The mask is None if `supervisions` is None.
              It is used as memory key padding mask in the decoder.
        """
@@ -225,11 +225,11 @@ class Transformer(nn.Module):
         Args:
           x:
             The output tensor from the transformer encoder.
-            Its shape is [T, N, C]
+            Its shape is (T, N, C)

         Returns:
           Return a tensor that can be used for CTC decoding.
-          Its shape is [N, T, C]
+          Its shape is (N, T, C)
         """
         x = self.encoder_output_layer(x)
         x = x.permute(1, 0, 2)  # (T, N, C) ->(N, T, C)
@@ -247,7 +247,7 @@ class Transformer(nn.Module):
         """
         Args:
           memory:
-            It's the output of the encoder with shape [T, N, C]
+            It's the output of the encoder with shape (T, N, C)
           memory_key_padding_mask:
             The padding mask from the encoder.
           token_ids:
@@ -312,7 +312,7 @@ class Transformer(nn.Module):
         """
         Args:
           memory:
-            It's the output of the encoder with shape [T, N, C]
+            It's the output of the encoder with shape (T, N, C)
           memory_key_padding_mask:
             The padding mask from the encoder.
           token_ids:
@@ -654,13 +654,13 @@ class PositionalEncoding(nn.Module):
     def extend_pe(self, x: torch.Tensor) -> None:
         """Extend the time t in the positional encoding if required.

-        The shape of `self.pe` is [1, T1, d_model]. The shape of the input x
-        is [N, T, d_model]. If T > T1, then we change the shape of self.pe
-        to [N, T, d_model]. Otherwise, nothing is done.
+        The shape of `self.pe` is (1, T1, d_model). The shape of the input x
+        is (N, T, d_model). If T > T1, then we change the shape of self.pe
+        to (N, T, d_model). Otherwise, nothing is done.

         Args:
           x:
-            It is a tensor of shape [N, T, C].
+            It is a tensor of shape (N, T, C).
         Returns:
           Return None.
         """
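Editor's note on the shapes this hunk and the next two hunks describe: self.pe is a sinusoidal table of shape (1, T, d_model), and forward() adds its first x.size(1) frames to the scaled input. A self-contained sketch of that arithmetic (it assumes xscale == sqrt(d_model), which this patch does not show):

import math
import torch

# Build a sinusoidal table of shape (1, T, d_model) and add it to a scaled
# (N, T, C) input, mirroring the extend_pe()/forward() code touched below.
N, T, d_model = 2, 50, 256
x = torch.randn(N, T, d_model)                       # (N, T, C)

position = torch.arange(0, T, dtype=torch.float32).unsqueeze(1)
div_term = torch.exp(
    torch.arange(0, d_model, 2, dtype=torch.float32)
    * -(math.log(10000.0) / d_model)
)
pe = torch.zeros(T, d_model)
pe[:, 0::2] = torch.sin(position * div_term)
pe[:, 1::2] = torch.cos(position * div_term)
pe = pe.unsqueeze(0)                                 # (1, T, d_model)

y = x * math.sqrt(d_model) + pe[:, : x.size(1), :]   # still (N, T, C)
assert y.shape == (N, T, d_model)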
""" @@ -678,7 +678,7 @@ class PositionalEncoding(nn.Module): pe[:, 0::2] = torch.sin(position * div_term) pe[:, 1::2] = torch.cos(position * div_term) pe = pe.unsqueeze(0) - # Now pe is of shape [1, T, d_model], where T is x.size(1) + # Now pe is of shape (1, T, d_model), where T is x.size(1) self.pe = pe.to(device=x.device, dtype=x.dtype) def forward(self, x: torch.Tensor) -> torch.Tensor: @@ -687,10 +687,10 @@ class PositionalEncoding(nn.Module): Args: x: - Its shape is [N, T, C] + Its shape is (N, T, C) Returns: - Return a tensor of shape [N, T, C] + Return a tensor of shape (N, T, C) """ self.extend_pe(x) x = x * self.xscale + self.pe[:, : x.size(1), :] diff --git a/egs/librispeech/ASR/tdnn_lstm_ctc/decode.py b/egs/librispeech/ASR/tdnn_lstm_ctc/decode.py index 8524ab1b9..1e91b1008 100755 --- a/egs/librispeech/ASR/tdnn_lstm_ctc/decode.py +++ b/egs/librispeech/ASR/tdnn_lstm_ctc/decode.py @@ -190,12 +190,12 @@ def decode_one_batch( feature = batch["inputs"] assert feature.ndim == 3 feature = feature.to(device) - # at entry, feature is [N, T, C] + # at entry, feature is (N, T, C) - feature = feature.permute(0, 2, 1) # now feature is [N, C, T] + feature = feature.permute(0, 2, 1) # now feature is (N, C, T) nnet_output = model(feature) - # nnet_output is [N, T, C] + # nnet_output is (N, T, C) supervisions = batch["supervisions"] diff --git a/egs/librispeech/ASR/tdnn_lstm_ctc/pretrained.py b/egs/librispeech/ASR/tdnn_lstm_ctc/pretrained.py index 4f82a989c..0a543d859 100755 --- a/egs/librispeech/ASR/tdnn_lstm_ctc/pretrained.py +++ b/egs/librispeech/ASR/tdnn_lstm_ctc/pretrained.py @@ -218,11 +218,11 @@ def main(): features = pad_sequence( features, batch_first=True, padding_value=math.log(1e-10) ) - features = features.permute(0, 2, 1) # now features is [N, C, T] + features = features.permute(0, 2, 1) # now features is (N, C, T) with torch.no_grad(): nnet_output = model(features) - # nnet_output is [N, T, C] + # nnet_output is (N, T, C) batch_size = nnet_output.shape[0] supervision_segments = torch.tensor( diff --git a/egs/librispeech/ASR/tdnn_lstm_ctc/train.py b/egs/librispeech/ASR/tdnn_lstm_ctc/train.py index 4d45d197b..695ee5130 100755 --- a/egs/librispeech/ASR/tdnn_lstm_ctc/train.py +++ b/egs/librispeech/ASR/tdnn_lstm_ctc/train.py @@ -290,14 +290,14 @@ def compute_loss( """ device = graph_compiler.device feature = batch["inputs"] - # at entry, feature is [N, T, C] - feature = feature.permute(0, 2, 1) # now feature is [N, C, T] + # at entry, feature is (N, T, C) + feature = feature.permute(0, 2, 1) # now feature is (N, C, T) assert feature.ndim == 3 feature = feature.to(device) with torch.set_grad_enabled(is_training): nnet_output = model(feature) - # nnet_output is [N, T, C] + # nnet_output is (N, T, C) # NOTE: We need `encode_supervisions` to sort sequences with # different duration in decreasing order, required by diff --git a/egs/yesno/ASR/tdnn/decode.py b/egs/yesno/ASR/tdnn/decode.py index 54fdbb3cc..325acf316 100755 --- a/egs/yesno/ASR/tdnn/decode.py +++ b/egs/yesno/ASR/tdnn/decode.py @@ -111,10 +111,10 @@ def decode_one_batch( feature = batch["inputs"] assert feature.ndim == 3 feature = feature.to(device) - # at entry, feature is [N, T, C] + # at entry, feature is (N, T, C) nnet_output = model(feature) - # nnet_output is [N, T, C] + # nnet_output is (N, T, C) batch_size = nnet_output.shape[0] supervision_segments = torch.tensor( diff --git a/egs/yesno/ASR/tdnn/train.py b/egs/yesno/ASR/tdnn/train.py index 39c5ef3ef..0f5506d38 100755 --- a/egs/yesno/ASR/tdnn/train.py +++ 
diff --git a/egs/yesno/ASR/tdnn/train.py b/egs/yesno/ASR/tdnn/train.py
index 39c5ef3ef..0f5506d38 100755
--- a/egs/yesno/ASR/tdnn/train.py
+++ b/egs/yesno/ASR/tdnn/train.py
@@ -268,13 +268,13 @@ def compute_loss(
     """
     device = graph_compiler.device
     feature = batch["inputs"]
-    # at entry, feature is [N, T, C]
+    # at entry, feature is (N, T, C)
     assert feature.ndim == 3
     feature = feature.to(device)

     with torch.set_grad_enabled(is_training):
         nnet_output = model(feature)
-        # nnet_output is [N, T, C]
+        # nnet_output is (N, T, C)

         # NOTE: We need `encode_supervisions` to sort sequences with
         # different duration in decreasing order, required by
diff --git a/icefall/decode.py b/icefall/decode.py
index 73e3b61f7..573c6bf78 100644
--- a/icefall/decode.py
+++ b/icefall/decode.py
@@ -78,7 +78,7 @@ def get_lattice(
       network output.
     Args:
       nnet_output:
-        It is the output of a neural model of shape `[N, T, C]`.
+        It is the output of a neural model of shape `(N, T, C)`.
       HLG:
         An Fsa, the decoding graph. See also `compile_HLG.py`.
       supervision_segments:
@@ -108,10 +108,12 @@ def get_lattice(
       subsampling_factor:
         The subsampling factor of the model.
     Returns:
-      A lattice containing the decoding result.
+      An FsaVec containing the decoding result. It has axes [utt][state][arc].
     """
     dense_fsa_vec = k2.DenseFsaVec(
-        nnet_output, supervision_segments, allow_truncate=subsampling_factor - 1
+        nnet_output,
+        supervision_segments,
+        allow_truncate=subsampling_factor - 1,
     )

     lattice = k2.intersect_dense_pruned(
@@ -138,6 +140,8 @@ def levenshtein_graph(symbol_ids: List[int]) -> k2.Fsa:
     Args:
       symbol_ids:
         A list of symbol IDs (excluding 0 and -1)
+    Returns:
+      Return an Fsa (with 2 axes [state][arc]).
     """
     assert 0 not in symbol_ids
     assert -1 not in symbol_ids
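Editor's note: a hedged sketch of how the reformatted k2.DenseFsaVec call above fits into a get_lattice-style helper. Only the first line of the intersect_dense_pruned call appears in this patch, so the remaining arguments and the beam/active-state values below are my assumptions, chosen for illustration:

import k2
import torch

def toy_get_lattice(
    nnet_output: torch.Tensor,           # (N, T, C) log-probs from the model
    HLG: k2.Fsa,                         # decoding graph, see compile_HLG.py
    supervision_segments: torch.Tensor,  # int32 rows of (seq_idx, start, num_frames)
    subsampling_factor: int = 4,
) -> k2.Fsa:
    dense_fsa_vec = k2.DenseFsaVec(
        nnet_output,
        supervision_segments,
        allow_truncate=subsampling_factor - 1,
    )
    # The result is an FsaVec with axes [utt][state][arc], as the updated
    # docstring above states.
    return k2.intersect_dense_pruned(
        HLG,
        dense_fsa_vec,
        search_beam=20.0,
        output_beam=8.0,
        min_active_states=30,
        max_active_states=10000,
    )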