Mirror of https://github.com/k2-fsa/icefall.git (synced 2025-08-09 10:02:22 +00:00)
Refactor transformer.py
Commit f6091b10c0 (parent 1fa30998da)
@@ -89,15 +89,21 @@ class Conformer(Transformer):
     ) -> Tuple[Tensor, Optional[Tensor]]:
         """
         Args:
-          x: Tensor of dimension (batch_size, num_features, input_length).
-          supervisions : Supervison in lhotse format, i.e., batch['supervisions']
+          x:
+            The model input. Its shape is [N, T, C].
+          supervisions:
+            Supervision in lhotse format.
+            See https://github.com/lhotse-speech/lhotse/blob/master/lhotse/dataset/speech_recognition.py#L32  # noqa
+            CAUTION: It contains length information, i.e., start and number of
+            frames, before subsampling.
+            It is read directly from the batch, without any sorting. It is used
+            to compute the encoder padding mask, which is used as the memory
+            key padding mask for the decoder.

         Returns:
           Tensor: Predictor tensor of dimension (input_length, batch_size, d_model).
           Tensor: Mask tensor of dimension (batch_size, input_length)
         """
-        x = x.permute(0, 2, 1)  # (B, F, T) -> (B, T, F)
-
         x = self.encoder_embed(x)
         x, pos_emb = self.encoder_pos(x)
         x = x.permute(1, 0, 2)  # (B, T, F) -> (T, B, F)
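A minimal sketch of the padding-mask arithmetic the new docstring describes (plain PyTorch, not part of the commit; the actual helper in icefall may differ):

    import torch

    supervisions = {
        "sequence_idx": torch.tensor([0, 1]),
        "start_frame": torch.tensor([0, 0]),
        "num_frames": torch.tensor([16, 8]),  # lengths BEFORE subsampling
    }
    # After 1/4 subsampling, each length becomes ((T - 1) // 2 - 1) // 2.
    lens = ((supervisions["num_frames"] - 1) // 2 - 1) // 2  # tensor([3, 1])
    max_len = int(lens.max())
    # True marks padded positions; the mask has shape [N, max_len] and is
    # later used as the memory key padding mask for the decoder.
    mask = torch.arange(max_len).unsqueeze(0) >= lens.unsqueeze(1)
    print(mask)  # tensor([[False, False, False], [False,  True,  True]])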
@@ -796,8 +802,7 @@ class RelPositionMultiheadAttention(nn.Module):
                 bsz, num_heads, tgt_len, src_len
             )
             attn_output_weights = attn_output_weights.masked_fill(
-                key_padding_mask.unsqueeze(1).unsqueeze(2),
-                float("-inf"),
+                key_padding_mask.unsqueeze(1).unsqueeze(2), float("-inf"),
             )
             attn_output_weights = attn_output_weights.view(
                 bsz * num_heads, tgt_len, src_len
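The masked_fill call above is the standard way to exclude padded source positions from attention; a self-contained sketch (not from the commit):

    import torch

    bsz, num_heads, tgt_len, src_len = 1, 2, 3, 4
    attn_output_weights = torch.zeros(bsz, num_heads, tgt_len, src_len)
    # key_padding_mask has shape [bsz, src_len]; True marks padding.
    key_padding_mask = torch.tensor([[False, False, True, True]])
    # unsqueeze(1).unsqueeze(2) -> [bsz, 1, 1, src_len], broadcasting over
    # heads and target positions; -inf scores get zero softmax probability.
    attn_output_weights = attn_output_weights.masked_fill(
        key_padding_mask.unsqueeze(1).unsqueeze(2), float("-inf")
    )
    print(attn_output_weights.softmax(dim=-1)[0, 0, 0])
    # tensor([0.5000, 0.5000, 0.0000, 0.0000])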
@@ -867,12 +872,7 @@ class ConvolutionModule(nn.Module):
         )
         self.norm = nn.BatchNorm1d(channels)
         self.pointwise_conv2 = nn.Conv1d(
-            channels,
-            channels,
-            kernel_size=1,
-            stride=1,
-            padding=0,
-            bias=bias,
+            channels, channels, kernel_size=1, stride=1, padding=0, bias=bias,
         )
         self.activation = Swish()
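The arguments are only collapsed onto one line; behavior is unchanged. A kernel_size=1 Conv1d like pointwise_conv2 acts as a per-frame linear projection over channels, e.g. (illustration only):

    import torch
    import torch.nn as nn

    pw = nn.Conv1d(4, 4, kernel_size=1, stride=1, padding=0, bias=True)
    x = torch.rand(2, 4, 10)       # [N, C, T]
    y = pw(x)
    assert y.shape == (2, 4, 10)   # time length is unchanged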
@@ -147,15 +147,10 @@ def decode_one_batch(
     feature = feature.to(device)
     # at entry, feature is [N, T, C]
-
-    feature = feature.permute(0, 2, 1)  # now feature is [N, C, T]

     supervisions = batch["supervisions"]

     nnet_output, memory, memory_key_padding_mask = model(feature, supervisions)
-    # nnet_output is [N, C, T]
+    # nnet_output is [N, T, C]

-    nnet_output = nnet_output.permute(0, 2, 1)
-    # now nnet_output is [N, T, C]
-
     supervision_segments = torch.stack(
         (
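The deleted lines performed the layout round trip sketched below; after this refactor the model consumes and returns [N, T, C] directly, so callers no longer permute (sketch, not from the commit):

    import torch

    N, T, C = 2, 5, 3
    feature = torch.rand(N, T, C)           # [N, T, C], as read from the batch
    old_style = feature.permute(0, 2, 1)    # [N, C, T], the pre-refactor layout
    assert old_style.permute(0, 2, 1).shape == (N, T, C)  # inverse round trip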
@@ -227,6 +222,8 @@ def decode_one_batch(
             model=model,
             memory=memory,
             memory_key_padding_mask=memory_key_padding_mask,
+            sos_id=lexicon.sos_id,
+            eos_id=lexicon.eos_id,
         )
     else:
         assert False, f"Unsupported decoding method: {params.method}"
@@ -468,5 +465,8 @@ def main():
     logging.info("Done!")


+torch.set_num_threads(1)
+torch.set_num_interop_threads(1)
+
 if __name__ == "__main__":
     main()

egs/librispeech/ASR/conformer_ctc/subsampling.py (new file, 144 lines)
@@ -0,0 +1,144 @@
+import torch
+import torch.nn as nn
+
+
+class Conv2dSubsampling(nn.Module):
+    """Convolutional 2D subsampling (to 1/4 length).
+
+    Convert an input of shape [N, T, idim] to an output
+    with shape [N, T', odim], where
+    T' = ((T-1)//2 - 1)//2, which approximates T' == T//4
+
+    It is based on
+    https://github.com/espnet/espnet/blob/master/espnet/nets/pytorch_backend/transformer/subsampling.py  # noqa
+    """
+
+    def __init__(self, idim: int, odim: int) -> None:
+        """
+        Args:
+          idim:
+            Input dim. The input shape is [N, T, idim].
+            Caution: It requires: T >= 7, idim >= 7
+          odim:
+            Output dim. The output shape is [N, ((T-1)//2 - 1)//2, odim]
+        """
+        assert idim >= 7
+        super().__init__()
+        self.conv = nn.Sequential(
+            nn.Conv2d(
+                in_channels=1, out_channels=odim, kernel_size=3, stride=2
+            ),
+            nn.ReLU(),
+            nn.Conv2d(
+                in_channels=odim, out_channels=odim, kernel_size=3, stride=2
+            ),
+            nn.ReLU(),
+        )
+        self.out = nn.Linear(odim * (((idim - 1) // 2 - 1) // 2), odim)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Subsample x.
+
+        Args:
+          x:
+            Its shape is [N, T, idim].
+
+        Returns:
+          Return a tensor of shape [N, ((T-1)//2 - 1)//2, odim]
+        """
+        # On entry, x is [N, T, idim]
+        x = x.unsqueeze(1)  # [N, T, idim] -> [N, 1, T, idim], i.e., [N, C, H, W]
+        x = self.conv(x)
+        # Now x is of shape [N, odim, ((T-1)//2 - 1)//2, ((idim-1)//2 - 1)//2]
+        b, c, t, f = x.size()
+        x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f))
+        # Now x is of shape [N, ((T-1)//2 - 1)//2, odim]
+        return x
+
+
+class VggSubsampling(nn.Module):
+    """Trying to follow the setup described in the following paper:
+    https://arxiv.org/pdf/1910.09799.pdf
+
+    This paper is not 100% explicit, so I am guessing to some extent
+    and trying to compare with other VGG implementations.
+
+    Convert an input of shape [N, T, idim] to an output
+    with shape [N, T', odim], where
+    T' = ((T-1)//2 - 1)//2, which approximates T' = T//4
+    """
+
+    def __init__(self, idim: int, odim: int) -> None:
+        """Construct a VggSubsampling object.
+
+        This uses 2 VGG blocks with 2 Conv2d layers each,
+        subsampling its input by a factor of 4 in the time dimension.
+
+        Args:
+          idim:
+            Input dim. The input shape is [N, T, idim].
+            Caution: It requires: T >= 7, idim >= 7
+          odim:
+            Output dim. The output shape is [N, ((T-1)//2 - 1)//2, odim]
+        """
+        super().__init__()
+
+        cur_channels = 1
+        layers = []
+        block_dims = [32, 64]
+
+        # The decision to use padding=1 for the 1st convolution, then padding=0
+        # for the 2nd and for the max-pooling, and ceil_mode=True, was driven by
+        # a back-compatibility concern so that the number of frames at the
+        # output would be equal to:
+        #   (((T-1)//2)-1)//2.
+        # We can consider changing this by using padding=1 on the
+        # 2nd convolution, so the num-frames at the output would be T//4.
+        for block_dim in block_dims:
+            layers.append(
+                torch.nn.Conv2d(
+                    in_channels=cur_channels,
+                    out_channels=block_dim,
+                    kernel_size=3,
+                    padding=1,
+                    stride=1,
+                )
+            )
+            layers.append(torch.nn.ReLU())
+            layers.append(
+                torch.nn.Conv2d(
+                    in_channels=block_dim,
+                    out_channels=block_dim,
+                    kernel_size=3,
+                    padding=0,
+                    stride=1,
+                )
+            )
+            layers.append(
+                torch.nn.MaxPool2d(
+                    kernel_size=2, stride=2, padding=0, ceil_mode=True
+                )
+            )
+            cur_channels = block_dim
+
+        self.layers = nn.Sequential(*layers)
+
+        self.out = nn.Linear(
+            block_dims[-1] * (((idim - 1) // 2 - 1) // 2), odim
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Subsample x.
+
+        Args:
+          x:
+            Its shape is [N, T, idim].
+
+        Returns:
+          Return a tensor of shape [N, ((T-1)//2 - 1)//2, odim]
+        """
+        x = x.unsqueeze(1)
+        x = self.layers(x)
+        b, c, t, f = x.size()
+        x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f))
+        return x
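A quick shape check of the new module (my own example; the tests below exercise the same formula more thoroughly). For T = 100: ((100 - 1) // 2 - 1) // 2 = 24, close to 100 // 4 = 25.

    import torch
    from subsampling import Conv2dSubsampling  # assumes the new file is importable

    model = Conv2dSubsampling(idim=80, odim=256)
    x = torch.rand(4, 100, 80)      # [N, T, idim]
    y = model(x)
    print(y.shape)                  # torch.Size([4, 24, 256])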

egs/librispeech/ASR/conformer_ctc/test_subsampling.py (new executable file, 33 lines)
@@ -0,0 +1,33 @@
+#!/usr/bin/env python3
+
+from subsampling import Conv2dSubsampling
+from subsampling import VggSubsampling
+import torch
+
+
+def test_conv2d_subsampling():
+    N = 3
+    odim = 2
+
+    for T in range(7, 19):
+        for idim in range(7, 20):
+            model = Conv2dSubsampling(idim=idim, odim=odim)
+            x = torch.empty(N, T, idim)
+            y = model(x)
+            assert y.shape[0] == N
+            assert y.shape[1] == ((T - 1) // 2 - 1) // 2
+            assert y.shape[2] == odim
+
+
+def test_vgg_subsampling():
+    N = 3
+    odim = 2
+
+    for T in range(7, 19):
+        for idim in range(7, 20):
+            model = VggSubsampling(idim=idim, odim=odim)
+            x = torch.empty(N, T, idim)
+            y = model(x)
+            assert y.shape[0] == N
+            assert y.shape[1] == ((T - 1) // 2 - 1) // 2
+            assert y.shape[2] == odim

egs/librispeech/ASR/conformer_ctc/test_transformer.py (new file, 36 lines)
@@ -0,0 +1,36 @@
+#!/usr/bin/env python3
+
+import torch
+from transformer import Transformer, encoder_padding_mask
+
+
+def test_encoder_padding_mask():
+    supervisions = {
+        "sequence_idx": torch.tensor([0, 1, 2]),
+        "start_frame": torch.tensor([0, 0, 0]),
+        "num_frames": torch.tensor([18, 7, 13]),
+    }
+
+    max_len = ((18 - 1) // 2 - 1) // 2
+    mask = encoder_padding_mask(max_len, supervisions)
+    expected_mask = torch.tensor(
+        [
+            [False, False, False],  # ((18 - 1)//2 - 1)//2 == 3
+            [False, True, True],  # ((7 - 1)//2 - 1)//2 == 1
+            [False, False, True],  # ((13 - 1)//2 - 1)//2 == 2
+        ]
+    )
+    assert torch.all(torch.eq(mask, expected_mask))
+
+
+def test_transformer():
+    num_features = 40
+    num_classes = 87
+    model = Transformer(num_features=num_features, num_classes=num_classes)
+
+    N = 31
+
+    for T in range(7, 30):
+        x = torch.rand(N, T, num_features)
+        y, _, _ = model(x)
+        assert y.shape == (N, (((T - 1) // 2) - 1) // 2, num_classes)
@@ -275,15 +275,13 @@ def compute_loss(
     device = graph_compiler.device
     feature = batch["inputs"]
     # at entry, feature is [N, T, C]
-    feature = feature.permute(0, 2, 1)  # now feature is [N, C, T]
     assert feature.ndim == 3
     feature = feature.to(device)

     supervisions = batch["supervisions"]
     with torch.set_grad_enabled(is_training):
         nnet_output, encoder_memory, memory_mask = model(feature, supervisions)
-        # nnet_output is [N, C, T]
-        nnet_output = nnet_output.permute(0, 2, 1)  # [N, C, T] -> [N, T, C]
+        # nnet_output is [N, T, C]

     # NOTE: We need `encode_supervisions` to sort sequences with
     # different duration in decreasing order, required by
@@ -536,6 +534,22 @@ def train_one_epoch(
                 f" best valid loss: {params.best_valid_loss:.4f} "
                 f"best valid epoch: {params.best_valid_epoch}"
             )
+            if tb_writer is not None:
+                tb_writer.add_scalar(
+                    "train/valid_ctc_loss",
+                    params.valid_ctc_loss,
+                    params.batch_idx_train,
+                )
+                tb_writer.add_scalar(
+                    "train/valid_att_loss",
+                    params.valid_att_loss,
+                    params.batch_idx_train,
+                )
+                tb_writer.add_scalar(
+                    "train/valid_loss",
+                    params.valid_loss,
+                    params.batch_idx_train,
+                )

         params.train_loss = tot_loss / tot_frames
@@ -675,5 +689,8 @@ def main():
     run(rank=0, world_size=1, args=args)


+torch.set_num_threads(1)
+torch.set_num_interop_threads(1)
+
 if __name__ == "__main__":
     main()

(File diff suppressed because it is too large.)
@@ -11,11 +11,18 @@ import logging
 import os
 from pathlib import Path

+import torch
 from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer
 from lhotse.recipes.utils import read_manifests_if_cached

 from icefall.utils import get_executor

+# Torch's multithreaded behavior needs to be disabled or it wastes a lot of CPU
+# and slows things down. Do this outside of main() in case it needs to take
+# effect even when we are not invoking the main (e.g. when spawning subprocesses).
+torch.set_num_threads(1)
+torch.set_num_interop_threads(1)
+
+
 def compute_fbank_librispeech():
     src_dir = Path("data/manifests")
@@ -46,8 +53,7 @@ def compute_fbank_librispeech():
             continue
         logging.info(f"Processing {partition}")
         cut_set = CutSet.from_manifests(
-            recordings=m["recordings"],
-            supervisions=m["supervisions"],
+            recordings=m["recordings"], supervisions=m["supervisions"],
         )
         if "train" in partition:
             cut_set = (
@@ -11,11 +11,18 @@ import logging
 import os
 from pathlib import Path

+import torch
 from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer, combine
 from lhotse.recipes.utils import read_manifests_if_cached

 from icefall.utils import get_executor

+# Torch's multithreaded behavior needs to be disabled or it wastes a lot of CPU
+# and slows things down. Do this outside of main() in case it needs to take
+# effect even when we are not invoking the main (e.g. when spawning subprocesses).
+torch.set_num_threads(1)
+torch.set_num_interop_threads(1)
+
+
 def compute_fbank_musan():
     src_dir = Path("data/manifests")
@@ -555,11 +555,14 @@ def rescore_with_attention_decoder(
     model: nn.Module,
     memory: torch.Tensor,
     memory_key_padding_mask: torch.Tensor,
+    sos_id: int,
+    eos_id: int,
 ) -> Dict[str, k2.Fsa]:
     """This function extracts n paths from the given lattice and uses
     an attention decoder to rescore them. The path with the highest
     score is used as the decoding output.

+    Args:
       lattice:
         An FsaVec. It can be the return value of :func:`get_lattice`.
       num_paths:
@@ -573,6 +576,10 @@ def rescore_with_attention_decoder(
         Its shape is `[T, N, C]`.
       memory_key_padding_mask:
         The padding mask for memory with shape [N, T].
+      sos_id:
+        The token ID for SOS.
+      eos_id:
+        The token ID for EOS.
     Returns:
       A dict of FsaVec, whose key contains a string
       ngram_lm_scale_attention_scale and the value is the
@@ -661,7 +668,11 @@ def rescore_with_attention_decoder(

     # TODO: pass the sos_token_id and eos_token_id via function arguments
     nll = model.decoder_nll(
-        expanded_memory, expanded_memory_key_padding_mask, token_ids, 1, 1
+        memory=expanded_memory,
+        memory_key_padding_mask=expanded_memory_key_padding_mask,
+        token_ids=token_ids,
+        sos_id=sos_id,
+        eos_id=eos_id,
    )
     assert nll.ndim == 2
     assert nll.shape[0] == num_word_seqs
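With this change, callers supply the SOS/EOS IDs explicitly instead of the previously hard-coded 1, 1. A hedged call sketch (lattice, model, memory, memory_key_padding_mask, and lexicon are placeholders from the caller's scope; num_paths=100 is an arbitrary choice, not a value from the commit):

    best_path_dict = rescore_with_attention_decoder(
        lattice=lattice,
        num_paths=100,
        model=model,
        memory=memory,
        memory_key_padding_mask=memory_key_padding_mask,
        sos_id=lexicon.sos_id,
        eos_id=lexicon.eos_id,
    )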