mirror of https://github.com/k2-fsa/icefall.git
synced 2025-08-26 18:24:18 +00:00
add train.py, model.py
This commit is contained in:
parent
c87f55671a
commit
2fc7535de9
@@ -1,5 +1,5 @@
#!/usr/bin/env python3
# Copyright 2022 Xiaomi Corp. (authors: Zengwei Yao)
# Copyright 2022 Xiaomi Corp. (authors: Daniel Povey, Zengwei Yao)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
@@ -47,8 +47,7 @@ class AttentionDecoderModel(nn.Module):
    """
    Args:
      vocab_size (int): Number of classes.
      encoder_dim (int):
      d_model: (int,int): embedding dimension of 2 encoder stacks
      decoder_dim: (int,int): embedding dimension of 2 encoder stacks
      attention_dim: (int,int): attention dimension of 2 encoder stacks
      nhead (int, int): number of heads
      dim_feedforward (int, int): feedforward dimension in 2 encoder stacks
@@ -62,15 +61,15 @@ class AttentionDecoderModel(nn.Module):
    def __init__(
        self,
        vocab_size: int,
        d_model: int,
        decoder_dim: int,
        unmasked_dim: int,
        num_decoder_layers: int,
        attention_dim: int,
        nhead: int,
        feedforward_dim: int,
        dropout: float,
        sos_id: int,
        eos_id: int,
        dropout: float = 0.1,
        ignore_id: int = -1,
        warmup_batches: float = 4000.0,
        label_smoothing: float = 0.1,
@@ -84,7 +83,7 @@ class AttentionDecoderModel(nn.Module):
        # layer learn something. Then we start to warm up the other encoders.
        self.decoder = TransformerDecoder(
            vocab_size,
            d_model,
            decoder_dim,
            unmasked_dim,
            num_decoder_layers,
            attention_dim,
@@ -103,7 +102,6 @@ class AttentionDecoderModel(nn.Module):
    def _pre_ys_in_out(self, token_ids: List[List[int]], device: torch.device):
        """Prepare ys_in_pad and ys_out_pad."""
        ys = k2.RaggedTensor(token_ids).to(device=device)

        row_splits = ys.shape.row_splits(1)
        ys_lens = row_splits[1:] - row_splits[:-1]

@@ -168,10 +166,9 @@ class AttentionDecoderModel(nn.Module):
            decoder_out.view(-1, num_classes),
            ys_out_pad.view(-1),
            ignore_index=self.ignore_id,
            reduction="None",
            reduction="none",
        )
        nll = nll.view(batch_size, -1)
        nll = nll.sum(1)
        nll = nll.view(batch_size, -1).sum(1)
        return nll
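The last hunk above replaces `reduction="None"` with the lowercase `reduction="none"` that `F.cross_entropy` actually accepts, and fuses the per-utterance reduction into a single line. A minimal standalone sketch of that pattern (tensor names and sizes below are illustrative, not taken from the repo):

import torch
import torch.nn.functional as F

batch_size, seq_len, num_classes = 2, 5, 10
decoder_out = torch.randn(batch_size, seq_len, num_classes)
ys_out_pad = torch.randint(0, num_classes, (batch_size, seq_len))
ignore_id = -1

# Per-token negative log-likelihood; positions labeled with ignore_id contribute 0.
nll = F.cross_entropy(
    decoder_out.view(-1, num_classes),
    ys_out_pad.view(-1),
    ignore_index=ignore_id,
    reduction="none",
)
# One NLL value per utterance, as in the fused view(...).sum(1) line above.
nll = nll.view(batch_size, -1).sum(1)  # shape: (batch_size,)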

@@ -181,7 +178,7 @@ class TransformerDecoder(nn.Module):

    Args:
      vocab_size: output dim
      d_model: equal to encoder_dim
      d_model: decoder dimension
      num_decoder_layers: number of decoder layers
      attention_dim: total dimension of multi head attention
      n_head: number of attention heads
@@ -715,7 +712,7 @@ def subsequent_mask(size, device="cpu", dtype=torch.bool):
def _test_attention_decoder_model():
    m = AttentionDecoderModel(
        vocab_size=500,
        d_model=384,
        decoder_dim=384,
        unmasked_dim=256,
        num_decoder_layers=6,
        attention_dim=192,
@@ -733,6 +730,9 @@ def _test_attention_decoder_model():
    loss = m.calc_att_loss(encoder_out, encoder_out_lens, token_ids)
    print(loss)

    nll = m.nll(encoder_out, encoder_out_lens, token_ids)
    print(nll)


if __name__ == "__main__":
    _test_attention_decoder_model()
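The hunk header above references `subsequent_mask(size, device="cpu", dtype=torch.bool)`. Such a helper is conventionally a causal mask for decoder self-attention; a minimal sketch under that assumption (not necessarily the exact implementation in this file):

import torch

def subsequent_mask(size, device="cpu", dtype=torch.bool):
    # Position i may attend to positions <= i: a (size, size)
    # lower-triangular matrix of True values.
    return torch.tril(torch.ones(size, size, device=device)).to(dtype)

# Example: a 4x4 causal mask for a decoder input of length 4.
print(subsequent_mask(4))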
95 egs/librispeech/ASR/zipformer_ctc_attn/model.py Normal file
@@ -0,0 +1,95 @@
# Copyright 2023 Xiaomi Corp. (authors: Zengwei Yao)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from typing import List, Tuple

import torch
import torch.nn as nn
from encoder_interface import EncoderInterface


class CTCAttentionModel(nn.Module):
    """Hybrid CTC & attention decoder model."""

    def __init__(
        self,
        encoder: EncoderInterface,
        decoder: nn.Module,
        encoder_dim: int,
        vocab_size: int,
    ):
        """
        Args:
          encoder:
            It is the Zipformer encoder model. It accepts
            two inputs: `x` of shape (N, T, encoder_dim) and `x_lens` of shape (N,).
            It returns two tensors: `logits` of shape (N, T, encoder_dim) and
            `logit_lens` of shape (N,).
          decoder:
            It is the attention decoder.
          encoder_dim:
            The embedding dimension of the encoder.
          vocab_size:
            The vocabulary size.
        """
        super().__init__()
        assert isinstance(encoder, EncoderInterface), type(encoder)

        self.encoder = encoder
        self.ctc_output = nn.Sequential(
            nn.Dropout(p=0.1),
            nn.Linear(encoder_dim, vocab_size),
            nn.LogSoftmax(dim=-1),
        )
        # Attention decoder
        self.decoder = decoder

    def forward(
        self,
        x: torch.Tensor,
        x_lens: torch.Tensor,
        token_ids: List[List[int]],
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Args:
          x:
            A 3-D tensor of shape (N, T, C).
          x_lens:
            A 1-D tensor of shape (N,). It contains the number of frames in `x`
            before padding.
          token_ids:
            A list of token id lists.

        Returns:
          - ctc_output, CTC log-probs
          - att_loss, attention decoder loss
        """
        assert x.ndim == 3, x.shape
        assert x_lens.ndim == 1, x_lens.shape
        assert x.size(0) == x_lens.size(0) == len(token_ids)

        # encoder forward
        encoder_out, x_lens = self.encoder(x, x_lens)
        assert torch.all(x_lens > 0)

        # compute ctc log-probs
        ctc_output = self.ctc_output(encoder_out)

        # compute attention decoder loss
        att_loss = self.decoder.calc_att_loss(encoder_out, x_lens, token_ids)

        return ctc_output, att_loss
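To make the expected shapes concrete, here is a minimal usage sketch of `CTCAttentionModel`. The `DummyEncoder` and `DummyDecoder` below are hypothetical stand-ins (in the recipe the encoder is the Zipformer and the decoder is the attention decoder changed above), and the CTC-loss lines only illustrate one possible way the returned log-probs could be consumed, since train.py is not shown here:

import torch
import torch.nn as nn
import torch.nn.functional as F
from encoder_interface import EncoderInterface

class DummyEncoder(EncoderInterface):
    """Stand-in encoder: projects 80-dim features to encoder_dim, keeps lengths."""
    def __init__(self, encoder_dim: int):
        super().__init__()
        self.proj = nn.Linear(80, encoder_dim)

    def forward(self, x, x_lens):
        return self.proj(x), x_lens

class DummyDecoder(nn.Module):
    """Stand-in attention decoder exposing the calc_att_loss interface."""
    def calc_att_loss(self, encoder_out, encoder_out_lens, token_ids):
        return encoder_out.sum() * 0.0  # placeholder scalar loss

model = CTCAttentionModel(
    encoder=DummyEncoder(encoder_dim=384),
    decoder=DummyDecoder(),
    encoder_dim=384,
    vocab_size=500,
)

x = torch.randn(2, 100, 80)       # (N, T, C) features
x_lens = torch.tensor([100, 90])  # valid frames per utterance
token_ids = [[1, 2, 3], [4, 5]]

ctc_output, att_loss = model(x, x_lens, token_ids)
print(ctc_output.shape)  # (2, 100, 500) log-probs
print(att_loss)

# One possible way to turn the log-probs into a CTC loss (illustrative only):
targets = torch.tensor([t for ids in token_ids for t in ids])
target_lens = torch.tensor([len(ids) for ids in token_ids])
ctc_loss = F.ctc_loss(
    ctc_output.permute(1, 0, 2),  # (T, N, C)
    targets,
    x_lens,  # the dummy encoder does not subsample, so input lengths are unchanged
    target_lens,
    blank=0,
    reduction="sum",
)
print(ctc_loss)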
1268 egs/librispeech/ASR/zipformer_ctc_attn/train.py Executable file
File diff suppressed because it is too large