mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-27 18:54:18 +00:00
refactor attention decoder
This commit is contained in:
parent
1503351833
commit
0be32f3da0
@ -36,7 +36,7 @@ class AttentionDecoderModel(nn.Module):
|
|||||||
vocab_size (int): Number of classes.
|
vocab_size (int): Number of classes.
|
||||||
decoder_dim: (int,int): embedding dimension of 2 encoder stacks
|
decoder_dim: (int,int): embedding dimension of 2 encoder stacks
|
||||||
attention_dim: (int,int): attention dimension of 2 encoder stacks
|
attention_dim: (int,int): attention dimension of 2 encoder stacks
|
||||||
nhead (int, int): number of heads
|
num_heads (int, int): number of heads
|
||||||
dim_feedforward (int, int): feedforward dimension in 2 encoder stacks
|
dim_feedforward (int, int): feedforward dimension in 2 encoder stacks
|
||||||
num_encoder_layers (int): number of encoder layers
|
num_encoder_layers (int): number of encoder layers
|
||||||
dropout (float): dropout rate
|
dropout (float): dropout rate
|
||||||
@ -48,7 +48,7 @@ class AttentionDecoderModel(nn.Module):
|
|||||||
decoder_dim: int = 512,
|
decoder_dim: int = 512,
|
||||||
num_decoder_layers: int = 6,
|
num_decoder_layers: int = 6,
|
||||||
attention_dim: int = 512,
|
attention_dim: int = 512,
|
||||||
nhead: int = 8,
|
num_heads: int = 8,
|
||||||
feedforward_dim: int = 2048,
|
feedforward_dim: int = 2048,
|
||||||
memory_dim: int = 512,
|
memory_dim: int = 512,
|
||||||
sos_id: int = 1,
|
sos_id: int = 1,
|
||||||
@ -69,7 +69,7 @@ class AttentionDecoderModel(nn.Module):
|
|||||||
d_model=decoder_dim,
|
d_model=decoder_dim,
|
||||||
num_decoder_layers=num_decoder_layers,
|
num_decoder_layers=num_decoder_layers,
|
||||||
attention_dim=attention_dim,
|
attention_dim=attention_dim,
|
||||||
nhead=nhead,
|
num_heads=num_heads,
|
||||||
feedforward_dim=feedforward_dim,
|
feedforward_dim=feedforward_dim,
|
||||||
memory_dim=memory_dim,
|
memory_dim=memory_dim,
|
||||||
dropout=dropout,
|
dropout=dropout,
|
||||||
@ -111,7 +111,12 @@ class AttentionDecoderModel(nn.Module):
|
|||||||
ys_in_pad, ys_in_lens, ys_out_pad = self._pre_ys_in_out(ys, ys_lens)
|
ys_in_pad, ys_in_lens, ys_out_pad = self._pre_ys_in_out(ys, ys_lens)
|
||||||
|
|
||||||
# decoder forward
|
# decoder forward
|
||||||
decoder_out = self.decoder(encoder_out, encoder_out_lens, ys_in_pad, ys_in_lens)
|
decoder_out = self.decoder(
|
||||||
|
x=ys_in_pad,
|
||||||
|
x_lens=ys_in_lens,
|
||||||
|
memory=encoder_out,
|
||||||
|
memory_lens=encoder_out_lens,
|
||||||
|
)
|
||||||
|
|
||||||
loss = self.loss_fun(x=decoder_out, target=ys_out_pad)
|
loss = self.loss_fun(x=decoder_out, target=ys_out_pad)
|
||||||
return loss
|
return loss
|
||||||
@ -137,7 +142,12 @@ class AttentionDecoderModel(nn.Module):
|
|||||||
ys_in_pad, ys_in_lens, ys_out_pad = self._pre_ys_in_out(ys, ys_lens)
|
ys_in_pad, ys_in_lens, ys_out_pad = self._pre_ys_in_out(ys, ys_lens)
|
||||||
|
|
||||||
# decoder forward
|
# decoder forward
|
||||||
decoder_out = self.decoder(encoder_out, encoder_out_lens, ys_in_pad, ys_in_lens)
|
decoder_out = self.decoder(
|
||||||
|
x=ys_in_pad,
|
||||||
|
x_lens=ys_in_lens,
|
||||||
|
memory=encoder_out,
|
||||||
|
memory_lens=encoder_out_lens,
|
||||||
|
)
|
||||||
|
|
||||||
batch_size, _, num_classes = decoder_out.size()
|
batch_size, _, num_classes = decoder_out.size()
|
||||||
nll = nn.functional.cross_entropy(
|
nll = nn.functional.cross_entropy(
|
||||||
@ -152,14 +162,13 @@ class AttentionDecoderModel(nn.Module):
|
|||||||
|
|
||||||
class TransformerDecoder(nn.Module):
|
class TransformerDecoder(nn.Module):
|
||||||
"""Transfomer decoder module.
|
"""Transfomer decoder module.
|
||||||
It is modified from https://github.com/espnet/espnet/blob/master/espnet2/asr/decoder/transformer_decoder.py.
|
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
vocab_size: output dim
|
vocab_size: output dim
|
||||||
d_model: decoder dimension
|
d_model: decoder dimension
|
||||||
num_decoder_layers: number of decoder layers
|
num_decoder_layers: number of decoder layers
|
||||||
attention_dim: total dimension of multi head attention
|
attention_dim: total dimension of multi head attention
|
||||||
n_head: number of attention heads
|
num_heads: number of attention heads
|
||||||
feedforward_dim: hidden dimension of feed_forward module
|
feedforward_dim: hidden dimension of feed_forward module
|
||||||
dropout: dropout rate
|
dropout: dropout rate
|
||||||
"""
|
"""
|
||||||
@ -170,7 +179,7 @@ class TransformerDecoder(nn.Module):
|
|||||||
d_model: int = 512,
|
d_model: int = 512,
|
||||||
num_decoder_layers: int = 6,
|
num_decoder_layers: int = 6,
|
||||||
attention_dim: int = 512,
|
attention_dim: int = 512,
|
||||||
nhead: int = 8,
|
num_heads: int = 8,
|
||||||
feedforward_dim: int = 2048,
|
feedforward_dim: int = 2048,
|
||||||
memory_dim: int = 512,
|
memory_dim: int = 512,
|
||||||
dropout: float = 0.1,
|
dropout: float = 0.1,
|
||||||
@ -178,14 +187,19 @@ class TransformerDecoder(nn.Module):
|
|||||||
super().__init__()
|
super().__init__()
|
||||||
self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)
|
self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)
|
||||||
|
|
||||||
# Using absolute positional encoding
|
# Absolute positional encoding
|
||||||
self.pos = PositionalEncoding(d_model, dropout_rate=0.1)
|
self.pos = PositionalEncoding(d_model, dropout_rate=0.1)
|
||||||
|
|
||||||
self.num_layers = num_decoder_layers
|
self.num_layers = num_decoder_layers
|
||||||
self.layers = nn.ModuleList(
|
self.layers = nn.ModuleList(
|
||||||
[
|
[
|
||||||
DecoderLayer(
|
DecoderLayer(
|
||||||
d_model, attention_dim, nhead, feedforward_dim, memory_dim, dropout
|
d_model=d_model,
|
||||||
|
attention_dim=attention_dim,
|
||||||
|
num_heads=num_heads,
|
||||||
|
feedforward_dim=feedforward_dim,
|
||||||
|
memory_dim=memory_dim,
|
||||||
|
dropout=dropout,
|
||||||
)
|
)
|
||||||
for _ in range(num_decoder_layers)
|
for _ in range(num_decoder_layers)
|
||||||
]
|
]
|
||||||
@ -195,49 +209,67 @@ class TransformerDecoder(nn.Module):
|
|||||||
|
|
||||||
def forward(
|
def forward(
|
||||||
self,
|
self,
|
||||||
memory: torch.Tensor,
|
x: torch.Tensor,
|
||||||
memory_lens: torch.Tensor,
|
x_lens: torch.Tensor,
|
||||||
ys_in_pad: torch.Tensor,
|
memory: Optional[torch.Tensor] = None,
|
||||||
ys_in_lens: torch.Tensor,
|
memory_lens: Optional[torch.Tensor] = None,
|
||||||
) -> Tuple[torch.Tensor, torch.Tensor]:
|
) -> torch.Tensor:
|
||||||
"""Forward decoder.
|
"""
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
memory: encoded memory, (batch, maxlen_in, feat)
|
x: Input tensor of shape (batch, tgt_len).
|
||||||
memory_lens: (batch,)
|
x_lens: A tensor of shape (batch,) containing the number of tokens in `x`
|
||||||
ys_in_pad: input token ids, (batch, maxlen_out)
|
before padding.
|
||||||
ys_in_lens: (batch, )
|
memory:
|
||||||
|
Memory sequence of shape (batch, src_len, memory_dim).
|
||||||
|
memory_lens:
|
||||||
|
A tensor of shape (batch,) containing the number of frames in
|
||||||
|
`memory` before padding.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
tgt: decoded token score before softmax (batch, maxlen_out, vocab_size)
|
Decoded token logits before softmax (batch, tgt_len, vocab_size)
|
||||||
"""
|
"""
|
||||||
tgt = ys_in_pad
|
x = self.embed(x) # (batch, tgt_len, embed_dim)
|
||||||
# tgt_mask: (B, 1, L)
|
x = self.pos(x) # (batch, tgt_len, embed_dim)
|
||||||
tgt_mask = make_pad_mask(ys_in_lens)[:, None, :].to(tgt.device)
|
|
||||||
# m: (1, L, L)
|
|
||||||
m = subsequent_mask(tgt_mask.size(-1), device=tgt_mask.device).unsqueeze(0)
|
|
||||||
# tgt_mask: (B, L, L)
|
|
||||||
tgt_mask = tgt_mask | (~m)
|
|
||||||
|
|
||||||
memory_mask = make_pad_mask(memory_lens)[:, None, :].to(memory.device)
|
x = x.permute(1, 0, 2) # (tgt_len, batch, embed_dim)
|
||||||
|
|
||||||
tgt = self.embed(tgt)
|
# construct attn_mask for self-attn modules
|
||||||
tgt = self.pos(tgt)
|
padding_mask = make_pad_mask(x_lens) # (batch, tgt_len)
|
||||||
|
causal_mask = subsequent_mask(x.shape[0], device=x.device) # (seq_len, seq_len)
|
||||||
|
attn_mask = torch.logical_or(
|
||||||
|
padding_mask.unsqueeze(1), # (batch, 1, seq_len)
|
||||||
|
torch.logical_not(causal_mask).unsqueeze(0) # (1, seq_len, seq_len)
|
||||||
|
) # (batch, seq_len, seq_len)
|
||||||
|
|
||||||
|
if memory is not None:
|
||||||
|
memory = memory.permute(1, 0, 2) # (src_len, batch, memory_dim)
|
||||||
|
# construct memory_attn_mask for cross-attn modules
|
||||||
|
memory_padding_mask = make_pad_mask(memory_lens) # (batch, src_len)
|
||||||
|
memory_attn_mask = memory_padding_mask.unsqueeze(1) # (batch, 1, src_len)
|
||||||
|
else:
|
||||||
|
memory_attn_mask = None
|
||||||
|
|
||||||
for i, mod in enumerate(self.layers):
|
for i, mod in enumerate(self.layers):
|
||||||
tgt = mod(tgt, tgt_mask, memory, memory_mask)
|
x = mod(
|
||||||
|
x,
|
||||||
|
attn_mask=attn_mask,
|
||||||
|
memory=memory,
|
||||||
|
memory_attn_mask=memory_attn_mask,
|
||||||
|
)
|
||||||
|
|
||||||
tgt = self.output_layer(tgt)
|
x = x.permute(1, 0, 2) # (batch, tgt_len, vocab_size)
|
||||||
return tgt
|
x = self.output_layer(x)
|
||||||
|
|
||||||
|
return x
|
||||||
|
|
||||||
|
|
||||||
class DecoderLayer(nn.Module):
|
class DecoderLayer(nn.Module):
|
||||||
"""Single decoder layer module.
|
"""Single decoder layer module.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
d_model: equal to encoder_dim
|
d_model: equal to decoder_dim, total dimension of the decoder
|
||||||
attention_dim: total dimension of multi head attention
|
attention_dim: total dimension of multi head attention
|
||||||
n_head: number of attention heads
|
num_heads: number of attention heads
|
||||||
feedforward_dim: hidden dimension of feed_forward module
|
feedforward_dim: hidden dimension of feed_forward module
|
||||||
dropout: dropout rate
|
dropout: dropout rate
|
||||||
"""
|
"""
|
||||||
@ -246,7 +278,7 @@ class DecoderLayer(nn.Module):
|
|||||||
self,
|
self,
|
||||||
d_model: int = 512,
|
d_model: int = 512,
|
||||||
attention_dim: int = 512,
|
attention_dim: int = 512,
|
||||||
nhead: int = 8,
|
num_heads: int = 8,
|
||||||
feedforward_dim: int = 2048,
|
feedforward_dim: int = 2048,
|
||||||
memory_dim: int = 512,
|
memory_dim: int = 512,
|
||||||
dropout: float = 0.1,
|
dropout: float = 0.1,
|
||||||
@ -255,10 +287,14 @@ class DecoderLayer(nn.Module):
|
|||||||
super(DecoderLayer, self).__init__()
|
super(DecoderLayer, self).__init__()
|
||||||
|
|
||||||
self.norm_self_attn = nn.LayerNorm(d_model)
|
self.norm_self_attn = nn.LayerNorm(d_model)
|
||||||
self.self_attn = MultiHeadedAttention(d_model, attention_dim, nhead, dropout=0.0)
|
self.self_attn = MultiHeadAttention(
|
||||||
|
d_model, attention_dim, num_heads, dropout=0.0
|
||||||
|
)
|
||||||
|
|
||||||
self.norm_src_attn = nn.LayerNorm(d_model)
|
self.norm_src_attn = nn.LayerNorm(d_model)
|
||||||
self.src_attn = MultiHeadedAttention(d_model, attention_dim, nhead, memory_dim=memory_dim, dropout=0.0)
|
self.src_attn = MultiHeadAttention(
|
||||||
|
d_model, attention_dim, num_heads, memory_dim=memory_dim, dropout=0.0
|
||||||
|
)
|
||||||
|
|
||||||
self.norm_ff = nn.LayerNorm(d_model)
|
self.norm_ff = nn.LayerNorm(d_model)
|
||||||
self.feed_forward = nn.Sequential(
|
self.feed_forward = nn.Sequential(
|
||||||
@ -270,40 +306,53 @@ class DecoderLayer(nn.Module):
|
|||||||
|
|
||||||
self.dropout = nn.Dropout(dropout)
|
self.dropout = nn.Dropout(dropout)
|
||||||
|
|
||||||
def forward(self, tgt, tgt_mask, memory, memory_mask):
|
def forward(
|
||||||
"""Compute decoded features.
|
self,
|
||||||
|
x: torch.Tensor,
|
||||||
|
attn_mask: Optional[torch.Tensor] = None,
|
||||||
|
memory: Optional[torch.Tensor] = None,
|
||||||
|
memory_attn_mask: Optional[torch.Tensor] = None,
|
||||||
|
) -> torch.Tensor:
|
||||||
|
"""
|
||||||
Args:
|
Args:
|
||||||
tgt (torch.Tensor): Input tensor (#batch, maxlen_out, size).
|
x: Input sequence of shape (seq_len, batch, embed_dim).
|
||||||
tgt_mask (torch.Tensor): Mask for input tensor (#batch, maxlen_out).
|
attn_mask: A binary mask for self-attention module indicating which
|
||||||
memory (torch.Tensor): Encoded memory, float32 (#batch, maxlen_in, size).
|
elements will be filled with -inf.
|
||||||
memory_mask (torch.Tensor): Encoded memory mask (#batch, maxlen_in).
|
Its shape is (batch, 1, src_len) or (batch, tgt_len, src_len).
|
||||||
|
memory: Memory sequence of shape (seq_len, batch, memory_dim).
|
||||||
Returns:
|
memory_attn_mask: A binary mask for cross-attention module indicating which
|
||||||
torch.Tensor: Output tensor(#batch, maxlen_out, size).
|
elements will be filled with -inf.
|
||||||
|
Its shape is (batch, 1, src_len) or (batch, tgt_len, src_len).
|
||||||
"""
|
"""
|
||||||
# self-attn module
|
# self-attn module
|
||||||
tgt_norm = self.norm_self_attn(tgt)
|
qkv = self.norm_self_attn(x)
|
||||||
tgt = tgt + self.dropout(self.self_attn(tgt_norm, tgt_norm, tgt_norm, tgt_mask))
|
self_attn_out = self.self_attn(
|
||||||
|
query=qkv, key=qkv, value=qkv, attn_mask=attn_mask
|
||||||
|
)
|
||||||
|
x = x + self.dropout(self_attn_out)
|
||||||
|
|
||||||
# cross-attn module
|
# cross-attn module
|
||||||
tgt = tgt + self.dropout(self.src_attn(self.norm_src_attn(tgt), memory, memory, memory_mask))
|
q = self.norm_src_attn(x)
|
||||||
|
src_attn_out = self.src_attn(
|
||||||
|
query=q, key=memory, value=memory, attn_mask=memory_attn_mask
|
||||||
|
)
|
||||||
|
x = x + self.dropout(src_attn_out)
|
||||||
|
|
||||||
# feed-forward module
|
# feed-forward module
|
||||||
tgt = tgt + self.dropout(self.feed_forward(self.norm_ff(tgt)))
|
x = x + self.dropout(self.feed_forward(self.norm_ff(x)))
|
||||||
|
|
||||||
return tgt
|
return x
|
||||||
|
|
||||||
|
|
||||||
class MultiHeadedAttention(nn.Module):
|
class MultiHeadAttention(nn.Module):
|
||||||
"""Multi-Head Attention layer.
|
"""Multi-Head Attention layer.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
embed_dim: total dimension of the model.
|
embed_dim: total dimension of the model.
|
||||||
attention_dim: dimension in the attention module, may be less or more than embed_dim
|
attention_dim: dimension in the attention module, but must be a multiple of num_heads.
|
||||||
but must be a multiple of num_heads.
|
num_heads: number of parallel attention heads.
|
||||||
num_heads: parallel attention heads.
|
memory_dim: dimension of memory embedding, optional.
|
||||||
dropout: a Dropout layer on attn_output_weights. Default: 0.0.
|
dropout: a Dropout layer on attn_output_weights.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
@ -312,20 +361,18 @@ class MultiHeadedAttention(nn.Module):
|
|||||||
attention_dim: int,
|
attention_dim: int,
|
||||||
num_heads: int,
|
num_heads: int,
|
||||||
memory_dim: Optional[int] = None,
|
memory_dim: Optional[int] = None,
|
||||||
dropout: float = 0.0
|
dropout: float = 0.0,
|
||||||
):
|
):
|
||||||
"""Construct an MultiHeadedAttention object."""
|
super(MultiHeadAttention, self).__init__()
|
||||||
super(MultiHeadedAttention, self).__init__()
|
|
||||||
self.embed_dim = embed_dim
|
self.embed_dim = embed_dim
|
||||||
self.attention_dim = attention_dim
|
self.attention_dim = attention_dim
|
||||||
self.num_heads = num_heads
|
self.num_heads = num_heads
|
||||||
self.dropout = dropout
|
|
||||||
self.head_dim = attention_dim // num_heads
|
self.head_dim = attention_dim // num_heads
|
||||||
assert self.head_dim * num_heads == attention_dim, (
|
assert self.head_dim * num_heads == attention_dim, (
|
||||||
self.head_dim,
|
self.head_dim, num_heads, attention_dim
|
||||||
num_heads,
|
|
||||||
attention_dim,
|
|
||||||
)
|
)
|
||||||
|
self.dropout = dropout
|
||||||
|
self.name = None # will be overwritten in training code; for diagnostics.
|
||||||
|
|
||||||
self.linear_q = nn.Linear(embed_dim, attention_dim, bias=True)
|
self.linear_q = nn.Linear(embed_dim, attention_dim, bias=True)
|
||||||
self.linear_k = nn.Linear(
|
self.linear_k = nn.Linear(
|
||||||
@ -334,74 +381,89 @@ class MultiHeadedAttention(nn.Module):
|
|||||||
self.linear_v = nn.Linear(
|
self.linear_v = nn.Linear(
|
||||||
embed_dim if memory_dim is None else memory_dim, attention_dim, bias=True
|
embed_dim if memory_dim is None else memory_dim, attention_dim, bias=True
|
||||||
)
|
)
|
||||||
self.scale = math.sqrt(self.head_dim)
|
|
||||||
|
|
||||||
self.out_proj = nn.Linear(attention_dim, embed_dim, bias=True)
|
self.out_proj = nn.Linear(attention_dim, embed_dim, bias=True)
|
||||||
|
|
||||||
def forward(self, query, key, value, mask):
|
def forward(
|
||||||
"""Compute scaled dot product attention.
|
self,
|
||||||
|
query: torch.Tensor,
|
||||||
|
key: torch.Tensor,
|
||||||
|
value: torch.Tensor,
|
||||||
|
key_padding_mask: Optional[torch.Tensor] = None,
|
||||||
|
attn_mask: Optional[torch.Tensor] = None,
|
||||||
|
) -> torch.Tensor:
|
||||||
|
"""Compute dot product attention.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
query (torch.Tensor): Query tensor (#batch, time1, size).
|
query: Query tensor of shape (tgt_len, batch, embed_dim).
|
||||||
key (torch.Tensor): Key tensor (#batch, time2, size).
|
key: Key tensor of shape (src_len, batch, embed_dim or memory_dim).
|
||||||
value (torch.Tensor): Value tensor (#batch, time2, size).
|
value: Value tensor of shape (src_len, batch, embed_dim or memory_dim).
|
||||||
mask (torch.Tensor): Mask tensor (#batch, 1, time2) or
|
key_padding_mask: A binary mask indicating which elements are padding.
|
||||||
(#batch, time1, time2).
|
Its shape is (batch, src_len).
|
||||||
|
attn_mask: A binary mask indicating which elements will be filled with -inf.
|
||||||
|
Its shape is (batch, 1, src_len) or (batch, tgt_len, src_len).
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
torch.Tensor: Output tensor (#batch, time1, d_model).
|
Output tensor of shape (tgt_len, batch, embed_dim).
|
||||||
|
|
||||||
"""
|
"""
|
||||||
bsz, tgt_len, _ = query.size()
|
|
||||||
src_len = key.size(1)
|
|
||||||
num_heads = self.num_heads
|
num_heads = self.num_heads
|
||||||
head_dim = self.head_dim
|
head_dim = self.head_dim
|
||||||
|
|
||||||
q = self.linear_q(query)
|
tgt_len, batch, _ = query.shape
|
||||||
k = self.linear_k(key)
|
src_len = key.shape[0]
|
||||||
v = self.linear_v(value)
|
|
||||||
|
|
||||||
q = q.reshape(bsz, tgt_len, num_heads, head_dim)
|
q = self.linear_q(query) # (tgt_len, batch, num_heads * head_dim)
|
||||||
q = q.transpose(1, 2) # (batch, head, time1, head_dim)
|
k = self.linear_k(key) # (src_len, batch, num_heads * head_dim)
|
||||||
k = k.reshape(bsz, src_len, num_heads, head_dim)
|
v = self.linear_v(value) # (src_len, batch, num_heads * head_dim)
|
||||||
k = k.permute(0, 2, 3, 1) # (batch, head, head_dim, time2)
|
|
||||||
v = v.reshape(bsz, src_len, num_heads, head_dim)
|
|
||||||
v = v.transpose(1, 2).reshape(bsz * num_heads, src_len, head_dim)
|
|
||||||
|
|
||||||
# (batch, head, time1, time2)
|
q = q.reshape(tgt_len, batch, num_heads, head_dim)
|
||||||
attn_output_weights = torch.matmul(q, k) / self.scale
|
q = q.permute(1, 2, 0, 3) # (batch, head, tgt_len, head_dim)
|
||||||
|
k = k.reshape(src_len, batch, num_heads, head_dim)
|
||||||
|
k = k.permute(1, 2, 3, 0) # (batch, head, head_dim, src_len)
|
||||||
|
v = v.reshape(src_len, batch, num_heads, head_dim)
|
||||||
|
v = v.reshape(src_len, batch * num_heads, head_dim).transpose(0, 1)
|
||||||
|
|
||||||
# attn_output_weights = torch.matmul(q, k)
|
# Note: could remove the scaling operation when using ScaledAdam
|
||||||
# # This is a harder way of limiting the attention scores to not be too large.
|
# (batch, head, tgt_len, src_len)
|
||||||
# # It incurs a penalty if any of them has an absolute value greater than 50.0.
|
attn_weights = torch.matmul(q, k) / math.sqrt(head_dim)
|
||||||
# # this should be outside the normal range of the attention scores. We use
|
|
||||||
# # this mechanism instead of, say, a limit on entropy, because once the entropy
|
|
||||||
# # gets very small gradients through the softmax can become very small, and
|
|
||||||
# # some mechanisms like that become ineffective.
|
|
||||||
attn_output_weights = penalize_abs_values_gt(
|
|
||||||
attn_output_weights, limit=50.0, penalty=1.0e-04
|
|
||||||
)
|
|
||||||
|
|
||||||
if mask is not None:
|
# From zipformer.py:
|
||||||
attn_output_weights = attn_output_weights.masked_fill(
|
# This is a harder way of limiting the attention scores to not be too large.
|
||||||
mask.unsqueeze(1), float("-inf")
|
# It incurs a penalty if any of them has an absolute value greater than 50.0.
|
||||||
|
# this should be outside the normal range of the attention scores. We use
|
||||||
|
# this mechanism instead of, say, a limit on entropy, because once the entropy
|
||||||
|
# gets very small gradients through the softmax can become very small, and
|
||||||
|
# some mechanisms like that become ineffective.
|
||||||
|
attn_weights = penalize_abs_values_gt(attn_weights, limit=50.0, penalty=1.0e-04)
|
||||||
|
|
||||||
|
if key_padding_mask is not None:
|
||||||
|
assert key_padding_mask.shape == (batch, src_len), key_padding_mask.shape
|
||||||
|
attn_weights = attn_weights.masked_fill(
|
||||||
|
key_padding_mask.unsqueeze(1).unsqueeze(2), float("-inf"),
|
||||||
)
|
)
|
||||||
|
|
||||||
attn_output_weights = attn_output_weights.view(bsz * num_heads, tgt_len, src_len)
|
if attn_mask is not None:
|
||||||
|
assert (
|
||||||
|
attn_mask.shape == (batch, 1, src_len)
|
||||||
|
or attn_mask.shape == (batch, tgt_len, src_len)
|
||||||
|
), attn_mask.shape
|
||||||
|
attn_weights = attn_weights.masked_fill(attn_mask.unsqueeze(1), float("-inf"))
|
||||||
|
|
||||||
attn_output_weights = nn.functional.softmax(attn_output_weights, dim=-1)
|
attn_weights = attn_weights.view(batch * num_heads, tgt_len, src_len)
|
||||||
attn_output_weights = nn.functional.dropout(
|
attn_weights = nn.functional.softmax(attn_weights, dim=-1)
|
||||||
attn_output_weights, p=self.dropout, training=self.training
|
|
||||||
|
attn_weights = nn.functional.dropout(
|
||||||
|
attn_weights, p=self.dropout, training=self.training
|
||||||
)
|
)
|
||||||
|
|
||||||
# (bsz * head, time1, head_dim_v)
|
# (batch * head, tgt_len, head_dim)
|
||||||
attn_output = torch.bmm(attn_output_weights, v)
|
attn_output = torch.bmm(attn_weights, v)
|
||||||
assert attn_output.shape == (bsz * num_heads, tgt_len, head_dim)
|
assert attn_output.shape == (batch * num_heads, tgt_len, head_dim), attn_output.shape
|
||||||
attn_output = (
|
|
||||||
attn_output.reshape(bsz, num_heads, tgt_len, head_dim)
|
attn_output = attn_output.transpose(0, 1).contiguous()
|
||||||
.transpose(1, 2)
|
attn_output = attn_output.view(tgt_len, batch, num_heads * head_dim)
|
||||||
.reshape(bsz, tgt_len, self.attention_dim)
|
|
||||||
)
|
# (batch, tgt_len, embed_dim)
|
||||||
attn_output = self.out_proj(attn_output)
|
attn_output = self.out_proj(attn_output)
|
||||||
|
|
||||||
return attn_output
|
return attn_output
|
||||||
@ -488,7 +550,7 @@ def _test_attention_decoder_model():
|
|||||||
decoder_dim=512,
|
decoder_dim=512,
|
||||||
num_decoder_layers=6,
|
num_decoder_layers=6,
|
||||||
attention_dim=512,
|
attention_dim=512,
|
||||||
nhead=8,
|
num_heads=8,
|
||||||
feedforward_dim=2048,
|
feedforward_dim=2048,
|
||||||
memory_dim=384,
|
memory_dim=384,
|
||||||
dropout=0.1,
|
dropout=0.1,
|
||||||
|
@ -662,7 +662,7 @@ def get_attention_decoder_model(params: AttributeDict) -> nn.Module:
|
|||||||
decoder_dim=params.attention_decoder_dim,
|
decoder_dim=params.attention_decoder_dim,
|
||||||
num_decoder_layers=params.attention_decoder_num_layers,
|
num_decoder_layers=params.attention_decoder_num_layers,
|
||||||
attention_dim=params.attention_decoder_attention_dim,
|
attention_dim=params.attention_decoder_attention_dim,
|
||||||
nhead=params.attention_decoder_num_heads,
|
num_heads=params.attention_decoder_num_heads,
|
||||||
feedforward_dim=params.attention_decoder_feedforward_dim,
|
feedforward_dim=params.attention_decoder_feedforward_dim,
|
||||||
memory_dim=max(_to_int_tuple(params.encoder_dim)),
|
memory_dim=max(_to_int_tuple(params.encoder_dim)),
|
||||||
sos_id=params.sos_id,
|
sos_id=params.sos_id,
|
||||||
|
Loading…
x
Reference in New Issue
Block a user