mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-09-08 16:44:20 +00:00
Revers pruned_transducer_stateless4 to upstream/master
This commit is contained in:
parent
741dcd1d6d
commit
c7cf229f56
File diff suppressed because it is too large
Load Diff
1
egs/librispeech/ASR/pruned_transducer_stateless4/conformer.py
Symbolic link
1
egs/librispeech/ASR/pruned_transducer_stateless4/conformer.py
Symbolic link
@ -0,0 +1 @@
|
||||
../pruned_transducer_stateless2/conformer.py
|
@ -1,102 +0,0 @@
|
||||
# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang)
|
||||
#
|
||||
# See ../../../../LICENSE for clarification regarding multiple authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
|
||||
class Decoder(nn.Module):
|
||||
"""This class modifies the stateless decoder from the following paper:
|
||||
|
||||
RNN-transducer with stateless prediction network
|
||||
https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=9054419
|
||||
|
||||
It removes the recurrent connection from the decoder, i.e., the prediction
|
||||
network. Different from the above paper, it adds an extra Conv1d
|
||||
right after the embedding layer.
|
||||
|
||||
TODO: Implement https://arxiv.org/pdf/2109.07513.pdf
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size: int,
|
||||
decoder_dim: int,
|
||||
blank_id: int,
|
||||
context_size: int,
|
||||
):
|
||||
"""
|
||||
Args:
|
||||
vocab_size:
|
||||
Number of tokens of the modeling unit including blank.
|
||||
decoder_dim:
|
||||
Dimension of the input embedding, and of the decoder output.
|
||||
blank_id:
|
||||
The ID of the blank symbol.
|
||||
context_size:
|
||||
Number of previous words to use to predict the next word.
|
||||
1 means bigram; 2 means trigram. n means (n+1)-gram.
|
||||
"""
|
||||
super().__init__()
|
||||
|
||||
self.embedding = nn.Embedding(
|
||||
num_embeddings=vocab_size,
|
||||
embedding_dim=decoder_dim,
|
||||
padding_idx=blank_id,
|
||||
)
|
||||
self.blank_id = blank_id
|
||||
|
||||
assert context_size >= 1, context_size
|
||||
self.context_size = context_size
|
||||
self.vocab_size = vocab_size
|
||||
if context_size > 1:
|
||||
self.conv = nn.Conv1d(
|
||||
in_channels=decoder_dim,
|
||||
out_channels=decoder_dim,
|
||||
kernel_size=context_size,
|
||||
padding=0,
|
||||
groups=decoder_dim,
|
||||
bias=False,
|
||||
)
|
||||
|
||||
def forward(self, y: torch.Tensor, need_pad: bool = True) -> torch.Tensor:
|
||||
"""
|
||||
Args:
|
||||
y:
|
||||
A 2-D tensor of shape (N, U).
|
||||
need_pad:
|
||||
True to left pad the input. Should be True during training.
|
||||
False to not pad the input. Should be False during inference.
|
||||
Returns:
|
||||
Return a tensor of shape (N, U, decoder_dim).
|
||||
"""
|
||||
y = y.to(torch.int64)
|
||||
embedding_out = self.embedding(y)
|
||||
if self.context_size > 1:
|
||||
embedding_out = embedding_out.permute(0, 2, 1)
|
||||
if need_pad is True:
|
||||
embedding_out = F.pad(
|
||||
embedding_out, pad=(self.context_size - 1, 0)
|
||||
)
|
||||
else:
|
||||
# During inference time, there is no need to do extra padding
|
||||
# as we only need one output
|
||||
assert embedding_out.size(-1) == self.context_size
|
||||
embedding_out = self.conv(embedding_out)
|
||||
embedding_out = embedding_out.permute(0, 2, 1)
|
||||
embedding_out = F.relu(embedding_out)
|
||||
return embedding_out
|
1
egs/librispeech/ASR/pruned_transducer_stateless4/decoder.py
Symbolic link
1
egs/librispeech/ASR/pruned_transducer_stateless4/decoder.py
Symbolic link
@ -0,0 +1 @@
|
||||
../pruned_transducer_stateless2/decoder.py
|
@ -1,66 +0,0 @@
|
||||
# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang)
|
||||
#
|
||||
# See ../../../../LICENSE for clarification regarding multiple authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
|
||||
class Joiner(nn.Module):
|
||||
def __init__(
|
||||
self,
|
||||
encoder_dim: int,
|
||||
decoder_dim: int,
|
||||
joiner_dim: int,
|
||||
vocab_size: int,
|
||||
):
|
||||
super().__init__()
|
||||
|
||||
self.encoder_proj = nn.Linear(encoder_dim, joiner_dim)
|
||||
self.decoder_proj = nn.Linear(decoder_dim, joiner_dim)
|
||||
self.output_linear = nn.Linear(joiner_dim, vocab_size)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
encoder_out: torch.Tensor,
|
||||
decoder_out: torch.Tensor,
|
||||
project_input: bool = True,
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Args:
|
||||
encoder_out:
|
||||
Output from the encoder. Its shape is (N, T, s_range, C).
|
||||
decoder_out:
|
||||
Output from the decoder. Its shape is (N, T, s_range, C).
|
||||
project_input:
|
||||
If true, apply input projections encoder_proj and decoder_proj.
|
||||
If this is false, it is the user's responsibility to do this
|
||||
manually.
|
||||
Returns:
|
||||
Return a tensor of shape (N, T, s_range, C).
|
||||
"""
|
||||
assert encoder_out.ndim == decoder_out.ndim == 4
|
||||
assert encoder_out.shape[:-1] == decoder_out.shape[:-1]
|
||||
|
||||
if project_input:
|
||||
logit = self.encoder_proj(encoder_out) + self.decoder_proj(
|
||||
decoder_out
|
||||
)
|
||||
else:
|
||||
logit = encoder_out + decoder_out
|
||||
|
||||
logit = self.output_linear(torch.tanh(logit))
|
||||
|
||||
return logit
|
1
egs/librispeech/ASR/pruned_transducer_stateless4/joiner.py
Symbolic link
1
egs/librispeech/ASR/pruned_transducer_stateless4/joiner.py
Symbolic link
@ -0,0 +1 @@
|
||||
../pruned_transducer_stateless2/joiner.py
|
@ -1,192 +0,0 @@
|
||||
# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang, Wei Kang)
|
||||
#
|
||||
# See ../../../../LICENSE for clarification regarding multiple authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
import k2
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from encoder_interface import EncoderInterface
|
||||
|
||||
from icefall.utils import add_sos
|
||||
|
||||
|
||||
class Transducer(nn.Module):
|
||||
"""It implements https://arxiv.org/pdf/1211.3711.pdf
|
||||
"Sequence Transduction with Recurrent Neural Networks"
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
encoder: EncoderInterface,
|
||||
decoder: nn.Module,
|
||||
joiner: nn.Module,
|
||||
encoder_dim: int,
|
||||
decoder_dim: int,
|
||||
joiner_dim: int,
|
||||
vocab_size: int,
|
||||
):
|
||||
"""
|
||||
Args:
|
||||
encoder:
|
||||
It is the transcription network in the paper. Its accepts
|
||||
two inputs: `x` of (N, T, encoder_dim) and `x_lens` of shape (N,).
|
||||
It returns two tensors: `logits` of shape (N, T, encoder_dm) and
|
||||
`logit_lens` of shape (N,).
|
||||
decoder:
|
||||
It is the prediction network in the paper. Its input shape
|
||||
is (N, U) and its output shape is (N, U, decoder_dim).
|
||||
It should contain one attribute: `blank_id`.
|
||||
joiner:
|
||||
It has two inputs with shapes: (N, T, encoder_dim) and (N, U, decoder_dim).
|
||||
Its output shape is (N, T, U, vocab_size). Note that its output contains
|
||||
unnormalized probs, i.e., not processed by log-softmax.
|
||||
"""
|
||||
super().__init__()
|
||||
assert isinstance(encoder, EncoderInterface), type(encoder)
|
||||
assert hasattr(decoder, "blank_id")
|
||||
|
||||
self.encoder = encoder
|
||||
self.decoder = decoder
|
||||
self.joiner = joiner
|
||||
|
||||
self.simple_am_proj = nn.Linear(
|
||||
encoder_dim, vocab_size,
|
||||
)
|
||||
self.simple_lm_proj = nn.Linear(decoder_dim, vocab_size)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
x: torch.Tensor,
|
||||
x_lens: torch.Tensor,
|
||||
y: k2.RaggedTensor,
|
||||
prune_range: int = 5,
|
||||
am_scale: float = 0.0,
|
||||
lm_scale: float = 0.0,
|
||||
warmup: float = 1.0,
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Args:
|
||||
x:
|
||||
A 3-D tensor of shape (N, T, C).
|
||||
x_lens:
|
||||
A 1-D tensor of shape (N,). It contains the number of frames in `x`
|
||||
before padding.
|
||||
y:
|
||||
A ragged tensor with 2 axes [utt][label]. It contains labels of each
|
||||
utterance.
|
||||
prune_range:
|
||||
The prune range for rnnt loss, it means how many symbols(context)
|
||||
we are considering for each frame to compute the loss.
|
||||
am_scale:
|
||||
The scale to smooth the loss with am (output of encoder network)
|
||||
part
|
||||
lm_scale:
|
||||
The scale to smooth the loss with lm (output of predictor network)
|
||||
part
|
||||
warmup:
|
||||
A value warmup >= 0 that determines which modules are active, values
|
||||
warmup > 1 "are fully warmed up" and all modules will be active.
|
||||
Returns:
|
||||
Return the transducer loss.
|
||||
|
||||
Note:
|
||||
Regarding am_scale & lm_scale, it will make the loss-function one of
|
||||
the form:
|
||||
lm_scale * lm_probs + am_scale * am_probs +
|
||||
(1-lm_scale-am_scale) * combined_probs
|
||||
"""
|
||||
assert x.ndim == 3, x.shape
|
||||
assert x_lens.ndim == 1, x_lens.shape
|
||||
assert y.num_axes == 2, y.num_axes
|
||||
|
||||
assert x.size(0) == x_lens.size(0) == y.dim0
|
||||
|
||||
encoder_out, x_lens = self.encoder(x, x_lens, warmup=warmup)
|
||||
assert torch.all(x_lens > 0)
|
||||
|
||||
# Now for the decoder, i.e., the prediction network
|
||||
row_splits = y.shape.row_splits(1)
|
||||
y_lens = row_splits[1:] - row_splits[:-1]
|
||||
|
||||
blank_id = self.decoder.blank_id
|
||||
sos_y = add_sos(y, sos_id=blank_id)
|
||||
|
||||
# sos_y_padded: [B, S + 1], start with SOS.
|
||||
sos_y_padded = sos_y.pad(mode="constant", padding_value=blank_id)
|
||||
|
||||
# decoder_out: [B, S + 1, decoder_dim]
|
||||
decoder_out = self.decoder(sos_y_padded)
|
||||
|
||||
# Note: y does not start with SOS
|
||||
# y_padded : [B, S]
|
||||
y_padded = y.pad(mode="constant", padding_value=0)
|
||||
|
||||
y_padded = y_padded.to(torch.int64)
|
||||
boundary = torch.zeros(
|
||||
(x.size(0), 4), dtype=torch.int64, device=x.device
|
||||
)
|
||||
boundary[:, 2] = y_lens
|
||||
boundary[:, 3] = x_lens
|
||||
|
||||
lm = self.simple_lm_proj(decoder_out)
|
||||
am = self.simple_am_proj(encoder_out)
|
||||
|
||||
with torch.cuda.amp.autocast(enabled=False):
|
||||
simple_loss, (px_grad, py_grad) = k2.rnnt_loss_smoothed(
|
||||
lm=lm.float(),
|
||||
am=am.float(),
|
||||
symbols=y_padded,
|
||||
termination_symbol=blank_id,
|
||||
lm_only_scale=lm_scale,
|
||||
am_only_scale=am_scale,
|
||||
boundary=boundary,
|
||||
reduction="sum",
|
||||
return_grad=True,
|
||||
)
|
||||
|
||||
# ranges : [B, T, prune_range]
|
||||
ranges = k2.get_rnnt_prune_ranges(
|
||||
px_grad=px_grad,
|
||||
py_grad=py_grad,
|
||||
boundary=boundary,
|
||||
s_range=prune_range,
|
||||
)
|
||||
|
||||
# am_pruned : [B, T, prune_range, encoder_dim]
|
||||
# lm_pruned : [B, T, prune_range, decoder_dim]
|
||||
am_pruned, lm_pruned = k2.do_rnnt_pruning(
|
||||
am=self.joiner.encoder_proj(encoder_out),
|
||||
lm=self.joiner.decoder_proj(decoder_out),
|
||||
ranges=ranges,
|
||||
)
|
||||
|
||||
# logits : [B, T, prune_range, vocab_size]
|
||||
|
||||
# project_input=False since we applied the decoder's input projections
|
||||
# prior to do_rnnt_pruning (this is an optimization for speed).
|
||||
logits = self.joiner(am_pruned, lm_pruned, project_input=False)
|
||||
|
||||
with torch.cuda.amp.autocast(enabled=False):
|
||||
pruned_loss = k2.rnnt_loss_pruned(
|
||||
logits=logits.float(),
|
||||
symbols=y_padded,
|
||||
ranges=ranges,
|
||||
termination_symbol=blank_id,
|
||||
boundary=boundary,
|
||||
reduction="sum",
|
||||
)
|
||||
|
||||
return (simple_loss, pruned_loss)
|
1
egs/librispeech/ASR/pruned_transducer_stateless4/model.py
Symbolic link
1
egs/librispeech/ASR/pruned_transducer_stateless4/model.py
Symbolic link
@ -0,0 +1 @@
|
||||
../pruned_transducer_stateless2/model.py
|
@ -1,802 +0,0 @@
|
||||
# Copyright 2022 Xiaomi Corp. (authors: Daniel Povey)
|
||||
#
|
||||
# See ../LICENSE for clarification regarding multiple authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
from typing import List, Optional, Union, Tuple, List
|
||||
from lhotse.utils import fix_random_seed
|
||||
import torch
|
||||
from torch import Tensor
|
||||
from torch.optim import Optimizer
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
class Cain(Optimizer):
|
||||
"""Implements Cain algorithm, which is a substantially modified form of the
|
||||
Adam optimizer.
|
||||
The modifications can be summarized as follows:
|
||||
|
||||
(i) change Adam to what we called "Eve", which adds an extra configuration
|
||||
value, "target_rms" (default value: 0.1); and for non-scalar parameters,
|
||||
if their root-mean-square value exceeds "target_rms", we apply weight
|
||||
decay (aggressive enough weight decay that the rms stays at target_rms).
|
||||
This weight decay is applied in the same way as in AdamW, i.e. by
|
||||
parameter shrinkage rather than by modifying the gradients.
|
||||
(ii) By limiting the rms to 0.1, we reduce our modeling power; we restore
|
||||
it by replacing all weights and biases-- in fact, all model parameters--
|
||||
with expressions like
|
||||
weight * weight_scale.exp() or bias * bias_scale.exp().
|
||||
This has the benefit of making the learnable offsets and scales of
|
||||
BatchNorm/LayerNorm unnecessary.
|
||||
Below we describe the changes from Eve to Cain:
|
||||
(iii) We removed all the weight_scale and bias_scale parameters from
|
||||
the model and "moved them into the optimizer". Now we allow
|
||||
parameters to have any rms value (that's <= 10), but we:
|
||||
(a) make the update speed for a parameter proportional to
|
||||
its rms value, using target_rms (0.1) as the reference point
|
||||
where we just use the standard Adam-type formula.
|
||||
(b) simulate the effect of learning the weight_scale or bias_scale
|
||||
in log space using standard Adam. (see scale_exp_avg_sq in the
|
||||
code below).
|
||||
(iv) Periodically (every 2000 batches) perform a "change of basis" on
|
||||
all non-convolutional axes of each parameter tensor, and reset the
|
||||
Adam statistics. This means an orthogonal change of co-ordinates
|
||||
in the space corresponding to each dimension of a tensor, e.g. in
|
||||
a 256 x 512 parameter tensor we'll have a 256x256 orthogonal matrix
|
||||
and a 512x512 one. This does a better job of separating "important"
|
||||
from "unimportant" directions in activation space, and empirically
|
||||
improved convergence, final loss function and WERs.
|
||||
|
||||
The original Adam algorithm was proposed in `Adam: A Method for Stochastic Optimization`_.
|
||||
The AdamW variant was proposed in `Decoupled Weight Decay Regularization`_.
|
||||
Eve is unpublished so far.
|
||||
|
||||
Arguments:
|
||||
params (iterable): iterable of parameters to optimize or dicts defining
|
||||
parameter groups
|
||||
lr (float, optional): learning rate (default: 1e-3)
|
||||
betas (Tuple[float, float], optional): coefficients used for computing
|
||||
running averages of gradient and its square (default: (0.9, 0.999))
|
||||
eps (float, optional): term added to the denominator to improve
|
||||
numerical stability (default: 1e-8)
|
||||
target_rms (float, optional): target root-mean-square value of
|
||||
parameters, if they fall below this we will stop applying weight decay.
|
||||
rms_eps (float, optional): epsilon value such that when parameter
|
||||
rms value goes below this, we stop learning slower for that parameter,
|
||||
i.e. we treat the rms as if it were rms_eps. In practice, rms values
|
||||
will not go much below this.
|
||||
rms_max (float, optional): maximum root-mean-square value for
|
||||
non-scalar parameters, such that we don't let the parameter
|
||||
values exceed this; we'll shrink any parameter matrices that becomes
|
||||
larger than this.
|
||||
|
||||
|
||||
|
||||
|
||||
.. _Adam\: A Method for Stochastic Optimization:
|
||||
https://arxiv.org/abs/1412.6980
|
||||
.. _Decoupled Weight Decay Regularization:
|
||||
https://arxiv.org/abs/1711.05101
|
||||
.. _On the Convergence of Adam and Beyond:
|
||||
https://openreview.net/forum?id=ryQu7f-RZ
|
||||
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
params,
|
||||
lr=1e-3,
|
||||
betas=(0.9, 0.98),
|
||||
eps=1e-8,
|
||||
target_rms=0.1,
|
||||
rms_eps=1.0e-05,
|
||||
rms_max=10.0,
|
||||
):
|
||||
|
||||
if not 0.0 <= lr:
|
||||
raise ValueError("Invalid learning rate: {}".format(lr))
|
||||
if not 0.0 <= eps:
|
||||
raise ValueError("Invalid epsilon value: {}".format(eps))
|
||||
if not 0.0 <= betas[0] < 1.0:
|
||||
raise ValueError(
|
||||
"Invalid beta parameter at index 0: {}".format(betas[0])
|
||||
)
|
||||
if not 0.0 <= betas[1] < 1.0:
|
||||
raise ValueError(
|
||||
"Invalid beta parameter at index 1: {}".format(betas[1])
|
||||
)
|
||||
if not 0 < target_rms <= 10.0:
|
||||
raise ValueError("Invalid target_rms value: {}".format(target_rms))
|
||||
if not 0.1 < rms_max <= 1000.0:
|
||||
raise ValueError("Invalid rms_max value: {}".format(rms_max))
|
||||
if not 0.0 < rms_eps < 0.1:
|
||||
raise ValueError("Invalid rms_eps value: {}".format(rms_eps))
|
||||
|
||||
defaults = dict(
|
||||
lr=lr,
|
||||
betas=betas,
|
||||
eps=eps,
|
||||
target_rms=target_rms,
|
||||
rms_eps=rms_eps,
|
||||
rms_max=rms_max,
|
||||
)
|
||||
super(Cain, self).__init__(params, defaults)
|
||||
|
||||
def __setstate__(self, state):
|
||||
super(Cain, self).__setstate__(state)
|
||||
|
||||
@torch.no_grad()
|
||||
def step(self, closure=None):
|
||||
"""Performs a single optimization step.
|
||||
|
||||
Arguments:
|
||||
closure (callable, optional): A closure that reevaluates the model
|
||||
and returns the loss.
|
||||
"""
|
||||
loss = None
|
||||
if closure is not None:
|
||||
with torch.enable_grad():
|
||||
loss = closure()
|
||||
|
||||
for group in self.param_groups:
|
||||
beta1, beta2 = group["betas"]
|
||||
lr = group["lr"]
|
||||
target_rms = group["target_rms"]
|
||||
rms_eps = group["rms_eps"]
|
||||
eps = group["eps"]
|
||||
rms_max = group["rms_max"]
|
||||
|
||||
for p in group["params"]:
|
||||
if p.grad is None:
|
||||
continue
|
||||
|
||||
# Perform optimization step
|
||||
grad = p.grad
|
||||
if grad.is_sparse:
|
||||
raise RuntimeError(
|
||||
"AdamW does not support sparse gradients"
|
||||
)
|
||||
|
||||
state = self.state[p]
|
||||
|
||||
# State initialization
|
||||
if len(state) == 0:
|
||||
state["step"] = 0
|
||||
# Exponential moving average of gradient values
|
||||
state["delta"] = torch.zeros_like(
|
||||
p, memory_format=torch.preserve_format
|
||||
)
|
||||
# Exponential moving average of squared gradient values
|
||||
state["exp_avg_sq"] = torch.zeros_like(
|
||||
p, memory_format=torch.preserve_format
|
||||
)
|
||||
if p.numel() > 1:
|
||||
# we keep track of the RMS value of the parameters to
|
||||
# help set the learning rate... this emulates Eve.
|
||||
state["param_rms"] = (p**2).mean().sqrt()
|
||||
# we learn the scale on the parameters in a way
|
||||
# that also emulates Eve.
|
||||
state["scale_exp_avg_sq"] = torch.zeros((), device=p.device,
|
||||
dtype=p.dtype)
|
||||
# alpha is a scale related to the natural-gradient update,
|
||||
# see self._get_denom()
|
||||
state["alpha"] = torch.ones((), device=p.device,
|
||||
dtype=p.dtype)
|
||||
|
||||
|
||||
|
||||
delta, exp_avg_sq = state["delta"], state["exp_avg_sq"]
|
||||
step = state["step"]
|
||||
|
||||
delta.mul_(beta1)
|
||||
|
||||
if p.numel() > 1:
|
||||
# This part learns the scale on the parameters.
|
||||
# scale_deriv is the derivative w.r.t. a log-space scale on the parameters, i.e.
|
||||
# if we imagine: p = underlying_param * scale.exp(), this is the derivative
|
||||
# w.r.t. scale (it does not actually matter what value `scale` currently has).
|
||||
scale_deriv = (p * grad).sum()
|
||||
scale_exp_avg_sq = state["scale_exp_avg_sq"]
|
||||
scale_exp_avg_sq.mul_(beta2).addcmul_(scale_deriv, scale_deriv,
|
||||
value=1 - beta2)
|
||||
|
||||
# should actually be step + 1, so on 1st minibatch we are not learning
|
||||
# anything here. May fix this at some point.
|
||||
scale_bias_correction2 = 1 - beta2 ** (step + 1)
|
||||
|
||||
scale_denom = (scale_exp_avg_sq.sqrt()).add_(eps)
|
||||
|
||||
scale_alpha = -lr * (scale_bias_correction2 ** 0.5) * (1-beta1)
|
||||
|
||||
# scale_delta is going to be the change in log-scale (before momentum), i.e. if
|
||||
# p = underlying_param * scale.exp(),
|
||||
# delta is the change in `scale`.
|
||||
scale_delta = scale_alpha * (scale_deriv / scale_denom)
|
||||
|
||||
# the following is equivalent to:
|
||||
# if torch.all(state["param_rms"] > rms_max):
|
||||
# delta = -1.0e-03
|
||||
# which means: if the parameter root-mean-square value goes above the
|
||||
# specified limit, start to decay the parameters (and ignore the computed
|
||||
# delta).
|
||||
scale_delta = torch.where(state["param_rms"] > rms_max,
|
||||
torch.tensor(-1.0e-03 * (1-beta1),
|
||||
device=scale_delta.device,
|
||||
dtype=scale_delta.dtype),
|
||||
scale_delta)
|
||||
delta.add_(p, alpha=scale_delta)
|
||||
|
||||
|
||||
# forget bias_correction1. We use normal momentum. delta really
|
||||
# just stores the moving-average gradient step.
|
||||
bias_correction2 = 1 - beta2 ** self._step_for_bias_correction(step)
|
||||
|
||||
grad = self._change_coordinates(grad, state, forward=True)
|
||||
|
||||
# Decay the second moment running average coefficient
|
||||
exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
|
||||
|
||||
denom = (exp_avg_sq.sqrt()).add_(eps)
|
||||
|
||||
this_delta = grad / denom
|
||||
|
||||
this_delta = self._change_coordinates(this_delta, state, forward=False)
|
||||
|
||||
alpha = -lr*(1-beta1)*(bias_correction2 ** 0.5)
|
||||
if p.numel() > 1:
|
||||
if step % 10 == 0:
|
||||
state["param_rms"].fill_((p**2).mean().sqrt())
|
||||
# imagine param was normalized to rms=target_rms and we stored the
|
||||
# scale as a separate scalar. This is how fast we'd be learning.
|
||||
alpha = (alpha / target_rms) * state["param_rms"].clamp(min=rms_eps)
|
||||
|
||||
delta.add_(this_delta, alpha=alpha)
|
||||
|
||||
p.add_(delta)
|
||||
state["step"] += 1
|
||||
|
||||
return loss
|
||||
|
||||
|
||||
def _change_coordinates(self,
|
||||
grad: Tensor,
|
||||
state: dict,
|
||||
forward: bool):
|
||||
"""
|
||||
Possibly performs some magnitude-preserving changes of co-ordinates on `grad`
|
||||
"""
|
||||
ndim = grad.ndim
|
||||
numel = grad.numel()
|
||||
step = state["step"]
|
||||
for dim in range(grad.ndim):
|
||||
# see for each dim in turn whether we want to perform any changes in co-ordinates,
|
||||
# or store any stats.
|
||||
size = grad.shape[dim]
|
||||
if size <= 3 or (size % 2 == 1 and size < 128) or size >= 2048 or size == numel:
|
||||
# we don't do any such co-ordinate changes in dims with sizes
|
||||
# that are too small (no point) or large (too slow), or that are
|
||||
# assumed convolutional (because they are odd and not too huge).
|
||||
# We can revisit this later.
|
||||
continue
|
||||
grad = self._change_coordinates_for_dim(grad, state, dim, forward)
|
||||
return grad
|
||||
|
||||
|
||||
def _change_coordinates_for_dim(self,
|
||||
grad: Tensor,
|
||||
state: dict,
|
||||
dim: int,
|
||||
forward: bool,
|
||||
orig_dim: int = -1):
|
||||
assert grad.ndim > 1
|
||||
if not (grad.ndim == 2 and dim == 1):
|
||||
# normalize the format and recurse.
|
||||
dims_order = []
|
||||
rev_dims_order = []
|
||||
ndim = grad.ndim
|
||||
for i in range(dim):
|
||||
dims_order.append(i)
|
||||
rev_dims_order.append(i)
|
||||
rev_dims_order.append(ndim-1)
|
||||
for i in range(dim+1, ndim):
|
||||
dims_order.append(i)
|
||||
rev_dims_order.append(i-1)
|
||||
dims_order.append(dim)
|
||||
# e.g. ndim=4, dim=1, dims_order=(0,2,3,1), rev_dims_order=(0,3,1,2)
|
||||
new_grad = grad.permute(*dims_order)
|
||||
assert new_grad.shape[-1] == grad.shape[dim]
|
||||
new_shape = new_grad.shape
|
||||
new_grad = new_grad.reshape(-1, new_grad.shape[-1])
|
||||
new_grad = self._change_coordinates_for_dim(new_grad, state, 1, forward,
|
||||
orig_dim=dim)
|
||||
return new_grad.reshape(new_shape).permute(*rev_dims_order)
|
||||
|
||||
# OK: grad.ndim == 2 and dim == 1
|
||||
size = grad.shape[1]
|
||||
if orig_dim == -1:
|
||||
orig_dim = dim
|
||||
step = state["step"]
|
||||
must_store_stats, must_zero_stats = self._must_store_stats(step)
|
||||
if must_store_stats and forward:
|
||||
# store stats for 200 iters preceding when we estimate the
|
||||
# transform.
|
||||
stats_name = f"cov_{orig_dim}"
|
||||
if not stats_name in state:
|
||||
state[stats_name] = torch.zeros(size, size, dtype=grad.dtype,
|
||||
device=grad.device)
|
||||
cov = state[stats_name]
|
||||
if must_zero_stats:
|
||||
cov.zero_()
|
||||
cov += torch.matmul(grad.t(), grad) * (1/grad.shape[0])
|
||||
|
||||
transform_name = f"t_{orig_dim}"
|
||||
if transform_name not in state:
|
||||
# make sure they're all allocated at the start, may be better for
|
||||
# fragmention and debugging reasons (although may not).
|
||||
state[transform_name] = torch.eye(size, device=grad.device,
|
||||
dtype=grad.dtype)
|
||||
|
||||
must_estimate_transform = self._must_estimate_transform(step)
|
||||
if must_estimate_transform and forward:
|
||||
stats_name = f"cov_{orig_dim}"
|
||||
cov = state[stats_name]
|
||||
l, U = cov.symeig(eigenvectors=True)
|
||||
#print("estimate, l = ", l[::20])
|
||||
t = U.t() # t is indexed (new_dim, old_dim) a.k.a. (transformed_dim, real_dim)
|
||||
state[transform_name][:] = t
|
||||
state["exp_avg_sq"].zero_() # zero exp-avg-sq stats, we'll restart these from scratch.
|
||||
|
||||
t = state[transform_name]
|
||||
if forward:
|
||||
return torch.matmul(grad, t.t())
|
||||
else:
|
||||
return torch.matmul(grad, t)
|
||||
|
||||
def _must_store_stats(self, step: int) -> Tuple[bool, bool]:
|
||||
"""
|
||||
Returns (must_store_stats, must_zero_stats), where
|
||||
must_store_stats: bool, is True for the 100 or 200 steps preceding
|
||||
a step on which we will estimate the transform.
|
||||
must_zero_stats: bool, is True for only the 1st of the 100 or 200
|
||||
steps mentioned above.
|
||||
"""
|
||||
if step < 4000:
|
||||
if step < 2000:
|
||||
return (step % 500 >= 400, step % 500 == 400)
|
||||
else:
|
||||
return (step % 1000 >= 800, step % 1000 == 800)
|
||||
else:
|
||||
return (step % 2000 >= 1800, step % 2000 == 1800)
|
||||
|
||||
def _must_estimate_transform(self, step: int) -> bool:
|
||||
"""
|
||||
Returns True if we will re-estimate the transform on this step
|
||||
"""
|
||||
if step < 4000:
|
||||
if step < 2000:
|
||||
return step % 500 == 0 and step > 0
|
||||
else:
|
||||
return step % 1000 == 0 and step > 0
|
||||
else:
|
||||
return step % 2000 == 0
|
||||
|
||||
def _step_for_bias_correction(self, step: int) -> bool:
|
||||
"""
|
||||
Return a modified step for bias-correction (actually just to give us the
|
||||
right denominator for the exp-avg-sq stats).. this needs to take into
|
||||
account how long it has been since we re-estimated the transforms, because
|
||||
we zero the exp-avg-sq stats at those times.
|
||||
|
||||
The + 1 is for the "current step". We post-increment the step so it
|
||||
starts from 0.
|
||||
"""
|
||||
if step < 4000:
|
||||
if step < 2000:
|
||||
return step % 500 + 1
|
||||
else:
|
||||
return step % 1000 + 1
|
||||
else:
|
||||
return step % 2000 + 1
|
||||
|
||||
|
||||
|
||||
class LRScheduler(object):
|
||||
"""
|
||||
Base-class for learning rate schedulers where the learning-rate depends on both the
|
||||
batch and the epoch.
|
||||
"""
|
||||
|
||||
def __init__(self, optimizer: Optimizer, verbose: bool = False):
|
||||
# Attach optimizer
|
||||
if not isinstance(optimizer, Optimizer):
|
||||
raise TypeError(
|
||||
"{} is not an Optimizer".format(type(optimizer).__name__)
|
||||
)
|
||||
self.optimizer = optimizer
|
||||
self.verbose = verbose
|
||||
|
||||
for group in optimizer.param_groups:
|
||||
group.setdefault("initial_lr", group["lr"])
|
||||
|
||||
self.base_lrs = [
|
||||
group["initial_lr"] for group in optimizer.param_groups
|
||||
]
|
||||
|
||||
self.epoch = 0
|
||||
self.batch = 0
|
||||
|
||||
def state_dict(self):
|
||||
"""Returns the state of the scheduler as a :class:`dict`.
|
||||
|
||||
It contains an entry for every variable in self.__dict__ which
|
||||
is not the optimizer.
|
||||
"""
|
||||
return {
|
||||
"base_lrs": self.base_lrs,
|
||||
"epoch": self.epoch,
|
||||
"batch": self.batch,
|
||||
}
|
||||
|
||||
def load_state_dict(self, state_dict):
|
||||
"""Loads the schedulers state.
|
||||
|
||||
Args:
|
||||
state_dict (dict): scheduler state. Should be an object returned
|
||||
from a call to :meth:`state_dict`.
|
||||
"""
|
||||
self.__dict__.update(state_dict)
|
||||
|
||||
def get_last_lr(self) -> List[float]:
|
||||
"""Return last computed learning rate by current scheduler. Will be a list of float."""
|
||||
return self._last_lr
|
||||
|
||||
def get_lr(self):
|
||||
# Compute list of learning rates from self.epoch and self.batch and
|
||||
# self.base_lrs; this must be overloaded by the user.
|
||||
# e.g. return [some_formula(self.batch, self.epoch, base_lr) for base_lr in self.base_lrs ]
|
||||
raise NotImplementedError
|
||||
|
||||
def step_batch(self, batch: Optional[int] = None) -> None:
|
||||
# Step the batch index, or just set it. If `batch` is specified, it
|
||||
# must be the batch index from the start of training, i.e. summed over
|
||||
# all epochs.
|
||||
# You can call this in any order; if you don't provide 'batch', it should
|
||||
# of course be called once per batch.
|
||||
if batch is not None:
|
||||
self.batch = batch
|
||||
else:
|
||||
self.batch = self.batch + 1
|
||||
self._set_lrs()
|
||||
|
||||
def step_epoch(self, epoch: Optional[int] = None):
|
||||
# Step the epoch index, or just set it. If you provide the 'epoch' arg,
|
||||
# you should call this at the start of the epoch; if you don't provide the 'epoch'
|
||||
# arg, you should call it at the end of the epoch.
|
||||
if epoch is not None:
|
||||
self.epoch = epoch
|
||||
else:
|
||||
self.epoch = self.epoch + 1
|
||||
self._set_lrs()
|
||||
|
||||
def _set_lrs(self):
|
||||
values = self.get_lr()
|
||||
assert len(values) == len(self.optimizer.param_groups)
|
||||
|
||||
for i, data in enumerate(zip(self.optimizer.param_groups, values)):
|
||||
param_group, lr = data
|
||||
param_group["lr"] = lr
|
||||
self.print_lr(self.verbose, i, lr)
|
||||
self._last_lr = [group["lr"] for group in self.optimizer.param_groups]
|
||||
|
||||
def print_lr(self, is_verbose, group, lr):
|
||||
"""Display the current learning rate."""
|
||||
if is_verbose:
|
||||
print(
|
||||
f"Epoch={self.epoch}, batch={self.batch}: adjusting learning rate"
|
||||
f" of group {group} to {lr:.4e}."
|
||||
)
|
||||
|
||||
class Eve(Optimizer):
|
||||
"""
|
||||
Implements Eve algorithm. This is a modified version of AdamW with a special
|
||||
way of setting the weight-decay / shrinkage-factor, which is designed to make the
|
||||
rms of the parameters approach a particular target_rms (default: 0.1). This is
|
||||
for use with networks with 'scaled' versions of modules (see scaling.py), which
|
||||
will be close to invariant to the absolute scale on the parameter matrix.
|
||||
|
||||
The original Adam algorithm was proposed in `Adam: A Method for Stochastic Optimization`_.
|
||||
The AdamW variant was proposed in `Decoupled Weight Decay Regularization`_.
|
||||
Eve is unpublished so far.
|
||||
|
||||
Arguments:
|
||||
params (iterable): iterable of parameters to optimize or dicts defining
|
||||
parameter groups
|
||||
lr (float, optional): learning rate (default: 1e-3)
|
||||
betas (Tuple[float, float], optional): coefficients used for computing
|
||||
running averages of gradient and its square (default: (0.9, 0.999))
|
||||
eps (float, optional): term added to the denominator to improve
|
||||
numerical stability (default: 1e-8)
|
||||
weight_decay (float, optional): weight decay coefficient (default: 3e-4;
|
||||
this value means that the weight would decay significantly after
|
||||
about 3k minibatches. Is not multiplied by learning rate, but
|
||||
is conditional on RMS-value of parameter being > target_rms.
|
||||
target_rms (float, optional): target root-mean-square value of
|
||||
parameters, if they fall below this we will stop applying weight decay.
|
||||
|
||||
|
||||
.. _Adam\: A Method for Stochastic Optimization:
|
||||
https://arxiv.org/abs/1412.6980
|
||||
.. _Decoupled Weight Decay Regularization:
|
||||
https://arxiv.org/abs/1711.05101
|
||||
.. _On the Convergence of Adam and Beyond:
|
||||
https://openreview.net/forum?id=ryQu7f-RZ
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
params,
|
||||
lr=1e-3,
|
||||
betas=(0.9, 0.98),
|
||||
eps=1e-8,
|
||||
weight_decay=1e-3,
|
||||
target_rms=0.1,
|
||||
):
|
||||
|
||||
if not 0.0 <= lr:
|
||||
raise ValueError("Invalid learning rate: {}".format(lr))
|
||||
if not 0.0 <= eps:
|
||||
raise ValueError("Invalid epsilon value: {}".format(eps))
|
||||
if not 0.0 <= betas[0] < 1.0:
|
||||
raise ValueError(
|
||||
"Invalid beta parameter at index 0: {}".format(betas[0])
|
||||
)
|
||||
if not 0.0 <= betas[1] < 1.0:
|
||||
raise ValueError(
|
||||
"Invalid beta parameter at index 1: {}".format(betas[1])
|
||||
)
|
||||
if not 0 <= weight_decay <= 0.1:
|
||||
raise ValueError(
|
||||
"Invalid weight_decay value: {}".format(weight_decay)
|
||||
)
|
||||
if not 0 < target_rms <= 10.0:
|
||||
raise ValueError("Invalid target_rms value: {}".format(target_rms))
|
||||
defaults = dict(
|
||||
lr=lr,
|
||||
betas=betas,
|
||||
eps=eps,
|
||||
weight_decay=weight_decay,
|
||||
target_rms=target_rms,
|
||||
)
|
||||
super(Eve, self).__init__(params, defaults)
|
||||
|
||||
def __setstate__(self, state):
|
||||
super(Eve, self).__setstate__(state)
|
||||
|
||||
@torch.no_grad()
|
||||
def step(self, closure=None):
|
||||
"""Performs a single optimization step.
|
||||
|
||||
Arguments:
|
||||
closure (callable, optional): A closure that reevaluates the model
|
||||
and returns the loss.
|
||||
"""
|
||||
loss = None
|
||||
if closure is not None:
|
||||
with torch.enable_grad():
|
||||
loss = closure()
|
||||
|
||||
for group in self.param_groups:
|
||||
for p in group["params"]:
|
||||
if p.grad is None:
|
||||
continue
|
||||
|
||||
# Perform optimization step
|
||||
grad = p.grad
|
||||
if grad.is_sparse:
|
||||
raise RuntimeError(
|
||||
"AdamW does not support sparse gradients"
|
||||
)
|
||||
|
||||
state = self.state[p]
|
||||
|
||||
# State initialization
|
||||
if len(state) == 0:
|
||||
state["step"] = 0
|
||||
# Exponential moving average of gradient values
|
||||
state["exp_avg"] = torch.zeros_like(
|
||||
p, memory_format=torch.preserve_format
|
||||
)
|
||||
# Exponential moving average of squared gradient values
|
||||
state["exp_avg_sq"] = torch.zeros_like(
|
||||
p, memory_format=torch.preserve_format
|
||||
)
|
||||
|
||||
exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"]
|
||||
|
||||
beta1, beta2 = group["betas"]
|
||||
|
||||
state["step"] += 1
|
||||
bias_correction1 = 1 - beta1 ** state["step"]
|
||||
bias_correction2 = 1 - beta2 ** state["step"]
|
||||
|
||||
# Decay the first and second moment running average coefficient
|
||||
exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
|
||||
exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
|
||||
denom = (exp_avg_sq.sqrt() * (bias_correction2 ** -0.5)).add_(
|
||||
group["eps"]
|
||||
)
|
||||
|
||||
step_size = group["lr"] / bias_correction1
|
||||
target_rms = group["target_rms"]
|
||||
weight_decay = group["weight_decay"]
|
||||
|
||||
if p.numel() > 1:
|
||||
# avoid applying this weight-decay on "scaling factors"
|
||||
# (which are scalar).
|
||||
is_above_target_rms = p.norm() > (
|
||||
target_rms * (p.numel() ** 0.5)
|
||||
)
|
||||
p.mul_(1 - (weight_decay * is_above_target_rms))
|
||||
|
||||
p.addcdiv_(exp_avg, denom, value=-step_size)
|
||||
|
||||
return loss
|
||||
|
||||
|
||||
class Eden(LRScheduler):
|
||||
"""
|
||||
Eden scheduler.
|
||||
lr = initial_lr * (((batch**2 + lr_batches**2) / lr_batches**2) ** -0.25 *
|
||||
(((epoch**2 + lr_epochs**2) / lr_epochs**2) ** -0.25))
|
||||
|
||||
E.g. suggest initial-lr = 0.003 (passed to optimizer).
|
||||
|
||||
Args:
|
||||
optimizer: the optimizer to change the learning rates on
|
||||
lr_batches: the number of batches after which we start significantly
|
||||
decreasing the learning rate, suggest 5000.
|
||||
lr_epochs: the number of epochs after which we start significantly
|
||||
decreasing the learning rate, suggest 6 if you plan to do e.g.
|
||||
20 to 40 epochs, but may need smaller number if dataset is huge
|
||||
and you will do few epochs.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
optimizer: Optimizer,
|
||||
lr_batches: Union[int, float],
|
||||
lr_epochs: Union[int, float],
|
||||
verbose: bool = False,
|
||||
):
|
||||
super(Eden, self).__init__(optimizer, verbose)
|
||||
self.lr_batches = lr_batches
|
||||
self.lr_epochs = lr_epochs
|
||||
|
||||
def get_lr(self):
|
||||
factor = (
|
||||
(self.batch ** 2 + self.lr_batches ** 2) / self.lr_batches ** 2
|
||||
) ** -0.25 * (
|
||||
((self.epoch ** 2 + self.lr_epochs ** 2) / self.lr_epochs ** 2)
|
||||
** -0.25
|
||||
)
|
||||
return [x * factor for x in self.base_lrs]
|
||||
|
||||
|
||||
def _test_eden():
|
||||
m = torch.nn.Linear(100, 100)
|
||||
optim = Cain(m.parameters(), lr=0.003)
|
||||
|
||||
scheduler = Eden(optim, lr_batches=30, lr_epochs=2, verbose=True)
|
||||
|
||||
for epoch in range(10):
|
||||
scheduler.step_epoch(epoch) # sets epoch to `epoch`
|
||||
|
||||
for step in range(20):
|
||||
x = torch.randn(200, 100).detach()
|
||||
x.requires_grad = True
|
||||
y = m(x)
|
||||
dy = torch.randn(200, 100).detach()
|
||||
f = (y * dy).sum()
|
||||
f.backward()
|
||||
|
||||
optim.step()
|
||||
scheduler.step_batch()
|
||||
optim.zero_grad()
|
||||
|
||||
print("last lr = ", scheduler.get_last_lr())
|
||||
print("state dict = ", scheduler.state_dict())
|
||||
|
||||
|
||||
def _test_eve_cain():
|
||||
import timeit
|
||||
from scaling import ScaledLinear
|
||||
E = 100
|
||||
B = 4
|
||||
T = 2
|
||||
print("in test_eve_cain")
|
||||
device = torch.device('cuda')
|
||||
dtype = torch.float32
|
||||
|
||||
# these input_magnitudes and output_magnitudes are to test that
|
||||
# Abel is working as we expect and is able to adjust scales of
|
||||
# different dims differently.
|
||||
input_magnitudes = (1.0 * torch.randn(E, dtype=dtype, device=device)).exp()
|
||||
output_magnitudes = (1.0 * torch.randn(E, dtype=dtype, device=device)).exp()
|
||||
|
||||
for iter in [1,0]:
|
||||
fix_random_seed(42)
|
||||
Linear = torch.nn.Linear if iter == 0 else ScaledLinear
|
||||
m = torch.nn.Sequential(Linear(E, 200),
|
||||
torch.nn.ReLU(),
|
||||
Linear(200, E)).to(device)
|
||||
|
||||
train_pairs = [ (100.0 * torch.randn(B, T, E, device=device, dtype=dtype) * input_magnitudes,
|
||||
torch.randn(B, T, E, device=device, dtype=dtype) * output_magnitudes) for _ in range(20) ]
|
||||
|
||||
if iter == 0: optim = Eve(m.parameters(), lr=0.003)
|
||||
else: optim = Cain(m.parameters(), lr=0.003)
|
||||
scheduler = Eden(optim, lr_batches=200, lr_epochs=10, verbose=False)
|
||||
|
||||
start = timeit.default_timer()
|
||||
for epoch in range(150):
|
||||
scheduler.step_epoch()
|
||||
for n, (x,y) in enumerate(train_pairs):
|
||||
y_out = m(x)
|
||||
loss = ((y_out - y)**2).mean() * 100.0
|
||||
if n == 0 and epoch % 10 == 0:
|
||||
norm1 = '%.2e' % (m[0].weight**2).mean().sqrt().item()
|
||||
norm1b = '%.2e' % (m[0].bias**2).mean().sqrt().item()
|
||||
norm2 = '%.2e' % (m[2].weight**2).mean().sqrt().item()
|
||||
norm2b = '%.2e' % (m[2].bias**2).mean().sqrt().item()
|
||||
#scale1 = '%.2e' % (m[0].weight_scale.exp().item())
|
||||
#scale1b = '%.2e' % (m[0].bias_scale.exp().item())
|
||||
#scale2 = '%.2e' % (m[2].weight_scale.exp().item())
|
||||
#scale2b = '%.2e' % (m[2].bias_scale.exp().item())
|
||||
print(f"Iter {iter}, epoch {epoch}, batch {n}, loss {loss.item()}, norms={norm1,norm1b,norm2,norm2b}") # scales={scale1,scale1b,scale2,scale2b}
|
||||
loss.log().backward()
|
||||
optim.step()
|
||||
optim.zero_grad()
|
||||
scheduler.step_batch()
|
||||
|
||||
stop = timeit.default_timer()
|
||||
print(f"Iter={iter}, Time taken: {stop - start}")
|
||||
|
||||
print("last lr = ", scheduler.get_last_lr())
|
||||
#print("state dict = ", scheduler.state_dict())
|
||||
#print("optim state_dict = ", optim.state_dict())
|
||||
print("input_magnitudes = ", input_magnitudes)
|
||||
print("output_magnitudes = ", output_magnitudes)
|
||||
|
||||
def stddev(x):
|
||||
return ((x-x.mean())**2).mean().sqrt()
|
||||
print("Un-normalized input col magnitudes log-stddev: ", stddev((m[0].weight**2).sum(dim=0).sqrt().log()))
|
||||
print("Normalized input col magnitudes log-stddev: ", stddev(((m[0].weight**2).sum(dim=0).sqrt() * input_magnitudes).log()))
|
||||
|
||||
print("Un-normalized 0-output row magnitudes log-stddev: ", stddev((m[0].weight**2).sum(dim=1).sqrt().log()))
|
||||
print("Un-normalized 2-input col magnitudes log-stddev: ", stddev((m[2].weight**2).sum(dim=0).sqrt().log()))
|
||||
print("Un-normalized 2-output row magnitudes log-stddev: ", stddev((m[2].weight**2).sum(dim=1).sqrt().log()))
|
||||
|
||||
print("Normalized output row magnitudes log-stddev: ", stddev(((m[2].weight**2).sum(dim=1).sqrt() / output_magnitudes).log()))
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
torch.set_num_threads(1)
|
||||
torch.set_num_interop_threads(1)
|
||||
_test_eve_cain()
|
||||
#_test_eden()
|
1
egs/librispeech/ASR/pruned_transducer_stateless4/optim.py
Symbolic link
1
egs/librispeech/ASR/pruned_transducer_stateless4/optim.py
Symbolic link
@ -0,0 +1 @@
|
||||
../pruned_transducer_stateless2/optim.py
|
@ -1,415 +0,0 @@
|
||||
# Copyright 2022 Xiaomi Corp. (authors: Daniel Povey)
|
||||
#
|
||||
# See ../../../../LICENSE for clarification regarding multiple authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
import collections
|
||||
from itertools import repeat
|
||||
from typing import Optional, Tuple
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from torch import Tensor
|
||||
from torch.nn import Embedding as ScaledEmbedding
|
||||
|
||||
|
||||
def _ntuple(n):
|
||||
def parse(x):
|
||||
if isinstance(x, collections.Iterable):
|
||||
return x
|
||||
return tuple(repeat(x, n))
|
||||
|
||||
return parse
|
||||
|
||||
|
||||
_single = _ntuple(1)
|
||||
_pair = _ntuple(2)
|
||||
|
||||
|
||||
class ActivationBalancerFunction(torch.autograd.Function):
|
||||
@staticmethod
|
||||
def forward(
|
||||
ctx,
|
||||
x: Tensor,
|
||||
channel_dim: int,
|
||||
min_positive: float, # e.g. 0.05
|
||||
max_positive: float, # e.g. 0.95
|
||||
max_factor: float, # e.g. 0.01
|
||||
min_abs: float, # e.g. 0.2
|
||||
max_abs: float, # e.g. 100.0
|
||||
) -> Tensor:
|
||||
if x.requires_grad:
|
||||
if channel_dim < 0:
|
||||
channel_dim += x.ndim
|
||||
sum_dims = [d for d in range(x.ndim) if d != channel_dim]
|
||||
xgt0 = x > 0
|
||||
proportion_positive = torch.mean(
|
||||
xgt0.to(x.dtype), dim=sum_dims, keepdim=True
|
||||
)
|
||||
factor1 = (
|
||||
(min_positive - proportion_positive).relu()
|
||||
* (max_factor / min_positive)
|
||||
if min_positive != 0.0
|
||||
else 0.0
|
||||
)
|
||||
factor2 = (
|
||||
(proportion_positive - max_positive).relu()
|
||||
* (max_factor / (max_positive - 1.0))
|
||||
if max_positive != 1.0
|
||||
else 0.0
|
||||
)
|
||||
factor = factor1 + factor2
|
||||
if isinstance(factor, float):
|
||||
factor = torch.zeros_like(proportion_positive)
|
||||
|
||||
mean_abs = torch.mean(x.abs(), dim=sum_dims, keepdim=True)
|
||||
below_threshold = mean_abs < min_abs
|
||||
above_threshold = mean_abs > max_abs
|
||||
|
||||
ctx.save_for_backward(
|
||||
factor, xgt0, below_threshold, above_threshold
|
||||
)
|
||||
ctx.max_factor = max_factor
|
||||
ctx.sum_dims = sum_dims
|
||||
return x
|
||||
|
||||
@staticmethod
|
||||
def backward(
|
||||
ctx, x_grad: Tensor
|
||||
) -> Tuple[Tensor, None, None, None, None, None, None]:
|
||||
factor, xgt0, below_threshold, above_threshold = ctx.saved_tensors
|
||||
dtype = x_grad.dtype
|
||||
scale_factor = (
|
||||
(below_threshold.to(dtype) - above_threshold.to(dtype))
|
||||
* (xgt0.to(dtype) - 0.5)
|
||||
* (ctx.max_factor * 2.0)
|
||||
)
|
||||
|
||||
neg_delta_grad = x_grad.abs() * (factor + scale_factor)
|
||||
return x_grad - neg_delta_grad, None, None, None, None, None, None
|
||||
|
||||
|
||||
class BasicNorm(torch.nn.Module):
|
||||
"""
|
||||
This is intended to be a simpler, and hopefully cheaper, replacement for
|
||||
LayerNorm. The observation this is based on, is that Transformer-type
|
||||
networks, especially with pre-norm, sometimes seem to set one of the
|
||||
feature dimensions to a large constant value (e.g. 50), which "defeats"
|
||||
the LayerNorm because the output magnitude is then not strongly dependent
|
||||
on the other (useful) features. Presumably the weight and bias of the
|
||||
LayerNorm are required to allow it to do this.
|
||||
|
||||
So the idea is to introduce this large constant value as an explicit
|
||||
parameter, that takes the role of the "eps" in LayerNorm, so the network
|
||||
doesn't have to do this trick. We make the "eps" learnable.
|
||||
|
||||
Args:
|
||||
num_channels: the number of channels, e.g. 512.
|
||||
channel_dim: the axis/dimension corresponding to the channel,
|
||||
interprted as an offset from the input's ndim if negative.
|
||||
shis is NOT the num_channels; it should typically be one of
|
||||
{-2, -1, 0, 1, 2, 3}.
|
||||
eps: the initial "epsilon" that we add as ballast in:
|
||||
scale = ((input_vec**2).mean() + epsilon)**-0.5
|
||||
Note: our epsilon is actually large, but we keep the name
|
||||
to indicate the connection with conventional LayerNorm.
|
||||
learn_eps: if true, we learn epsilon; if false, we keep it
|
||||
at the initial value.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
num_channels: int,
|
||||
channel_dim: int = -1, # CAUTION: see documentation.
|
||||
eps: float = 0.25,
|
||||
learn_eps: bool = True,
|
||||
) -> None:
|
||||
super(BasicNorm, self).__init__()
|
||||
self.num_channels = num_channels
|
||||
self.channel_dim = channel_dim
|
||||
if learn_eps:
|
||||
self.eps = nn.Parameter(torch.tensor(eps).log().detach())
|
||||
else:
|
||||
self.register_buffer("eps", torch.tensor(eps).log().detach())
|
||||
|
||||
def forward(self, x: Tensor) -> Tensor:
|
||||
assert x.shape[self.channel_dim] == self.num_channels
|
||||
scales = (
|
||||
torch.mean(x ** 2, dim=self.channel_dim, keepdim=True)
|
||||
+ self.eps.exp()
|
||||
) ** -0.5
|
||||
return x * scales
|
||||
|
||||
|
||||
class ScaledLinear(nn.Linear):
|
||||
"""
|
||||
A modified version of nn.Linear that gives an easy way to set the
|
||||
default initial parameter scale.
|
||||
|
||||
Args:
|
||||
Accepts the standard args and kwargs that nn.Linear accepts
|
||||
e.g. in_features, out_features, bias=False.
|
||||
|
||||
initial_scale: you can override this if you want to increase
|
||||
or decrease the initial magnitude of the module's output
|
||||
(affects the initialization of weight_scale and bias_scale).
|
||||
Another option, if you want to do something like this, is
|
||||
to re-initialize the parameters.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*args,
|
||||
initial_scale: float = 1.0,
|
||||
**kwargs
|
||||
):
|
||||
super(ScaledLinear, self).__init__(*args, **kwargs)
|
||||
with torch.no_grad():
|
||||
self.weight[:] *= initial_scale
|
||||
if self.bias is not None:
|
||||
torch.nn.init.uniform_(self.bias,
|
||||
-0.1 * initial_scale,
|
||||
0.1 * initial_scale)
|
||||
|
||||
|
||||
|
||||
class ScaledConv1d(nn.Conv1d):
|
||||
# See docs for ScaledLinear
|
||||
def __init__(
|
||||
self,
|
||||
*args,
|
||||
initial_scale: float = 1.0,
|
||||
**kwargs
|
||||
):
|
||||
super(ScaledConv1d, self).__init__(*args, **kwargs)
|
||||
with torch.no_grad():
|
||||
self.weight[:] *= initial_scale
|
||||
if self.bias is not None:
|
||||
torch.nn.init.uniform_(self.bias,
|
||||
-0.1 * initial_scale,
|
||||
0.1 * initial_scale)
|
||||
|
||||
def get_weight(self): # TODO: delete
|
||||
return self.weight
|
||||
|
||||
def get_bias(self): # TODO: delete
|
||||
return self.bias
|
||||
|
||||
|
||||
class ScaledConv2d(nn.Conv2d):
|
||||
# See docs for ScaledLinear
|
||||
def __init__(
|
||||
self,
|
||||
*args,
|
||||
initial_scale: float = 1.0,
|
||||
**kwargs
|
||||
):
|
||||
super(ScaledConv2d, self).__init__(*args, **kwargs)
|
||||
with torch.no_grad():
|
||||
self.weight[:] *= initial_scale
|
||||
if self.bias is not None:
|
||||
torch.nn.init.uniform_(self.bias,
|
||||
-0.1 * initial_scale,
|
||||
0.1 * initial_scale)
|
||||
|
||||
def get_weight(self):
|
||||
return self.weight
|
||||
|
||||
def get_bias(self):
|
||||
return self.bias
|
||||
|
||||
|
||||
|
||||
class ActivationBalancer(torch.nn.Module):
|
||||
"""
|
||||
Modifies the backpropped derivatives of a function to try to encourage, for
|
||||
each channel, that it is positive at least a proportion `threshold` of the
|
||||
time. It does this by multiplying negative derivative values by up to
|
||||
(1+max_factor), and positive derivative values by up to (1-max_factor),
|
||||
interpolated from 1 at the threshold to those extremal values when none
|
||||
of the inputs are positive.
|
||||
|
||||
|
||||
Args:
|
||||
channel_dim: the dimension/axis corresponding to the channel, e.g.
|
||||
-1, 0, 1, 2; will be interpreted as an offset from x.ndim if negative.
|
||||
min_positive: the minimum, per channel, of the proportion of the time
|
||||
that (x > 0), below which we start to modify the derivatives.
|
||||
max_positive: the maximum, per channel, of the proportion of the time
|
||||
that (x > 0), above which we start to modify the derivatives.
|
||||
max_factor: the maximum factor by which we modify the derivatives for
|
||||
either the sign constraint or the magnitude constraint;
|
||||
e.g. with max_factor=0.02, the the derivatives would be multiplied by
|
||||
values in the range [0.98..1.02].
|
||||
min_abs: the minimum average-absolute-value per channel, which
|
||||
we allow, before we start to modify the derivatives to prevent
|
||||
this.
|
||||
max_abs: the maximum average-absolute-value per channel, which
|
||||
we allow, before we start to modify the derivatives to prevent
|
||||
this.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
channel_dim: int,
|
||||
min_positive: float = 0.05,
|
||||
max_positive: float = 0.95,
|
||||
max_factor: float = 0.01,
|
||||
min_abs: float = 0.2,
|
||||
max_abs: float = 100.0,
|
||||
):
|
||||
super(ActivationBalancer, self).__init__()
|
||||
self.channel_dim = channel_dim
|
||||
self.min_positive = min_positive
|
||||
self.max_positive = max_positive
|
||||
self.max_factor = max_factor
|
||||
self.min_abs = min_abs
|
||||
self.max_abs = max_abs
|
||||
|
||||
def forward(self, x: Tensor) -> Tensor:
|
||||
if torch.jit.is_scripting():
|
||||
return x
|
||||
|
||||
return ActivationBalancerFunction.apply(
|
||||
x,
|
||||
self.channel_dim,
|
||||
self.min_positive,
|
||||
self.max_positive,
|
||||
self.max_factor,
|
||||
self.min_abs,
|
||||
self.max_abs,
|
||||
)
|
||||
|
||||
|
||||
class DoubleSwishFunction(torch.autograd.Function):
|
||||
"""
|
||||
double_swish(x) = x * torch.sigmoid(x-1)
|
||||
This is a definition, originally motivated by its close numerical
|
||||
similarity to swish(swish(x)), where swish(x) = x * sigmoid(x).
|
||||
|
||||
Memory-efficient derivative computation:
|
||||
double_swish(x) = x * s, where s(x) = torch.sigmoid(x-1)
|
||||
double_swish'(x) = d/dx double_swish(x) = x * s'(x) + x' * s(x) = x * s'(x) + s(x).
|
||||
Now, s'(x) = s(x) * (1-s(x)).
|
||||
double_swish'(x) = x * s'(x) + s(x).
|
||||
= x * s(x) * (1-s(x)) + s(x).
|
||||
= double_swish(x) * (1-s(x)) + s(x)
|
||||
... so we just need to remember s(x) but not x itself.
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def forward(ctx, x: Tensor) -> Tensor:
|
||||
x = x.detach()
|
||||
s = torch.sigmoid(x - 1.0)
|
||||
y = x * s
|
||||
ctx.save_for_backward(s, y)
|
||||
return y
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, y_grad: Tensor) -> Tensor:
|
||||
s, y = ctx.saved_tensors
|
||||
return (y * (1 - s) + s) * y_grad
|
||||
|
||||
|
||||
class DoubleSwish(torch.nn.Module):
|
||||
def forward(self, x: Tensor) -> Tensor:
|
||||
"""Return double-swish activation function which is an approximation to Swish(Swish(x)),
|
||||
that we approximate closely with x * sigmoid(x-1).
|
||||
"""
|
||||
if torch.jit.is_scripting():
|
||||
return x * torch.sigmoid(x - 1.0)
|
||||
return DoubleSwishFunction.apply(x)
|
||||
|
||||
|
||||
|
||||
|
||||
def _test_activation_balancer_sign():
|
||||
probs = torch.arange(0, 1, 0.01)
|
||||
N = 1000
|
||||
x = 1.0 * (torch.rand(probs.numel(), N) < probs.unsqueeze(-1))
|
||||
x = x.detach()
|
||||
x.requires_grad = True
|
||||
m = ActivationBalancer(
|
||||
channel_dim=0,
|
||||
min_positive=0.05,
|
||||
max_positive=0.95,
|
||||
max_factor=0.2,
|
||||
min_abs=0.0,
|
||||
)
|
||||
|
||||
y_grad = torch.sign(torch.randn(probs.numel(), N))
|
||||
|
||||
y = m(x)
|
||||
y.backward(gradient=y_grad)
|
||||
print("_test_activation_balancer_sign: x = ", x)
|
||||
print("_test_activation_balancer_sign: y grad = ", y_grad)
|
||||
print("_test_activation_balancer_sign: x grad = ", x.grad)
|
||||
|
||||
|
||||
def _test_activation_balancer_magnitude():
|
||||
magnitudes = torch.arange(0, 1, 0.01)
|
||||
N = 1000
|
||||
x = torch.sign(torch.randn(magnitudes.numel(), N)) * magnitudes.unsqueeze(
|
||||
-1
|
||||
)
|
||||
x = x.detach()
|
||||
x.requires_grad = True
|
||||
m = ActivationBalancer(
|
||||
channel_dim=0,
|
||||
min_positive=0.0,
|
||||
max_positive=1.0,
|
||||
max_factor=0.2,
|
||||
min_abs=0.2,
|
||||
max_abs=0.8,
|
||||
)
|
||||
|
||||
y_grad = torch.sign(torch.randn(magnitudes.numel(), N))
|
||||
|
||||
y = m(x)
|
||||
y.backward(gradient=y_grad)
|
||||
print("_test_activation_balancer_magnitude: x = ", x)
|
||||
print("_test_activation_balancer_magnitude: y grad = ", y_grad)
|
||||
print("_test_activation_balancer_magnitude: x grad = ", x.grad)
|
||||
|
||||
|
||||
def _test_basic_norm():
|
||||
num_channels = 128
|
||||
m = BasicNorm(num_channels=num_channels, channel_dim=1)
|
||||
|
||||
x = torch.randn(500, num_channels)
|
||||
|
||||
y = m(x)
|
||||
|
||||
assert y.shape == x.shape
|
||||
x_rms = (x ** 2).mean().sqrt()
|
||||
y_rms = (y ** 2).mean().sqrt()
|
||||
print("x rms = ", x_rms)
|
||||
print("y rms = ", y_rms)
|
||||
assert y_rms < x_rms
|
||||
assert y_rms > 0.5 * x_rms
|
||||
|
||||
|
||||
def _test_double_swish_deriv():
|
||||
x = torch.randn(10, 12, dtype=torch.double) * 0.5
|
||||
x.requires_grad = True
|
||||
m = DoubleSwish()
|
||||
torch.autograd.gradcheck(m, x)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
_test_activation_balancer_sign()
|
||||
_test_activation_balancer_magnitude()
|
||||
_test_basic_norm()
|
||||
_test_double_swish_deriv()
|
1
egs/librispeech/ASR/pruned_transducer_stateless4/scaling.py
Symbolic link
1
egs/librispeech/ASR/pruned_transducer_stateless4/scaling.py
Symbolic link
@ -0,0 +1 @@
|
||||
../pruned_transducer_stateless2/scaling.py
|
@ -66,7 +66,7 @@ from lhotse.cut import Cut
|
||||
from lhotse.dataset.sampling.base import CutSampler
|
||||
from lhotse.utils import fix_random_seed
|
||||
from model import Transducer
|
||||
from optim import Eden, Cain
|
||||
from optim import Eden, Eve
|
||||
from torch import Tensor
|
||||
from torch.cuda.amp import GradScaler
|
||||
from torch.nn.parallel import DistributedDataParallel as DDP
|
||||
@ -859,7 +859,7 @@ def run(rank, world_size, args):
|
||||
model_avg: Optional[nn.Module] = None
|
||||
if rank == 0:
|
||||
# model_avg is only used with rank 0
|
||||
model_avg = copy.deepcopy(model).to(torch.float64)
|
||||
model_avg = copy.deepcopy(model)
|
||||
|
||||
assert params.start_epoch > 0, params.start_epoch
|
||||
checkpoints = load_checkpoint_if_available(
|
||||
@ -871,7 +871,7 @@ def run(rank, world_size, args):
|
||||
logging.info("Using DDP")
|
||||
model = DDP(model, device_ids=[rank])
|
||||
|
||||
optimizer = Cain(model.parameters(), lr=params.initial_lr)
|
||||
optimizer = Eve(model.parameters(), lr=params.initial_lr)
|
||||
|
||||
scheduler = Eden(optimizer, params.lr_batches, params.lr_epochs)
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user