diff --git a/egs/librispeech/ASR/pruned_transducer_stateless2/scaling.py b/egs/librispeech/ASR/pruned_transducer_stateless2/scaling.py index d59aa2160..f89d2963e 100644 --- a/egs/librispeech/ASR/pruned_transducer_stateless2/scaling.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless2/scaling.py @@ -367,7 +367,7 @@ class ActivationBalancer(torch.nn.Module): min_positive: the minimum, per channel, of the proportion of the time that (x > 0), below which we start to modify the derivatives. max_positive: the maximum, per channel, of the proportion of the time - that (x > 0), below which we start to modify the derivatives. + that (x > 0), above which we start to modify the derivatives. max_factor: the maximum factor by which we modify the derivatives for either the sign constraint or the magnitude constraint; e.g. with max_factor=0.02, the the derivatives would be multiplied by @@ -413,7 +413,7 @@ class DoubleSwishFunction(torch.autograd.Function): """ double_swish(x) = x * torch.sigmoid(x-1) This is a definition, originally motivated by its close numerical - similarity to swish(swish(x), where swish(x) = x * sigmoid(x). + similarity to swish(swish(x)), where swish(x) = x * sigmoid(x). Memory-efficient derivative computation: double_swish(x) = x * s, where s(x) = torch.sigmoid(x-1) diff --git a/icefall/diagnostics.py b/icefall/diagnostics.py index ce4ac1464..bc8fe3069 100644 --- a/icefall/diagnostics.py +++ b/icefall/diagnostics.py @@ -111,7 +111,7 @@ def get_diagnostics_for_dim( options object sizes_same: True if all the tensor sizes are the same on this dimension - stats_type: either "abs" or "positive" or "eigs" or "value", + stats_type: either "abs" or "positive" or "eigs" or "value", imdictates the type of stats we accumulate, abs is mean absolute value, "positive" is proportion of positive to nonnegative values, "eigs" is eigenvalues after doing outer product on this dim, sum