mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-12-11 06:55:27 +00:00
Merge branch 'scaled_adam_exp273' into scaled_adam_exp277
# Conflicts: # egs/librispeech/ASR/pruned_transducer_stateless7/zipformer.py
This commit is contained in:
commit
70300e34d3
@ -1434,10 +1434,12 @@ class ModifiedSEModule(nn.Module):
|
||||
"""
|
||||
def __init__(self,
|
||||
d_model: int,
|
||||
bottleneck_dim: int = 16):
|
||||
bottleneck_dim: int = 8):
|
||||
super().__init__()
|
||||
self.squeeze_proj = nn.Linear(d_model, d_model,
|
||||
self.squeeze_proj = nn.Linear(d_model, bottleneck_dim,
|
||||
bias=False)
|
||||
|
||||
|
||||
self.in_proj = nn.Linear(d_model, d_model,
|
||||
bias=False)
|
||||
|
||||
@ -1449,22 +1451,12 @@ class ModifiedSEModule(nn.Module):
|
||||
self.balancer = ActivationBalancer(
|
||||
d_model, channel_dim=-1,
|
||||
min_positive=0.05, max_positive=0.95,
|
||||
min_abs=0.1,
|
||||
max_abs=50.0,
|
||||
max_factor=0.01,
|
||||
max_factor=0.02,
|
||||
min_prob=0.2,
|
||||
)
|
||||
self.activation = DoubleSwish()
|
||||
self.to_bottleneck_proj = ScaledLinear(d_model, bottleneck_dim)
|
||||
|
||||
self.bottleneck_balancer = ActivationBalancer(
|
||||
bottleneck_dim, channel_dim=-1,
|
||||
min_positive=0.05, max_positive=0.95,
|
||||
max_abs=5.0,
|
||||
min_abs=0.5,
|
||||
max_factor=0.01,
|
||||
min_prob=0.2,
|
||||
)
|
||||
#self.bottleneck_norm = BasicNorm(bottleneck_dim)
|
||||
|
||||
self.from_bottleneck_proj = ScaledLinear(bottleneck_dim, d_model)
|
||||
|
||||
@ -1503,14 +1495,8 @@ class ModifiedSEModule(nn.Module):
|
||||
squeezed = self.squeeze_proj(squeezed)
|
||||
squeezed = self.balancer(squeezed)
|
||||
squeezed = self.activation(squeezed)
|
||||
squeezed = self.to_bottleneck_proj(squeezed)
|
||||
squeezed = self.bottleneck_balancer(squeezed)
|
||||
#squeezed = self.bottleneck_norm(squeezed)
|
||||
squeezed = self.from_bottleneck_proj(squeezed)
|
||||
if random.random() < 0.05:
|
||||
# to stop a hopefully-unlikely failure mode where the inputs to the sigmoid
|
||||
# get too large and the grads get mostly too small.
|
||||
squeezed = penalize_abs_values_gt(squeezed, limit=10.0, penalty=1.0e-04)
|
||||
|
||||
|
||||
x = self.in_proj(x)
|
||||
x = x * squeezed
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user