diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7/scaling.py b/egs/librispeech/ASR/pruned_transducer_stateless7/scaling.py
index be6f94412..d4c288545 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless7/scaling.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7/scaling.py
@@ -922,7 +922,8 @@ class DoubleSwishFunction(torch.autograd.Function):
         if requires_grad:
             # discretize s.  This should be expectation-preserving if we just divide the
             # result by 255.
-            s = ((s * 255) + torch.rand_like(s)).to(torch.uint8)
+            s = s.to(torch.float)
+            s = ((s * 254.99) + torch.rand_like(s)).to(torch.uint8)
             ctx.save_for_backward(s, y)
         return y