mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-08 09:32:20 +00:00
streaming_decode.py, relax the audio range from [-1,+1] to [-10,+10] (#1448)
- some AudioTransform classes produce audio signals out of range [-1,+1] - Resample produced 1.0079 - The range [-10,+10] was chosen to still be able to reliably distinguish from the [-32k,+32k] signal... - this is related to : https://github.com/lhotse-speech/lhotse/issues/1254
This commit is contained in:
parent
8136ad775b
commit
716b82cc3a
@ -342,7 +342,12 @@ def decode_dataset(
|
||||
assert audio.dtype == np.float32, audio.dtype
|
||||
|
||||
# The trained model is using normalized samples
|
||||
assert audio.max() <= 1, "Should be normalized to [-1, 1])"
|
||||
# - this is to avoid sending [-32k,+32k] signal in...
|
||||
# - some lhotse AudioTransform classes can make the signal
|
||||
# be out of range [-1, 1], hence the tolerance 10
|
||||
assert (
|
||||
np.abs(audio).max() <= 10
|
||||
), "Should be normalized to [-1, 1], 10 for tolerance..."
|
||||
|
||||
samples = torch.from_numpy(audio).squeeze(0)
|
||||
|
||||
|
@ -597,12 +597,12 @@ def decode_dataset(
|
||||
assert audio.dtype == np.float32, audio.dtype
|
||||
|
||||
# The trained model is using normalized samples
|
||||
if audio.max() > 1:
|
||||
logging.warning(
|
||||
f"The audio should be normalized to [-1, 1], audio.max : {audio.max()}."
|
||||
f"Clipping to [-1, 1]."
|
||||
)
|
||||
audio = np.clip(audio, -1, 1)
|
||||
# - this is to avoid sending [-32k,+32k] signal in...
|
||||
# - some lhotse AudioTransform classes can make the signal
|
||||
# be out of range [-1, 1], hence the tolerance 10
|
||||
assert (
|
||||
np.abs(audio).max() <= 10
|
||||
), "Should be normalized to [-1, 1], 10 for tolerance..."
|
||||
|
||||
samples = torch.from_numpy(audio).squeeze(0)
|
||||
|
||||
|
@ -362,7 +362,12 @@ def decode_dataset(
|
||||
assert audio.dtype == np.float32, audio.dtype
|
||||
|
||||
# The trained model is using normalized samples
|
||||
assert audio.max() <= 1, "Should be normalized to [-1, 1])"
|
||||
# - this is to avoid sending [-32k,+32k] signal in...
|
||||
# - some lhotse AudioTransform classes can make the signal
|
||||
# be out of range [-1, 1], hence the tolerance 10
|
||||
assert (
|
||||
np.abs(audio).max() <= 10
|
||||
), "Should be normalized to [-1, 1], 10 for tolerance..."
|
||||
|
||||
samples = torch.from_numpy(audio).squeeze(0)
|
||||
|
||||
|
@ -578,7 +578,12 @@ def decode_dataset(
|
||||
assert audio.dtype == np.float32, audio.dtype
|
||||
|
||||
# The trained model is using normalized samples
|
||||
assert audio.max() <= 1, "Should be normalized to [-1, 1])"
|
||||
# - this is to avoid sending [-32k,+32k] signal in...
|
||||
# - some lhotse AudioTransform classes can make the signal
|
||||
# be out of range [-1, 1], hence the tolerance 10
|
||||
assert (
|
||||
np.abs(audio).max() <= 10
|
||||
), "Should be normalized to [-1, 1], 10 for tolerance..."
|
||||
|
||||
samples = torch.from_numpy(audio).squeeze(0)
|
||||
|
||||
|
@ -681,8 +681,14 @@ def decode_dataset(
|
||||
assert len(audio.shape) == 2
|
||||
assert audio.shape[0] == 1, "Should be single channel"
|
||||
assert audio.dtype == np.float32, audio.dtype
|
||||
|
||||
# The trained model is using normalized samples
|
||||
assert audio.max() <= 1, "Should be normalized to [-1, 1])"
|
||||
# - this is to avoid sending [-32k,+32k] signal in...
|
||||
# - some lhotse AudioTransform classes can make the signal
|
||||
# be out of range [-1, 1], hence the tolerance 10
|
||||
assert (
|
||||
np.abs(audio).max() <= 10
|
||||
), "Should be normalized to [-1, 1], 10 for tolerance..."
|
||||
|
||||
samples = torch.from_numpy(audio).squeeze(0)
|
||||
feature = fbank(samples)
|
||||
|
@ -681,8 +681,14 @@ def decode_dataset(
|
||||
assert len(audio.shape) == 2
|
||||
assert audio.shape[0] == 1, "Should be single channel"
|
||||
assert audio.dtype == np.float32, audio.dtype
|
||||
|
||||
# The trained model is using normalized samples
|
||||
assert audio.max() <= 1, "Should be normalized to [-1, 1])"
|
||||
# - this is to avoid sending [-32k,+32k] signal in...
|
||||
# - some lhotse AudioTransform classes can make the signal
|
||||
# be out of range [-1, 1], hence the tolerance 10
|
||||
assert (
|
||||
np.abs(audio).max() <= 10
|
||||
), "Should be normalized to [-1, 1], 10 for tolerance..."
|
||||
|
||||
samples = torch.from_numpy(audio).squeeze(0)
|
||||
feature = fbank(samples)
|
||||
|
@ -673,8 +673,14 @@ def decode_dataset(
|
||||
assert len(audio.shape) == 2
|
||||
assert audio.shape[0] == 1, "Should be single channel"
|
||||
assert audio.dtype == np.float32, audio.dtype
|
||||
|
||||
# The trained model is using normalized samples
|
||||
assert audio.max() <= 1, "Should be normalized to [-1, 1])"
|
||||
# - this is to avoid sending [-32k,+32k] signal in...
|
||||
# - some lhotse AudioTransform classes can make the signal
|
||||
# be out of range [-1, 1], hence the tolerance 10
|
||||
assert (
|
||||
np.abs(audio).max() <= 10
|
||||
), "Should be normalized to [-1, 1], 10 for tolerance..."
|
||||
|
||||
samples = torch.from_numpy(audio).squeeze(0)
|
||||
feature = fbank(samples)
|
||||
|
@ -673,8 +673,14 @@ def decode_dataset(
|
||||
assert len(audio.shape) == 2
|
||||
assert audio.shape[0] == 1, "Should be single channel"
|
||||
assert audio.dtype == np.float32, audio.dtype
|
||||
|
||||
# The trained model is using normalized samples
|
||||
assert audio.max() <= 1, "Should be normalized to [-1, 1])"
|
||||
# - this is to avoid sending [-32k,+32k] signal in...
|
||||
# - some lhotse AudioTransform classes can make the signal
|
||||
# be out of range [-1, 1], hence the tolerance 10
|
||||
assert (
|
||||
np.abs(audio).max() <= 10
|
||||
), "Should be normalized to [-1, 1], 10 for tolerance..."
|
||||
|
||||
samples = torch.from_numpy(audio).squeeze(0)
|
||||
feature = fbank(samples)
|
||||
|
@ -359,7 +359,12 @@ def decode_dataset(
|
||||
assert audio.dtype == np.float32, audio.dtype
|
||||
|
||||
# The trained model is using normalized samples
|
||||
assert audio.max() <= 1, "Should be normalized to [-1, 1])"
|
||||
# - this is to avoid sending [-32k,+32k] signal in...
|
||||
# - some lhotse AudioTransform classes can make the signal
|
||||
# be out of range [-1, 1], hence the tolerance 10
|
||||
assert (
|
||||
np.abs(audio).max() <= 10
|
||||
), "Should be normalized to [-1, 1], 10 for tolerance..."
|
||||
|
||||
samples = torch.from_numpy(audio).squeeze(0)
|
||||
|
||||
|
@ -361,7 +361,12 @@ def decode_dataset(
|
||||
assert audio.dtype == np.float32, audio.dtype
|
||||
|
||||
# The trained model is using normalized samples
|
||||
assert audio.max() <= 1, "Should be normalized to [-1, 1])"
|
||||
# - this is to avoid sending [-32k,+32k] signal in...
|
||||
# - some lhotse AudioTransform classes can make the signal
|
||||
# be out of range [-1, 1], hence the tolerance 10
|
||||
assert (
|
||||
np.abs(audio).max() <= 10
|
||||
), "Should be normalized to [-1, 1], 10 for tolerance..."
|
||||
|
||||
samples = torch.from_numpy(audio).squeeze(0)
|
||||
|
||||
|
@ -362,7 +362,12 @@ def decode_dataset(
|
||||
assert audio.dtype == np.float32, audio.dtype
|
||||
|
||||
# The trained model is using normalized samples
|
||||
assert audio.max() <= 1, "Should be normalized to [-1, 1])"
|
||||
# - this is to avoid sending [-32k,+32k] signal in...
|
||||
# - some lhotse AudioTransform classes can make the signal
|
||||
# be out of range [-1, 1], hence the tolerance 10
|
||||
assert (
|
||||
np.abs(audio).max() <= 10
|
||||
), "Should be normalized to [-1, 1], 10 for tolerance..."
|
||||
|
||||
samples = torch.from_numpy(audio).squeeze(0)
|
||||
|
||||
|
@ -378,7 +378,12 @@ def decode_dataset(
|
||||
assert audio.dtype == np.float32, audio.dtype
|
||||
|
||||
# The trained model is using normalized samples
|
||||
assert audio.max() <= 1, "Should be normalized to [-1, 1])"
|
||||
# - this is to avoid sending [-32k,+32k] signal in...
|
||||
# - some lhotse AudioTransform classes can make the signal
|
||||
# be out of range [-1, 1], hence the tolerance 10
|
||||
assert (
|
||||
np.abs(audio).max() <= 10
|
||||
), "Should be normalized to [-1, 1], 10 for tolerance..."
|
||||
|
||||
samples = torch.from_numpy(audio).squeeze(0)
|
||||
|
||||
|
@ -378,7 +378,12 @@ def decode_dataset(
|
||||
assert audio.dtype == np.float32, audio.dtype
|
||||
|
||||
# The trained model is using normalized samples
|
||||
assert audio.max() <= 1, "Should be normalized to [-1, 1])"
|
||||
# - this is to avoid sending [-32k,+32k] signal in...
|
||||
# - some lhotse AudioTransform classes can make the signal
|
||||
# be out of range [-1, 1], hence the tolerance 10
|
||||
assert (
|
||||
np.abs(audio).max() <= 10
|
||||
), "Should be normalized to [-1, 1], 10 for tolerance..."
|
||||
|
||||
samples = torch.from_numpy(audio).squeeze(0)
|
||||
|
||||
|
@ -345,7 +345,12 @@ def decode_dataset(
|
||||
assert audio.dtype == np.float32, audio.dtype
|
||||
|
||||
# The trained model is using normalized samples
|
||||
assert audio.max() <= 1, "Should be normalized to [-1, 1])"
|
||||
# - this is to avoid sending [-32k,+32k] signal in...
|
||||
# - some lhotse AudioTransform classes can make the signal
|
||||
# be out of range [-1, 1], hence the tolerance 10
|
||||
assert (
|
||||
np.abs(audio).max() <= 10
|
||||
), "Should be normalized to [-1, 1], 10 for tolerance..."
|
||||
|
||||
samples = torch.from_numpy(audio).squeeze(0)
|
||||
|
||||
|
@ -345,7 +345,12 @@ def decode_dataset(
|
||||
assert audio.dtype == np.float32, audio.dtype
|
||||
|
||||
# The trained model is using normalized samples
|
||||
assert audio.max() <= 1, "Should be normalized to [-1, 1])"
|
||||
# - this is to avoid sending [-32k,+32k] signal in...
|
||||
# - some lhotse AudioTransform classes can make the signal
|
||||
# be out of range [-1, 1], hence the tolerance 10
|
||||
assert (
|
||||
np.abs(audio).max() <= 10
|
||||
), "Should be normalized to [-1, 1], 10 for tolerance..."
|
||||
|
||||
samples = torch.from_numpy(audio).squeeze(0)
|
||||
|
||||
|
@ -577,7 +577,12 @@ def decode_dataset(
|
||||
assert audio.dtype == np.float32, audio.dtype
|
||||
|
||||
# The trained model is using normalized samples
|
||||
assert audio.max() <= 1, "Should be normalized to [-1, 1])"
|
||||
# - this is to avoid sending [-32k,+32k] signal in...
|
||||
# - some lhotse AudioTransform classes can make the signal
|
||||
# be out of range [-1, 1], hence the tolerance 10
|
||||
assert (
|
||||
np.abs(audio).max() <= 10
|
||||
), "Should be normalized to [-1, 1], 10 for tolerance..."
|
||||
|
||||
samples = torch.from_numpy(audio).squeeze(0)
|
||||
|
||||
|
@ -402,6 +402,14 @@ def decode_dataset(
|
||||
assert audio.shape[0] == 1, "Should be single channel"
|
||||
assert audio.dtype == np.float32, audio.dtype
|
||||
|
||||
# The trained model is using normalized samples
|
||||
# - this is to avoid sending [-32k,+32k] signal in...
|
||||
# - some lhotse AudioTransform classes can make the signal
|
||||
# be out of range [-1, 1], hence the tolerance 10
|
||||
assert (
|
||||
np.abs(audio).max() <= 10
|
||||
), "Should be normalized to [-1, 1], 10 for tolerance..."
|
||||
|
||||
samples = torch.from_numpy(audio).squeeze(0)
|
||||
|
||||
fbank = Fbank(opts)
|
||||
|
@ -597,12 +597,12 @@ def decode_dataset(
|
||||
assert audio.dtype == np.float32, audio.dtype
|
||||
|
||||
# The trained model is using normalized samples
|
||||
if audio.max() > 1:
|
||||
logging.warning(
|
||||
f"The audio should be normalized to [-1, 1], audio.max : {audio.max()}."
|
||||
f"Clipping to [-1, 1]."
|
||||
)
|
||||
audio = np.clip(audio, -1, 1)
|
||||
# - this is to avoid sending [-32k,+32k] signal in...
|
||||
# - some lhotse AudioTransform classes can make the signal
|
||||
# be out of range [-1, 1], hence the tolerance 10
|
||||
assert (
|
||||
np.abs(audio).max() <= 10
|
||||
), "Should be normalized to [-1, 1], 10 for tolerance..."
|
||||
|
||||
samples = torch.from_numpy(audio).squeeze(0)
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user