Mirror of https://github.com/k2-fsa/icefall.git (synced 2025-08-09 01:52:41 +00:00)
streaming_decode.py, relax the audio range from [-1,+1] to [-10,+10] (#1448)
- some lhotse AudioTransform classes produce audio signals out of the range [-1,+1]; Resample produced 1.0079
- the range [-10,+10] was chosen so that normalized audio can still be reliably distinguished from a [-32k,+32k] signal...
- this is related to: https://github.com/lhotse-speech/lhotse/issues/1254
This commit is contained in:
parent: 8136ad775b
commit: 716b82cc3a
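For context, here is a minimal standalone sketch of the relaxed check (plain NumPy; the helper name check_audio_range is hypothetical, while the assertion itself mirrors the lines added in the hunks below):

import numpy as np

def check_audio_range(audio: np.ndarray) -> None:
    # Float audio is expected to be roughly in [-1, 1]; some lhotse
    # AudioTransform classes (e.g. Resample) can overshoot slightly
    # (1.0079 was observed), so values up to 10 are tolerated.
    # An un-normalized int16-scaled signal (up to +/-32768) still fails.
    assert audio.dtype == np.float32, audio.dtype
    assert (
        np.abs(audio).max() <= 10
    ), "Should be normalized to [-1, 1], 10 for tolerance..."

# Hypothetical usage:
check_audio_range(np.array([0.0, 0.5, -1.0079], dtype=np.float32))   # passes
# check_audio_range(np.array([0.0, -32768.0], dtype=np.float32))     # would raise AssertionError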
@@ -342,7 +342,12 @@ def decode_dataset(
     assert audio.dtype == np.float32, audio.dtype

     # The trained model is using normalized samples
-    assert audio.max() <= 1, "Should be normalized to [-1, 1])"
+    # - this is to avoid sending [-32k,+32k] signal in...
+    # - some lhotse AudioTransform classes can make the signal
+    #   be out of range [-1, 1], hence the tolerance 10
+    assert (
+        np.abs(audio).max() <= 10
+    ), "Should be normalized to [-1, 1], 10 for tolerance..."

     samples = torch.from_numpy(audio).squeeze(0)
@@ -597,12 +597,12 @@ def decode_dataset(
     assert audio.dtype == np.float32, audio.dtype

     # The trained model is using normalized samples
-    if audio.max() > 1:
-        logging.warning(
-            f"The audio should be normalized to [-1, 1], audio.max : {audio.max()}."
-            f"Clipping to [-1, 1]."
-        )
-        audio = np.clip(audio, -1, 1)
+    # - this is to avoid sending [-32k,+32k] signal in...
+    # - some lhotse AudioTransform classes can make the signal
+    #   be out of range [-1, 1], hence the tolerance 10
+    assert (
+        np.abs(audio).max() <= 10
+    ), "Should be normalized to [-1, 1], 10 for tolerance..."

     samples = torch.from_numpy(audio).squeeze(0)
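Unlike the hunk above, which previously warned and clipped out-of-range audio, the new code only asserts. A caller that still has int16-scaled samples would typically normalize them before decoding; a generic sketch (not part of this commit) is:

import numpy as np

def normalize_int16_range(audio: np.ndarray) -> np.ndarray:
    # Scale samples from the int16 range [-32768, 32767] to roughly [-1, 1];
    # a generic illustration, not code from this commit.
    return audio.astype(np.float32) / 32768.0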
@@ -362,7 +362,12 @@ def decode_dataset(
     assert audio.dtype == np.float32, audio.dtype

     # The trained model is using normalized samples
-    assert audio.max() <= 1, "Should be normalized to [-1, 1])"
+    # - this is to avoid sending [-32k,+32k] signal in...
+    # - some lhotse AudioTransform classes can make the signal
+    #   be out of range [-1, 1], hence the tolerance 10
+    assert (
+        np.abs(audio).max() <= 10
+    ), "Should be normalized to [-1, 1], 10 for tolerance..."

     samples = torch.from_numpy(audio).squeeze(0)
@@ -578,7 +578,12 @@ def decode_dataset(
     assert audio.dtype == np.float32, audio.dtype

     # The trained model is using normalized samples
-    assert audio.max() <= 1, "Should be normalized to [-1, 1])"
+    # - this is to avoid sending [-32k,+32k] signal in...
+    # - some lhotse AudioTransform classes can make the signal
+    #   be out of range [-1, 1], hence the tolerance 10
+    assert (
+        np.abs(audio).max() <= 10
+    ), "Should be normalized to [-1, 1], 10 for tolerance..."

     samples = torch.from_numpy(audio).squeeze(0)
@@ -681,8 +681,14 @@ def decode_dataset(
     assert len(audio.shape) == 2
     assert audio.shape[0] == 1, "Should be single channel"
     assert audio.dtype == np.float32, audio.dtype

     # The trained model is using normalized samples
-    assert audio.max() <= 1, "Should be normalized to [-1, 1])"
+    # - this is to avoid sending [-32k,+32k] signal in...
+    # - some lhotse AudioTransform classes can make the signal
+    #   be out of range [-1, 1], hence the tolerance 10
+    assert (
+        np.abs(audio).max() <= 10
+    ), "Should be normalized to [-1, 1], 10 for tolerance..."

     samples = torch.from_numpy(audio).squeeze(0)
     feature = fbank(samples)
@@ -681,8 +681,14 @@ def decode_dataset(
     assert len(audio.shape) == 2
     assert audio.shape[0] == 1, "Should be single channel"
     assert audio.dtype == np.float32, audio.dtype

     # The trained model is using normalized samples
-    assert audio.max() <= 1, "Should be normalized to [-1, 1])"
+    # - this is to avoid sending [-32k,+32k] signal in...
+    # - some lhotse AudioTransform classes can make the signal
+    #   be out of range [-1, 1], hence the tolerance 10
+    assert (
+        np.abs(audio).max() <= 10
+    ), "Should be normalized to [-1, 1], 10 for tolerance..."

     samples = torch.from_numpy(audio).squeeze(0)
     feature = fbank(samples)
@@ -673,8 +673,14 @@ def decode_dataset(
     assert len(audio.shape) == 2
     assert audio.shape[0] == 1, "Should be single channel"
     assert audio.dtype == np.float32, audio.dtype

     # The trained model is using normalized samples
-    assert audio.max() <= 1, "Should be normalized to [-1, 1])"
+    # - this is to avoid sending [-32k,+32k] signal in...
+    # - some lhotse AudioTransform classes can make the signal
+    #   be out of range [-1, 1], hence the tolerance 10
+    assert (
+        np.abs(audio).max() <= 10
+    ), "Should be normalized to [-1, 1], 10 for tolerance..."

     samples = torch.from_numpy(audio).squeeze(0)
     feature = fbank(samples)
@@ -673,8 +673,14 @@ def decode_dataset(
     assert len(audio.shape) == 2
     assert audio.shape[0] == 1, "Should be single channel"
     assert audio.dtype == np.float32, audio.dtype

     # The trained model is using normalized samples
-    assert audio.max() <= 1, "Should be normalized to [-1, 1])"
+    # - this is to avoid sending [-32k,+32k] signal in...
+    # - some lhotse AudioTransform classes can make the signal
+    #   be out of range [-1, 1], hence the tolerance 10
+    assert (
+        np.abs(audio).max() <= 10
+    ), "Should be normalized to [-1, 1], 10 for tolerance..."

     samples = torch.from_numpy(audio).squeeze(0)
     feature = fbank(samples)
@@ -359,7 +359,12 @@ def decode_dataset(
     assert audio.dtype == np.float32, audio.dtype

     # The trained model is using normalized samples
-    assert audio.max() <= 1, "Should be normalized to [-1, 1])"
+    # - this is to avoid sending [-32k,+32k] signal in...
+    # - some lhotse AudioTransform classes can make the signal
+    #   be out of range [-1, 1], hence the tolerance 10
+    assert (
+        np.abs(audio).max() <= 10
+    ), "Should be normalized to [-1, 1], 10 for tolerance..."

     samples = torch.from_numpy(audio).squeeze(0)
@@ -361,7 +361,12 @@ def decode_dataset(
     assert audio.dtype == np.float32, audio.dtype

     # The trained model is using normalized samples
-    assert audio.max() <= 1, "Should be normalized to [-1, 1])"
+    # - this is to avoid sending [-32k,+32k] signal in...
+    # - some lhotse AudioTransform classes can make the signal
+    #   be out of range [-1, 1], hence the tolerance 10
+    assert (
+        np.abs(audio).max() <= 10
+    ), "Should be normalized to [-1, 1], 10 for tolerance..."

     samples = torch.from_numpy(audio).squeeze(0)
@@ -362,7 +362,12 @@ def decode_dataset(
     assert audio.dtype == np.float32, audio.dtype

     # The trained model is using normalized samples
-    assert audio.max() <= 1, "Should be normalized to [-1, 1])"
+    # - this is to avoid sending [-32k,+32k] signal in...
+    # - some lhotse AudioTransform classes can make the signal
+    #   be out of range [-1, 1], hence the tolerance 10
+    assert (
+        np.abs(audio).max() <= 10
+    ), "Should be normalized to [-1, 1], 10 for tolerance..."

     samples = torch.from_numpy(audio).squeeze(0)
@@ -378,7 +378,12 @@ def decode_dataset(
     assert audio.dtype == np.float32, audio.dtype

     # The trained model is using normalized samples
-    assert audio.max() <= 1, "Should be normalized to [-1, 1])"
+    # - this is to avoid sending [-32k,+32k] signal in...
+    # - some lhotse AudioTransform classes can make the signal
+    #   be out of range [-1, 1], hence the tolerance 10
+    assert (
+        np.abs(audio).max() <= 10
+    ), "Should be normalized to [-1, 1], 10 for tolerance..."

     samples = torch.from_numpy(audio).squeeze(0)
@@ -378,7 +378,12 @@ def decode_dataset(
     assert audio.dtype == np.float32, audio.dtype

     # The trained model is using normalized samples
-    assert audio.max() <= 1, "Should be normalized to [-1, 1])"
+    # - this is to avoid sending [-32k,+32k] signal in...
+    # - some lhotse AudioTransform classes can make the signal
+    #   be out of range [-1, 1], hence the tolerance 10
+    assert (
+        np.abs(audio).max() <= 10
+    ), "Should be normalized to [-1, 1], 10 for tolerance..."

     samples = torch.from_numpy(audio).squeeze(0)
@@ -345,7 +345,12 @@ def decode_dataset(
     assert audio.dtype == np.float32, audio.dtype

     # The trained model is using normalized samples
-    assert audio.max() <= 1, "Should be normalized to [-1, 1])"
+    # - this is to avoid sending [-32k,+32k] signal in...
+    # - some lhotse AudioTransform classes can make the signal
+    #   be out of range [-1, 1], hence the tolerance 10
+    assert (
+        np.abs(audio).max() <= 10
+    ), "Should be normalized to [-1, 1], 10 for tolerance..."

     samples = torch.from_numpy(audio).squeeze(0)
@@ -345,7 +345,12 @@ def decode_dataset(
     assert audio.dtype == np.float32, audio.dtype

     # The trained model is using normalized samples
-    assert audio.max() <= 1, "Should be normalized to [-1, 1])"
+    # - this is to avoid sending [-32k,+32k] signal in...
+    # - some lhotse AudioTransform classes can make the signal
+    #   be out of range [-1, 1], hence the tolerance 10
+    assert (
+        np.abs(audio).max() <= 10
+    ), "Should be normalized to [-1, 1], 10 for tolerance..."

     samples = torch.from_numpy(audio).squeeze(0)
@@ -577,7 +577,12 @@ def decode_dataset(
     assert audio.dtype == np.float32, audio.dtype

     # The trained model is using normalized samples
-    assert audio.max() <= 1, "Should be normalized to [-1, 1])"
+    # - this is to avoid sending [-32k,+32k] signal in...
+    # - some lhotse AudioTransform classes can make the signal
+    #   be out of range [-1, 1], hence the tolerance 10
+    assert (
+        np.abs(audio).max() <= 10
+    ), "Should be normalized to [-1, 1], 10 for tolerance..."

     samples = torch.from_numpy(audio).squeeze(0)
@@ -402,6 +402,14 @@ def decode_dataset(
     assert audio.shape[0] == 1, "Should be single channel"
     assert audio.dtype == np.float32, audio.dtype

+    # The trained model is using normalized samples
+    # - this is to avoid sending [-32k,+32k] signal in...
+    # - some lhotse AudioTransform classes can make the signal
+    #   be out of range [-1, 1], hence the tolerance 10
+    assert (
+        np.abs(audio).max() <= 10
+    ), "Should be normalized to [-1, 1], 10 for tolerance..."
+
     samples = torch.from_numpy(audio).squeeze(0)

     fbank = Fbank(opts)
@@ -597,12 +597,12 @@ def decode_dataset(
     assert audio.dtype == np.float32, audio.dtype

     # The trained model is using normalized samples
-    if audio.max() > 1:
-        logging.warning(
-            f"The audio should be normalized to [-1, 1], audio.max : {audio.max()}."
-            f"Clipping to [-1, 1]."
-        )
-        audio = np.clip(audio, -1, 1)
+    # - this is to avoid sending [-32k,+32k] signal in...
+    # - some lhotse AudioTransform classes can make the signal
+    #   be out of range [-1, 1], hence the tolerance 10
+    assert (
+        np.abs(audio).max() <= 10
+    ), "Should be normalized to [-1, 1], 10 for tolerance..."

     samples = torch.from_numpy(audio).squeeze(0)