streaming_decode.py, relax the audio range from [-1,+1] to [-10,+10] (#1448)

- Some lhotse AudioTransform classes produce audio signals outside the range [-1,+1]
   - For example, Resample produced a sample value of 1.0079
   - The range [-10,+10] was chosen so that such signals can still be reliably
     distinguished from an un-normalized [-32k,+32k] signal.
- This is related to: https://github.com/lhotse-speech/lhotse/issues/1254
This commit is contained in:
Karel Vesely 2024-01-05 03:21:27 +01:00 committed by GitHub
parent 8136ad775b
commit 716b82cc3a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
18 changed files with 114 additions and 27 deletions

View File

@ -342,7 +342,12 @@ def decode_dataset(
assert audio.dtype == np.float32, audio.dtype assert audio.dtype == np.float32, audio.dtype
# The trained model is using normalized samples # The trained model is using normalized samples
assert audio.max() <= 1, "Should be normalized to [-1, 1])" # - this is to avoid sending [-32k,+32k] signal in...
# - some lhotse AudioTransform classes can make the signal
# be out of range [-1, 1], hence the tolerance 10
assert (
np.abs(audio).max() <= 10
), "Should be normalized to [-1, 1], 10 for tolerance..."
samples = torch.from_numpy(audio).squeeze(0) samples = torch.from_numpy(audio).squeeze(0)

View File

@ -597,12 +597,12 @@ def decode_dataset(
assert audio.dtype == np.float32, audio.dtype assert audio.dtype == np.float32, audio.dtype
# The trained model is using normalized samples # The trained model is using normalized samples
if audio.max() > 1: # - this is to avoid sending [-32k,+32k] signal in...
logging.warning( # - some lhotse AudioTransform classes can make the signal
f"The audio should be normalized to [-1, 1], audio.max : {audio.max()}." # be out of range [-1, 1], hence the tolerance 10
f"Clipping to [-1, 1]." assert (
) np.abs(audio).max() <= 10
audio = np.clip(audio, -1, 1) ), "Should be normalized to [-1, 1], 10 for tolerance..."
samples = torch.from_numpy(audio).squeeze(0) samples = torch.from_numpy(audio).squeeze(0)

View File

@ -362,7 +362,12 @@ def decode_dataset(
assert audio.dtype == np.float32, audio.dtype assert audio.dtype == np.float32, audio.dtype
# The trained model is using normalized samples # The trained model is using normalized samples
assert audio.max() <= 1, "Should be normalized to [-1, 1])" # - this is to avoid sending [-32k,+32k] signal in...
# - some lhotse AudioTransform classes can make the signal
# be out of range [-1, 1], hence the tolerance 10
assert (
np.abs(audio).max() <= 10
), "Should be normalized to [-1, 1], 10 for tolerance..."
samples = torch.from_numpy(audio).squeeze(0) samples = torch.from_numpy(audio).squeeze(0)

View File

@ -578,7 +578,12 @@ def decode_dataset(
assert audio.dtype == np.float32, audio.dtype assert audio.dtype == np.float32, audio.dtype
# The trained model is using normalized samples # The trained model is using normalized samples
assert audio.max() <= 1, "Should be normalized to [-1, 1])" # - this is to avoid sending [-32k,+32k] signal in...
# - some lhotse AudioTransform classes can make the signal
# be out of range [-1, 1], hence the tolerance 10
assert (
np.abs(audio).max() <= 10
), "Should be normalized to [-1, 1], 10 for tolerance..."
samples = torch.from_numpy(audio).squeeze(0) samples = torch.from_numpy(audio).squeeze(0)

View File

@ -681,8 +681,14 @@ def decode_dataset(
assert len(audio.shape) == 2 assert len(audio.shape) == 2
assert audio.shape[0] == 1, "Should be single channel" assert audio.shape[0] == 1, "Should be single channel"
assert audio.dtype == np.float32, audio.dtype assert audio.dtype == np.float32, audio.dtype
# The trained model is using normalized samples # The trained model is using normalized samples
assert audio.max() <= 1, "Should be normalized to [-1, 1])" # - this is to avoid sending [-32k,+32k] signal in...
# - some lhotse AudioTransform classes can make the signal
# be out of range [-1, 1], hence the tolerance 10
assert (
np.abs(audio).max() <= 10
), "Should be normalized to [-1, 1], 10 for tolerance..."
samples = torch.from_numpy(audio).squeeze(0) samples = torch.from_numpy(audio).squeeze(0)
feature = fbank(samples) feature = fbank(samples)

View File

@ -681,8 +681,14 @@ def decode_dataset(
assert len(audio.shape) == 2 assert len(audio.shape) == 2
assert audio.shape[0] == 1, "Should be single channel" assert audio.shape[0] == 1, "Should be single channel"
assert audio.dtype == np.float32, audio.dtype assert audio.dtype == np.float32, audio.dtype
# The trained model is using normalized samples # The trained model is using normalized samples
assert audio.max() <= 1, "Should be normalized to [-1, 1])" # - this is to avoid sending [-32k,+32k] signal in...
# - some lhotse AudioTransform classes can make the signal
# be out of range [-1, 1], hence the tolerance 10
assert (
np.abs(audio).max() <= 10
), "Should be normalized to [-1, 1], 10 for tolerance..."
samples = torch.from_numpy(audio).squeeze(0) samples = torch.from_numpy(audio).squeeze(0)
feature = fbank(samples) feature = fbank(samples)

View File

@ -673,8 +673,14 @@ def decode_dataset(
assert len(audio.shape) == 2 assert len(audio.shape) == 2
assert audio.shape[0] == 1, "Should be single channel" assert audio.shape[0] == 1, "Should be single channel"
assert audio.dtype == np.float32, audio.dtype assert audio.dtype == np.float32, audio.dtype
# The trained model is using normalized samples # The trained model is using normalized samples
assert audio.max() <= 1, "Should be normalized to [-1, 1])" # - this is to avoid sending [-32k,+32k] signal in...
# - some lhotse AudioTransform classes can make the signal
# be out of range [-1, 1], hence the tolerance 10
assert (
np.abs(audio).max() <= 10
), "Should be normalized to [-1, 1], 10 for tolerance..."
samples = torch.from_numpy(audio).squeeze(0) samples = torch.from_numpy(audio).squeeze(0)
feature = fbank(samples) feature = fbank(samples)

View File

@ -673,8 +673,14 @@ def decode_dataset(
assert len(audio.shape) == 2 assert len(audio.shape) == 2
assert audio.shape[0] == 1, "Should be single channel" assert audio.shape[0] == 1, "Should be single channel"
assert audio.dtype == np.float32, audio.dtype assert audio.dtype == np.float32, audio.dtype
# The trained model is using normalized samples # The trained model is using normalized samples
assert audio.max() <= 1, "Should be normalized to [-1, 1])" # - this is to avoid sending [-32k,+32k] signal in...
# - some lhotse AudioTransform classes can make the signal
# be out of range [-1, 1], hence the tolerance 10
assert (
np.abs(audio).max() <= 10
), "Should be normalized to [-1, 1], 10 for tolerance..."
samples = torch.from_numpy(audio).squeeze(0) samples = torch.from_numpy(audio).squeeze(0)
feature = fbank(samples) feature = fbank(samples)

View File

@ -359,7 +359,12 @@ def decode_dataset(
assert audio.dtype == np.float32, audio.dtype assert audio.dtype == np.float32, audio.dtype
# The trained model is using normalized samples # The trained model is using normalized samples
assert audio.max() <= 1, "Should be normalized to [-1, 1])" # - this is to avoid sending [-32k,+32k] signal in...
# - some lhotse AudioTransform classes can make the signal
# be out of range [-1, 1], hence the tolerance 10
assert (
np.abs(audio).max() <= 10
), "Should be normalized to [-1, 1], 10 for tolerance..."
samples = torch.from_numpy(audio).squeeze(0) samples = torch.from_numpy(audio).squeeze(0)

View File

@ -361,7 +361,12 @@ def decode_dataset(
assert audio.dtype == np.float32, audio.dtype assert audio.dtype == np.float32, audio.dtype
# The trained model is using normalized samples # The trained model is using normalized samples
assert audio.max() <= 1, "Should be normalized to [-1, 1])" # - this is to avoid sending [-32k,+32k] signal in...
# - some lhotse AudioTransform classes can make the signal
# be out of range [-1, 1], hence the tolerance 10
assert (
np.abs(audio).max() <= 10
), "Should be normalized to [-1, 1], 10 for tolerance..."
samples = torch.from_numpy(audio).squeeze(0) samples = torch.from_numpy(audio).squeeze(0)

View File

@ -362,7 +362,12 @@ def decode_dataset(
assert audio.dtype == np.float32, audio.dtype assert audio.dtype == np.float32, audio.dtype
# The trained model is using normalized samples # The trained model is using normalized samples
assert audio.max() <= 1, "Should be normalized to [-1, 1])" # - this is to avoid sending [-32k,+32k] signal in...
# - some lhotse AudioTransform classes can make the signal
# be out of range [-1, 1], hence the tolerance 10
assert (
np.abs(audio).max() <= 10
), "Should be normalized to [-1, 1], 10 for tolerance..."
samples = torch.from_numpy(audio).squeeze(0) samples = torch.from_numpy(audio).squeeze(0)

View File

@ -378,7 +378,12 @@ def decode_dataset(
assert audio.dtype == np.float32, audio.dtype assert audio.dtype == np.float32, audio.dtype
# The trained model is using normalized samples # The trained model is using normalized samples
assert audio.max() <= 1, "Should be normalized to [-1, 1])" # - this is to avoid sending [-32k,+32k] signal in...
# - some lhotse AudioTransform classes can make the signal
# be out of range [-1, 1], hence the tolerance 10
assert (
np.abs(audio).max() <= 10
), "Should be normalized to [-1, 1], 10 for tolerance..."
samples = torch.from_numpy(audio).squeeze(0) samples = torch.from_numpy(audio).squeeze(0)

View File

@ -378,7 +378,12 @@ def decode_dataset(
assert audio.dtype == np.float32, audio.dtype assert audio.dtype == np.float32, audio.dtype
# The trained model is using normalized samples # The trained model is using normalized samples
assert audio.max() <= 1, "Should be normalized to [-1, 1])" # - this is to avoid sending [-32k,+32k] signal in...
# - some lhotse AudioTransform classes can make the signal
# be out of range [-1, 1], hence the tolerance 10
assert (
np.abs(audio).max() <= 10
), "Should be normalized to [-1, 1], 10 for tolerance..."
samples = torch.from_numpy(audio).squeeze(0) samples = torch.from_numpy(audio).squeeze(0)

View File

@ -345,7 +345,12 @@ def decode_dataset(
assert audio.dtype == np.float32, audio.dtype assert audio.dtype == np.float32, audio.dtype
# The trained model is using normalized samples # The trained model is using normalized samples
assert audio.max() <= 1, "Should be normalized to [-1, 1])" # - this is to avoid sending [-32k,+32k] signal in...
# - some lhotse AudioTransform classes can make the signal
# be out of range [-1, 1], hence the tolerance 10
assert (
np.abs(audio).max() <= 10
), "Should be normalized to [-1, 1], 10 for tolerance..."
samples = torch.from_numpy(audio).squeeze(0) samples = torch.from_numpy(audio).squeeze(0)

View File

@ -345,7 +345,12 @@ def decode_dataset(
assert audio.dtype == np.float32, audio.dtype assert audio.dtype == np.float32, audio.dtype
# The trained model is using normalized samples # The trained model is using normalized samples
assert audio.max() <= 1, "Should be normalized to [-1, 1])" # - this is to avoid sending [-32k,+32k] signal in...
# - some lhotse AudioTransform classes can make the signal
# be out of range [-1, 1], hence the tolerance 10
assert (
np.abs(audio).max() <= 10
), "Should be normalized to [-1, 1], 10 for tolerance..."
samples = torch.from_numpy(audio).squeeze(0) samples = torch.from_numpy(audio).squeeze(0)

View File

@ -577,7 +577,12 @@ def decode_dataset(
assert audio.dtype == np.float32, audio.dtype assert audio.dtype == np.float32, audio.dtype
# The trained model is using normalized samples # The trained model is using normalized samples
assert audio.max() <= 1, "Should be normalized to [-1, 1])" # - this is to avoid sending [-32k,+32k] signal in...
# - some lhotse AudioTransform classes can make the signal
# be out of range [-1, 1], hence the tolerance 10
assert (
np.abs(audio).max() <= 10
), "Should be normalized to [-1, 1], 10 for tolerance..."
samples = torch.from_numpy(audio).squeeze(0) samples = torch.from_numpy(audio).squeeze(0)

View File

@ -402,6 +402,14 @@ def decode_dataset(
assert audio.shape[0] == 1, "Should be single channel" assert audio.shape[0] == 1, "Should be single channel"
assert audio.dtype == np.float32, audio.dtype assert audio.dtype == np.float32, audio.dtype
# The trained model is using normalized samples
# - this is to avoid sending [-32k,+32k] signal in...
# - some lhotse AudioTransform classes can make the signal
# be out of range [-1, 1], hence the tolerance 10
assert (
np.abs(audio).max() <= 10
), "Should be normalized to [-1, 1], 10 for tolerance..."
samples = torch.from_numpy(audio).squeeze(0) samples = torch.from_numpy(audio).squeeze(0)
fbank = Fbank(opts) fbank = Fbank(opts)

View File

@ -597,12 +597,12 @@ def decode_dataset(
assert audio.dtype == np.float32, audio.dtype assert audio.dtype == np.float32, audio.dtype
# The trained model is using normalized samples # The trained model is using normalized samples
if audio.max() > 1: # - this is to avoid sending [-32k,+32k] signal in...
logging.warning( # - some lhotse AudioTransform classes can make the signal
f"The audio should be normalized to [-1, 1], audio.max : {audio.max()}." # be out of range [-1, 1], hence the tolerance 10
f"Clipping to [-1, 1]." assert (
) np.abs(audio).max() <= 10
audio = np.clip(audio, -1, 1) ), "Should be normalized to [-1, 1], 10 for tolerance..."
samples = torch.from_numpy(audio).squeeze(0) samples = torch.from_numpy(audio).squeeze(0)