mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-12-11 06:55:27 +00:00
Support longer input for the offline model
This commit is contained in:
parent
acbc4b5808
commit
4cb2395186
@ -74,7 +74,7 @@ from decoder import Decoder
|
|||||||
from onnxruntime.quantization import QuantType, quantize_dynamic
|
from onnxruntime.quantization import QuantType, quantize_dynamic
|
||||||
from scaling_converter import convert_scaled_to_non_scaled
|
from scaling_converter import convert_scaled_to_non_scaled
|
||||||
from train import add_model_arguments, get_params, get_transducer_model
|
from train import add_model_arguments, get_params, get_transducer_model
|
||||||
from zipformer import Zipformer2
|
from zipformer import Zipformer2, CompactRelPositionalEncoding
|
||||||
|
|
||||||
from icefall.checkpoint import (
|
from icefall.checkpoint import (
|
||||||
average_checkpoints,
|
average_checkpoints,
|
||||||
@ -296,6 +296,19 @@ def export_encoder_model_onnx(
|
|||||||
x = torch.zeros(1, 100, 80, dtype=torch.float32)
|
x = torch.zeros(1, 100, 80, dtype=torch.float32)
|
||||||
x_lens = torch.tensor([100], dtype=torch.int64)
|
x_lens = torch.tensor([100], dtype=torch.int64)
|
||||||
|
|
||||||
|
# It assumes that the maximum input, after downsampling, won't have more
|
||||||
|
# than 10k frames.
|
||||||
|
# The first downsampling factor is 2, so the maximum input
|
||||||
|
# should contain less than 20k frames, e.g., less than 200 seconds,
|
||||||
|
# i.e., 3.3 minutes
|
||||||
|
#
|
||||||
|
# Note: If you want to handle a longer input audio, please increase this
|
||||||
|
# value. The downside is that it will increase the size of the model.
|
||||||
|
max_len = 10000
|
||||||
|
for name, m in encoder_model.named_modules():
|
||||||
|
if isinstance(m, CompactRelPositionalEncoding):
|
||||||
|
m.extend_pe(torch.tensor(0.0).expand(max_len))
|
||||||
|
|
||||||
torch.onnx.export(
|
torch.onnx.export(
|
||||||
encoder_model,
|
encoder_model,
|
||||||
(x, x_lens),
|
(x, x_lens),
|
||||||
|
|||||||
@ -1305,12 +1305,6 @@ class CompactRelPositionalEncoding(torch.nn.Module):
|
|||||||
) -> None:
|
) -> None:
|
||||||
"""Construct a CompactRelPositionalEncoding object."""
|
"""Construct a CompactRelPositionalEncoding object."""
|
||||||
super(CompactRelPositionalEncoding, self).__init__()
|
super(CompactRelPositionalEncoding, self).__init__()
|
||||||
if torch.jit.is_tracing:
|
|
||||||
# 10k frames correspond to ~100k ms, e.g., 100 seconds, i.e.,
|
|
||||||
# It assumes that the maximum input won't have more than
|
|
||||||
# 10k frames.
|
|
||||||
#
|
|
||||||
max_len = 10000
|
|
||||||
self.embed_dim = embed_dim
|
self.embed_dim = embed_dim
|
||||||
assert embed_dim % 2 == 0
|
assert embed_dim % 2 == 0
|
||||||
self.dropout = Dropout2(dropout_rate)
|
self.dropout = Dropout2(dropout_rate)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user