diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/decode.py b/egs/librispeech/ASR/lstm_transducer_stateless/decode.py index f7e3677da..00e906691 100755 --- a/egs/librispeech/ASR/lstm_transducer_stateless/decode.py +++ b/egs/librispeech/ASR/lstm_transducer_stateless/decode.py @@ -354,12 +354,12 @@ def decode_one_batch( supervisions = batch["supervisions"] feature_lens = supervisions["num_frames"].to(device) - feature_lens += params.left_context - feature = torch.nn.functional.pad( - feature, - pad=(0, 0, 0, params.left_context), - value=LOG_EPS, - ) + # feature_lens += params.left_context + # feature = torch.nn.functional.pad( + # feature, + # pad=(0, 0, 0, params.left_context), + # value=LOG_EPS, + # ) encoder_out, encoder_out_lens = model.encoder( x=feature, x_lens=feature_lens @@ -668,11 +668,6 @@ def main(): params.unk_id = sp.piece_to_id("") params.vocab_size = sp.get_piece_size() - if params.simulate_streaming: - assert ( - params.causal_convolution - ), "Decoding in streaming requires causal convolution" - logging.info(params) logging.info("About to create model") diff --git a/egs/librispeech/ASR/pruned_transducer_stateless2/scaling.py b/egs/librispeech/ASR/pruned_transducer_stateless2/scaling.py index 65c71ab2e..5f0785d91 100644 --- a/egs/librispeech/ASR/pruned_transducer_stateless2/scaling.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless2/scaling.py @@ -409,8 +409,7 @@ class ScaledLSTM(nn.LSTM): def _reset_parameters(self, initial_speed: float): std = 0.1 / initial_speed a = (3 ** 0.5) * std - fan_in = self.input_size - scale = fan_in ** -0.5 + scale = self.hidden_size ** -0.5 v = scale / std for idx, name in enumerate(self._flat_weights_names): if "weight" in name: