From 5ae80dfca7dab8f6faf99237effc17624f4e15de Mon Sep 17 00:00:00 2001 From: Wei Kang Date: Thu, 27 Jan 2022 18:01:17 +0800 Subject: [PATCH] Minor fixes (#193) --- egs/aishell/ASR/transducer_stateless/decoder.py | 16 ++++++++-------- egs/aishell/ASR/transducer_stateless/export.py | 9 +++++---- .../ASR/transducer_stateless/pretrained.py | 17 +++++++++-------- egs/aishell/ASR/transducer_stateless/train.py | 8 ++++---- .../ASR/transducer_stateless/decoder.py | 16 ++++++++-------- .../ASR/transducer_stateless/export.py | 9 +++++---- .../ASR/transducer_stateless/pretrained.py | 9 +++++---- .../ASR/transducer_stateless/train.py | 8 ++++---- 8 files changed, 48 insertions(+), 44 deletions(-) diff --git a/egs/aishell/ASR/transducer_stateless/decoder.py b/egs/aishell/ASR/transducer_stateless/decoder.py index dca084477..c2c6552a9 100644 --- a/egs/aishell/ASR/transducer_stateless/decoder.py +++ b/egs/aishell/ASR/transducer_stateless/decoder.py @@ -82,17 +82,17 @@ class Decoder(nn.Module): Returns: Return a tensor of shape (N, U, embedding_dim). """ - embeding_out = self.embedding(y) + embedding_out = self.embedding(y) if self.context_size > 1: - embeding_out = embeding_out.permute(0, 2, 1) + embedding_out = embedding_out.permute(0, 2, 1) if need_pad is True: - embeding_out = F.pad( - embeding_out, pad=(self.context_size - 1, 0) + embedding_out = F.pad( + embedding_out, pad=(self.context_size - 1, 0) ) else: # During inference time, there is no need to do extra padding # as we only need one output - assert embeding_out.size(-1) == self.context_size - embeding_out = self.conv(embeding_out) - embeding_out = embeding_out.permute(0, 2, 1) - return embeding_out + assert embedding_out.size(-1) == self.context_size + embedding_out = self.conv(embedding_out) + embedding_out = embedding_out.permute(0, 2, 1) + return embedding_out diff --git a/egs/aishell/ASR/transducer_stateless/export.py b/egs/aishell/ASR/transducer_stateless/export.py index 641555bdb..5687260df 100755 --- a/egs/aishell/ASR/transducer_stateless/export.py +++ b/egs/aishell/ASR/transducer_stateless/export.py @@ -48,6 +48,7 @@ from pathlib import Path import sentencepiece as spm import torch +import torch.nn as nn from conformer import Conformer from decoder import Decoder from joiner import Joiner @@ -133,7 +134,7 @@ def get_params() -> AttributeDict: return params -def get_encoder_model(params: AttributeDict): +def get_encoder_model(params: AttributeDict) -> nn.Module: encoder = Conformer( num_features=params.feature_dim, output_dim=params.encoder_out_dim, @@ -147,7 +148,7 @@ def get_encoder_model(params: AttributeDict): return encoder -def get_decoder_model(params: AttributeDict): +def get_decoder_model(params: AttributeDict) -> nn.Module: decoder = Decoder( vocab_size=params.vocab_size, embedding_dim=params.encoder_out_dim, @@ -157,7 +158,7 @@ def get_decoder_model(params: AttributeDict): return decoder -def get_joiner_model(params: AttributeDict): +def get_joiner_model(params: AttributeDict) -> nn.Module: joiner = Joiner( input_dim=params.encoder_out_dim, output_dim=params.vocab_size, @@ -165,7 +166,7 @@ def get_joiner_model(params: AttributeDict): return joiner -def get_transducer_model(params: AttributeDict): +def get_transducer_model(params: AttributeDict) -> nn.Module: encoder = get_encoder_model(params) decoder = get_decoder_model(params) joiner = get_joiner_model(params) diff --git a/egs/aishell/ASR/transducer_stateless/pretrained.py b/egs/aishell/ASR/transducer_stateless/pretrained.py index 65ac5f3ff..db89c4d67 100755 --- a/egs/aishell/ASR/transducer_stateless/pretrained.py +++ b/egs/aishell/ASR/transducer_stateless/pretrained.py @@ -44,11 +44,12 @@ Note: ./transducer_stateless/exp/pretrained.pt is generated by import argparse import logging import math -from typing import List from pathlib import Path +from typing import List import kaldifeat import torch +import torch.nn as nn import torchaudio from beam_search import beam_search, greedy_search from conformer import Conformer @@ -57,10 +58,10 @@ from joiner import Joiner from model import Transducer from torch.nn.utils.rnn import pad_sequence -from icefall.env import get_env_info -from icefall.utils import AttributeDict -from icefall.lexicon import Lexicon from icefall.char_graph_compiler import CharCtcTrainingGraphCompiler +from icefall.env import get_env_info +from icefall.lexicon import Lexicon +from icefall.utils import AttributeDict def get_parser(): @@ -150,7 +151,7 @@ def get_params() -> AttributeDict: return params -def get_encoder_model(params: AttributeDict): +def get_encoder_model(params: AttributeDict) -> nn.Module: encoder = Conformer( num_features=params.feature_dim, output_dim=params.encoder_out_dim, @@ -164,7 +165,7 @@ def get_encoder_model(params: AttributeDict): return encoder -def get_decoder_model(params: AttributeDict): +def get_decoder_model(params: AttributeDict) -> nn.Module: decoder = Decoder( vocab_size=params.vocab_size, embedding_dim=params.encoder_out_dim, @@ -174,7 +175,7 @@ def get_decoder_model(params: AttributeDict): return decoder -def get_joiner_model(params: AttributeDict): +def get_joiner_model(params: AttributeDict) -> nn.Module: joiner = Joiner( input_dim=params.encoder_out_dim, output_dim=params.vocab_size, @@ -182,7 +183,7 @@ def get_joiner_model(params: AttributeDict): return joiner -def get_transducer_model(params: AttributeDict): +def get_transducer_model(params: AttributeDict) -> nn.Module: encoder = get_encoder_model(params) decoder = get_decoder_model(params) joiner = get_joiner_model(params) diff --git a/egs/aishell/ASR/transducer_stateless/train.py b/egs/aishell/ASR/transducer_stateless/train.py index 7da8e28a1..0c180b260 100755 --- a/egs/aishell/ASR/transducer_stateless/train.py +++ b/egs/aishell/ASR/transducer_stateless/train.py @@ -204,7 +204,7 @@ def get_params() -> AttributeDict: return params -def get_encoder_model(params: AttributeDict): +def get_encoder_model(params: AttributeDict) -> nn.Module: # TODO: We can add an option to switch between Conformer and Transformer encoder = Conformer( num_features=params.feature_dim, @@ -219,7 +219,7 @@ def get_encoder_model(params: AttributeDict): return encoder -def get_decoder_model(params: AttributeDict): +def get_decoder_model(params: AttributeDict) -> nn.Module: decoder = Decoder( vocab_size=params.vocab_size, embedding_dim=params.encoder_out_dim, @@ -229,7 +229,7 @@ def get_decoder_model(params: AttributeDict): return decoder -def get_joiner_model(params: AttributeDict): +def get_joiner_model(params: AttributeDict) -> nn.Module: joiner = Joiner( input_dim=params.encoder_out_dim, output_dim=params.vocab_size, @@ -237,7 +237,7 @@ def get_joiner_model(params: AttributeDict): return joiner -def get_transducer_model(params: AttributeDict): +def get_transducer_model(params: AttributeDict) -> nn.Module: encoder = get_encoder_model(params) decoder = get_decoder_model(params) joiner = get_joiner_model(params) diff --git a/egs/librispeech/ASR/transducer_stateless/decoder.py b/egs/librispeech/ASR/transducer_stateless/decoder.py index dca084477..c2c6552a9 100644 --- a/egs/librispeech/ASR/transducer_stateless/decoder.py +++ b/egs/librispeech/ASR/transducer_stateless/decoder.py @@ -82,17 +82,17 @@ class Decoder(nn.Module): Returns: Return a tensor of shape (N, U, embedding_dim). """ - embeding_out = self.embedding(y) + embedding_out = self.embedding(y) if self.context_size > 1: - embeding_out = embeding_out.permute(0, 2, 1) + embedding_out = embedding_out.permute(0, 2, 1) if need_pad is True: - embeding_out = F.pad( - embeding_out, pad=(self.context_size - 1, 0) + embedding_out = F.pad( + embedding_out, pad=(self.context_size - 1, 0) ) else: # During inference time, there is no need to do extra padding # as we only need one output - assert embeding_out.size(-1) == self.context_size - embeding_out = self.conv(embeding_out) - embeding_out = embeding_out.permute(0, 2, 1) - return embeding_out + assert embedding_out.size(-1) == self.context_size + embedding_out = self.conv(embedding_out) + embedding_out = embedding_out.permute(0, 2, 1) + return embedding_out diff --git a/egs/librispeech/ASR/transducer_stateless/export.py b/egs/librispeech/ASR/transducer_stateless/export.py index 641555bdb..5687260df 100755 --- a/egs/librispeech/ASR/transducer_stateless/export.py +++ b/egs/librispeech/ASR/transducer_stateless/export.py @@ -48,6 +48,7 @@ from pathlib import Path import sentencepiece as spm import torch +import torch.nn as nn from conformer import Conformer from decoder import Decoder from joiner import Joiner @@ -133,7 +134,7 @@ def get_params() -> AttributeDict: return params -def get_encoder_model(params: AttributeDict): +def get_encoder_model(params: AttributeDict) -> nn.Module: encoder = Conformer( num_features=params.feature_dim, output_dim=params.encoder_out_dim, @@ -147,7 +148,7 @@ def get_encoder_model(params: AttributeDict): return encoder -def get_decoder_model(params: AttributeDict): +def get_decoder_model(params: AttributeDict) -> nn.Module: decoder = Decoder( vocab_size=params.vocab_size, embedding_dim=params.encoder_out_dim, @@ -157,7 +158,7 @@ def get_decoder_model(params: AttributeDict): return decoder -def get_joiner_model(params: AttributeDict): +def get_joiner_model(params: AttributeDict) -> nn.Module: joiner = Joiner( input_dim=params.encoder_out_dim, output_dim=params.vocab_size, @@ -165,7 +166,7 @@ def get_joiner_model(params: AttributeDict): return joiner -def get_transducer_model(params: AttributeDict): +def get_transducer_model(params: AttributeDict) -> nn.Module: encoder = get_encoder_model(params) decoder = get_decoder_model(params) joiner = get_joiner_model(params) diff --git a/egs/librispeech/ASR/transducer_stateless/pretrained.py b/egs/librispeech/ASR/transducer_stateless/pretrained.py index e5dba8f0e..c248de777 100755 --- a/egs/librispeech/ASR/transducer_stateless/pretrained.py +++ b/egs/librispeech/ASR/transducer_stateless/pretrained.py @@ -49,6 +49,7 @@ from typing import List import kaldifeat import sentencepiece as spm import torch +import torch.nn as nn import torchaudio from beam_search import beam_search, greedy_search from conformer import Conformer @@ -148,7 +149,7 @@ def get_params() -> AttributeDict: return params -def get_encoder_model(params: AttributeDict): +def get_encoder_model(params: AttributeDict) -> nn.Module: encoder = Conformer( num_features=params.feature_dim, output_dim=params.encoder_out_dim, @@ -162,7 +163,7 @@ def get_encoder_model(params: AttributeDict): return encoder -def get_decoder_model(params: AttributeDict): +def get_decoder_model(params: AttributeDict) -> nn.Module: decoder = Decoder( vocab_size=params.vocab_size, embedding_dim=params.encoder_out_dim, @@ -172,7 +173,7 @@ def get_decoder_model(params: AttributeDict): return decoder -def get_joiner_model(params: AttributeDict): +def get_joiner_model(params: AttributeDict) -> nn.Module: joiner = Joiner( input_dim=params.encoder_out_dim, output_dim=params.vocab_size, @@ -180,7 +181,7 @@ def get_joiner_model(params: AttributeDict): return joiner -def get_transducer_model(params: AttributeDict): +def get_transducer_model(params: AttributeDict) -> nn.Module: encoder = get_encoder_model(params) decoder = get_decoder_model(params) joiner = get_joiner_model(params) diff --git a/egs/librispeech/ASR/transducer_stateless/train.py b/egs/librispeech/ASR/transducer_stateless/train.py index 694ebf1d5..950a88a35 100755 --- a/egs/librispeech/ASR/transducer_stateless/train.py +++ b/egs/librispeech/ASR/transducer_stateless/train.py @@ -213,7 +213,7 @@ def get_params() -> AttributeDict: return params -def get_encoder_model(params: AttributeDict): +def get_encoder_model(params: AttributeDict) -> nn.Module: # TODO: We can add an option to switch between Conformer and Transformer encoder = Conformer( num_features=params.feature_dim, @@ -228,7 +228,7 @@ def get_encoder_model(params: AttributeDict): return encoder -def get_decoder_model(params: AttributeDict): +def get_decoder_model(params: AttributeDict) -> nn.Module: decoder = Decoder( vocab_size=params.vocab_size, embedding_dim=params.encoder_out_dim, @@ -238,7 +238,7 @@ def get_decoder_model(params: AttributeDict): return decoder -def get_joiner_model(params: AttributeDict): +def get_joiner_model(params: AttributeDict) -> nn.Module: joiner = Joiner( input_dim=params.encoder_out_dim, output_dim=params.vocab_size, @@ -246,7 +246,7 @@ def get_joiner_model(params: AttributeDict): return joiner -def get_transducer_model(params: AttributeDict): +def get_transducer_model(params: AttributeDict) -> nn.Module: encoder = get_encoder_model(params) decoder = get_decoder_model(params) joiner = get_joiner_model(params)