From 9d0d5d19fb3ead2ac07d93abd6100ec5e123a756 Mon Sep 17 00:00:00 2001
From: Fangjun Kuang
Date: Sat, 18 Dec 2021 11:22:12 +0800
Subject: [PATCH] Remove sos ID.

---
 egs/librispeech/ASR/transducer/beam_search.py      | 3 +--
 egs/librispeech/ASR/transducer/decode.py           | 2 --
 egs/librispeech/ASR/transducer/decoder.py          | 4 ----
 egs/librispeech/ASR/transducer/export.py           | 2 --
 egs/librispeech/ASR/transducer/model.py            | 6 ++----
 egs/librispeech/ASR/transducer/pretrained.py       | 2 --
 egs/librispeech/ASR/transducer/test_conformer.py   | 1 -
 egs/librispeech/ASR/transducer/test_decoder.py     | 2 --
 egs/librispeech/ASR/transducer/test_transducer.py  | 3 ---
 egs/librispeech/ASR/transducer/test_transformer.py | 1 -
 egs/librispeech/ASR/transducer/train.py            | 2 --
 11 files changed, 3 insertions(+), 25 deletions(-)

diff --git a/egs/librispeech/ASR/transducer/beam_search.py b/egs/librispeech/ASR/transducer/beam_search.py
index dfc22fcf8..f45d06ce9 100644
--- a/egs/librispeech/ASR/transducer/beam_search.py
+++ b/egs/librispeech/ASR/transducer/beam_search.py
@@ -111,7 +111,6 @@ def beam_search(
     # support only batch_size == 1 for now
     assert encoder_out.size(0) == 1, encoder_out.size(0)
     blank_id = model.decoder.blank_id
-    sos_id = model.decoder.sos_id
     device = model.device
 
     sos = torch.tensor([blank_id], device=device).reshape(1, 1)
@@ -192,7 +191,7 @@
 
         # Second, choose other labels
        for i, v in enumerate(log_prob.tolist()):
-            if i in (blank_id, sos_id):
+            if i == blank_id:
                 continue
             new_ys = y_star.ys + [i]
             new_log_prob = y_star.log_prob + v
diff --git a/egs/librispeech/ASR/transducer/decode.py b/egs/librispeech/ASR/transducer/decode.py
index 93f7d8ec3..eeceb6db2 100755
--- a/egs/librispeech/ASR/transducer/decode.py
+++ b/egs/librispeech/ASR/transducer/decode.py
@@ -159,7 +159,6 @@ def get_decoder_model(params: AttributeDict):
         vocab_size=params.vocab_size,
         embedding_dim=params.decoder_embedding_dim,
         blank_id=params.blank_id,
-        sos_id=params.sos_id,
         num_layers=params.num_decoder_layers,
         hidden_dim=params.decoder_hidden_dim,
         output_dim=params.encoder_out_dim,
@@ -399,7 +398,6 @@ def main():
 
     # <blk> and <sos/eos> are defined in local/train_bpe_model.py
     params.blank_id = sp.piece_to_id("<blk>")
-    params.sos_id = sp.piece_to_id("<sos/eos>")
     params.vocab_size = sp.get_piece_size()
 
     logging.info(params)
diff --git a/egs/librispeech/ASR/transducer/decoder.py b/egs/librispeech/ASR/transducer/decoder.py
index 2f6bf4c07..7b529ac19 100644
--- a/egs/librispeech/ASR/transducer/decoder.py
+++ b/egs/librispeech/ASR/transducer/decoder.py
@@ -27,7 +27,6 @@ class Decoder(nn.Module):
         vocab_size: int,
         embedding_dim: int,
         blank_id: int,
-        sos_id: int,
         num_layers: int,
         hidden_dim: int,
         output_dim: int,
@@ -42,8 +41,6 @@ class Decoder(nn.Module):
             Dimension of the input embedding.
           blank_id:
             The ID of the blank symbol.
-          sos_id:
-            The ID of the SOS symbol.
           num_layers:
             Number of LSTM layers.
           hidden_dim:
@@ -71,7 +68,6 @@ class Decoder(nn.Module):
             dropout=rnn_dropout,
         )
         self.blank_id = blank_id
-        self.sos_id = sos_id
         self.output_linear = nn.Linear(hidden_dim, output_dim)
 
     def forward(
diff --git a/egs/librispeech/ASR/transducer/export.py b/egs/librispeech/ASR/transducer/export.py
index 785bc1986..819d2e31d 100755
--- a/egs/librispeech/ASR/transducer/export.py
+++ b/egs/librispeech/ASR/transducer/export.py
@@ -148,7 +148,6 @@ def get_decoder_model(params: AttributeDict):
         vocab_size=params.vocab_size,
         embedding_dim=params.decoder_embedding_dim,
         blank_id=params.blank_id,
-        sos_id=params.sos_id,
         num_layers=params.num_decoder_layers,
         hidden_dim=params.decoder_hidden_dim,
         output_dim=params.encoder_out_dim,
@@ -197,7 +196,6 @@ def main():
 
     # <blk> and <sos/eos> are defined in local/train_bpe_model.py
     params.blank_id = sp.piece_to_id("<blk>")
-    params.sos_id = sp.piece_to_id("<sos/eos>")
     params.vocab_size = sp.get_piece_size()
 
     logging.info(params)
diff --git a/egs/librispeech/ASR/transducer/model.py b/egs/librispeech/ASR/transducer/model.py
index 8a4d3ca69..765ed5de8 100644
--- a/egs/librispeech/ASR/transducer/model.py
+++ b/egs/librispeech/ASR/transducer/model.py
@@ -54,7 +54,7 @@ class Transducer(nn.Module):
           decoder:
             It is the prediction network in the paper. Its input shape
            is (N, U) and its output shape is (N, U, C). It should contain
-            two attributes: `blank_id` and `sos_id`.
+            one attribute: `blank_id`.
           joiner:
             It has two inputs with shapes: (N, T, C) and (N, U, C). Its
             output shape is (N, T, U, C). Note that its output contains
@@ -63,7 +63,6 @@ class Transducer(nn.Module):
         super().__init__()
         assert isinstance(encoder, EncoderInterface)
         assert hasattr(decoder, "blank_id")
-        assert hasattr(decoder, "sos_id")
 
         self.encoder = encoder
         self.decoder = decoder
@@ -102,8 +101,7 @@
         y_lens = row_splits[1:] - row_splits[:-1]
 
         blank_id = self.decoder.blank_id
-        sos_id = self.decoder.sos_id
-        sos_y = add_sos(y, sos_id=sos_id)
+        sos_y = add_sos(y, sos_id=blank_id)
 
         sos_y_padded = sos_y.pad(mode="constant", padding_value=blank_id)
 
diff --git a/egs/librispeech/ASR/transducer/pretrained.py b/egs/librispeech/ASR/transducer/pretrained.py
index ef5d878a5..2374de311 100755
--- a/egs/librispeech/ASR/transducer/pretrained.py
+++ b/egs/librispeech/ASR/transducer/pretrained.py
@@ -145,7 +145,6 @@ def get_decoder_model(params: AttributeDict):
         vocab_size=params.vocab_size,
         embedding_dim=params.decoder_embedding_dim,
         blank_id=params.blank_id,
-        sos_id=params.sos_id,
         num_layers=params.num_decoder_layers,
         hidden_dim=params.decoder_hidden_dim,
         output_dim=params.encoder_out_dim,
@@ -211,7 +210,6 @@ def main():
 
     # <blk> and <sos/eos> are defined in local/train_bpe_model.py
     params.blank_id = sp.piece_to_id("<blk>")
-    params.sos_id = sp.piece_to_id("<sos/eos>")
     params.vocab_size = sp.get_piece_size()
 
     logging.info(f"{params}")
diff --git a/egs/librispeech/ASR/transducer/test_conformer.py b/egs/librispeech/ASR/transducer/test_conformer.py
index 5d941d98a..9529e9c59 100755
--- a/egs/librispeech/ASR/transducer/test_conformer.py
+++ b/egs/librispeech/ASR/transducer/test_conformer.py
@@ -36,7 +36,6 @@ def test_conformer():
         nhead=8,
         dim_feedforward=2048,
         num_encoder_layers=12,
-        use_feat_batchnorm=True,
     )
     N = 3
     T = 100
diff --git a/egs/librispeech/ASR/transducer/test_decoder.py b/egs/librispeech/ASR/transducer/test_decoder.py
index 44c6eb6db..f0a7aa9cc 100755
--- a/egs/librispeech/ASR/transducer/test_decoder.py
+++ b/egs/librispeech/ASR/transducer/test_decoder.py
@@ -29,7 +29,6 @@ from decoder import Decoder
 def test_decoder():
     vocab_size = 3
     blank_id = 0
-    sos_id = 2
     embedding_dim = 128
     num_layers = 2
     hidden_dim = 6
@@ -41,7 +40,6 @@
         vocab_size=vocab_size,
         embedding_dim=embedding_dim,
         blank_id=blank_id,
-        sos_id=sos_id,
         num_layers=num_layers,
         hidden_dim=hidden_dim,
         output_dim=output_dim,
diff --git a/egs/librispeech/ASR/transducer/test_transducer.py b/egs/librispeech/ASR/transducer/test_transducer.py
index bd4f2c188..15aa3b330 100755
--- a/egs/librispeech/ASR/transducer/test_transducer.py
+++ b/egs/librispeech/ASR/transducer/test_transducer.py
@@ -39,7 +39,6 @@ def test_transducer():
     # decoder params
     vocab_size = 3
     blank_id = 0
-    sos_id = 2
     embedding_dim = 128
     num_layers = 2
 
@@ -51,14 +50,12 @@
         nhead=8,
         dim_feedforward=2048,
         num_encoder_layers=12,
-        use_feat_batchnorm=True,
     )
 
     decoder = Decoder(
         vocab_size=vocab_size,
         embedding_dim=embedding_dim,
         blank_id=blank_id,
-        sos_id=sos_id,
         num_layers=num_layers,
         hidden_dim=output_dim,
         output_dim=output_dim,
diff --git a/egs/librispeech/ASR/transducer/test_transformer.py b/egs/librispeech/ASR/transducer/test_transformer.py
index 8f4585504..bb68c22be 100755
--- a/egs/librispeech/ASR/transducer/test_transformer.py
+++ b/egs/librispeech/ASR/transducer/test_transformer.py
@@ -36,7 +36,6 @@ def test_transformer():
         nhead=8,
         dim_feedforward=2048,
         num_encoder_layers=12,
-        use_feat_batchnorm=True,
     )
     N = 3
     T = 100
diff --git a/egs/librispeech/ASR/transducer/train.py b/egs/librispeech/ASR/transducer/train.py
index 946ad2c20..2ce2fadaa 100755
--- a/egs/librispeech/ASR/transducer/train.py
+++ b/egs/librispeech/ASR/transducer/train.py
@@ -229,7 +229,6 @@ def get_decoder_model(params: AttributeDict):
         vocab_size=params.vocab_size,
         embedding_dim=params.decoder_embedding_dim,
         blank_id=params.blank_id,
-        sos_id=params.sos_id,
         num_layers=params.num_decoder_layers,
         hidden_dim=params.decoder_hidden_dim,
         output_dim=params.encoder_out_dim,
@@ -567,7 +566,6 @@ def run(rank, world_size, args):
 
     # <blk> and <sos/eos> are defined in local/train_bpe_model.py
     params.blank_id = sp.piece_to_id("<blk>")
-    params.sos_id = sp.piece_to_id("<sos/eos>")
     params.vocab_size = sp.get_piece_size()
 
     logging.info(params)
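
Note appended after the diff (not part of the patch itself): a minimal Python sketch of what the change amounts to, assuming blank_id == 0 for the <blk> piece produced by local/train_bpe_model.py. After this commit the blank symbol doubles as the start-of-sequence token, both when priming the prediction network at decode time and when prepending labels during training; the token IDs below are made up for illustration only.

    import torch

    blank_id = 0

    # Decode time: prime the prediction network with blank, as
    # beam_search.py does after this patch.
    sos = torch.tensor([blank_id]).reshape(1, 1)  # shape (N=1, U=1)

    # Train time: prepend blank_id to each label sequence, mirroring
    # add_sos(y, sos_id=blank_id) in model.py.
    y = [[2, 5, 7], [3, 9]]                  # hypothetical BPE token IDs
    sos_y = [[blank_id] + utt for utt in y]
    print(sos_y)                             # [[0, 2, 5, 7], [0, 3, 9]]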