diff --git a/egs/librispeech/ASR/conformer_ctc_bn_2d/conformer.py b/egs/librispeech/ASR/conformer_ctc_bn_2d/conformer.py index 448447007..0f6b8ece0 100644 --- a/egs/librispeech/ASR/conformer_ctc_bn_2d/conformer.py +++ b/egs/librispeech/ASR/conformer_ctc_bn_2d/conformer.py @@ -572,10 +572,7 @@ class BidirectionalConformer(nn.Module): tokens_padded = pad_sequence(token_ids_tensors, batch_first=True, padding_value=padding_id).to(positive_embed_shifted.device) - print("tokens_padded = ", tokens_padded) tokens_key_padding_mask = decoder_padding_mask(tokens_padded, ignore_id=padding_id) - print("tokens_key_padding_mask=", tokens_key_padding_mask) - # Let S be the length of the longest sentence (padded) token_embedding = self.token_embed(tokens_padded) * self.token_embed_scale # (N, S) -> (N, S, C) diff --git a/egs/librispeech/ASR/conformer_ctc_bn_2d/train.py b/egs/librispeech/ASR/conformer_ctc_bn_2d/train.py index 670665110..1b4b6f404 100755 --- a/egs/librispeech/ASR/conformer_ctc_bn_2d/train.py +++ b/egs/librispeech/ASR/conformer_ctc_bn_2d/train.py @@ -15,6 +15,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +#--master-port 12344 --world-size 3 --max-duration=200 --bucketing-sampler=True --start-epoch=5 + import argparse import collections @@ -22,7 +24,7 @@ import logging from pathlib import Path import random # temp.. from shutil import copyfile -from typing import Optional, Tuple +from typing import Optional, Tuple, List import k2 import torch @@ -174,6 +176,7 @@ def get_params() -> AttributeDict: "attention_dim": 512, "nhead": 8, "num_trunk_encoder_layers": 12, + "num_ctc_encoder_layers": 2, "num_decoder_layers": 6, "num_reverse_encoder_layers": 4, "num_reverse_decoder_layers": 4, @@ -285,7 +288,7 @@ class LossRecord(collections.defaultdict): # makes undefined items default to int() which is zero. super(LossRecord, self).__init__(int) - def __add__(self, other: LossRecord) -> LossRecord: + def __add__(self, other: 'LossRecord') -> 'LossRecord': ans = LossRecord() for k, v in self.items(): ans[k] = v @@ -293,7 +296,7 @@ class LossRecord(collections.defaultdict): ans[k] = ans[k] + v return ans - def __mul__(self, alpha: float) -> LossRecord: + def __mul__(self, alpha: float) -> 'LossRecord': ans = LossRecord() for k, v in self.items(): ans[k] = v * alpha @@ -303,13 +306,13 @@ class LossRecord(collections.defaultdict): def __str__(self) -> str: ans = '' for k, v in self.norm_items(): - norm_value = '%.2g' % v + norm_value = '%.4g' % v ans += (str(k) + '=' + str(norm_value) + ', ') frames = str(self['frames']) ans += 'over ' + frames + ' frames.' return ans - def norm_items(self) -> List[Tuple[string, float]] + def norm_items(self) -> List[Tuple[str, float]]: """ Returns a list of pairs, like: [('ctc_loss', 0.1), ('att_loss', 0.07)] @@ -320,7 +323,7 @@ class LossRecord(collections.defaultdict): if k != 'frames': norm_value = float(v) / num_frames ans.append((k, norm_value)) - + return ans def reduce(self, device): """ @@ -353,7 +356,7 @@ def compute_loss( batch: dict, graph_compiler: BpeCtcTrainingGraphCompiler, is_training: bool, -) -> Tuple[Tensor, LossRecord] +) -> Tuple[Tensor, LossRecord]: """ Compute loss function (including CTC, attention, and reverse-attention terms). @@ -562,7 +565,7 @@ def train_one_epoch( """ model.train() - tot_loss = LossInfo() + tot_loss = LossRecord() for batch_idx, batch in enumerate(train_dl): params.batch_idx_train += 1 @@ -679,7 +682,7 @@ def run(rank, world_size, args): num_self_predictor_layers=params.num_self_predictor_layers, subsampling_factor=params.subsampling_factor, is_bpe=params.is_bpe, - discretization_tot_classes=params.discretization_tot_clases, + discretization_tot_classes=params.discretization_tot_classes, discretization_num_groups=params.discretization_num_groups, )