diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7_ctc_bs/frame_reducer.py b/egs/librispeech/ASR/pruned_transducer_stateless7_ctc_bs/frame_reducer.py
index 0841f7cf1..1517a494f 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless7_ctc_bs/frame_reducer.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7_ctc_bs/frame_reducer.py
@@ -74,27 +74,31 @@ class FrameReducer(nn.Module):
         padding_mask = make_pad_mask(x_lens)
         non_blank_mask = (ctc_output[:, :, blank_id] < math.log(0.9)) * (~padding_mask)
 
-        if y_lens is not None:
+        if y_lens is not None or self.training is False:
             # Limit the maximum number of reduced frames
-            limit_lens = T - y_lens
+            if y_lens is not None:
+                limit_lens = T - y_lens
+            else:
+                # In eval mode, ensure that completely silent audio does not cause any errors
+                limit_lens = T - torch.ones_like(x_lens)
             max_limit_len = limit_lens.max().int()
             fake_limit_indexes = torch.topk(
                 ctc_output[:, :, blank_id], max_limit_len
             ).indices
-            T = (
+            _T = (
                 torch.arange(max_limit_len)
                 .expand_as(
                     fake_limit_indexes,
                 )
                 .to(device=x.device)
             )
-            T = torch.remainder(T, limit_lens.unsqueeze(1))
-            limit_indexes = torch.gather(fake_limit_indexes, 1, T)
-            limit_mask = torch.full_like(
+            _T = torch.remainder(_T, limit_lens.unsqueeze(1))
+            limit_indexes = torch.gather(fake_limit_indexes, 1, _T)
+            limit_mask = (torch.full_like(
                 non_blank_mask,
-                False,
+                0,
                 device=x.device,
-            ).scatter_(1, limit_indexes, True)
+            ).scatter_(1, limit_indexes, 1) == 1)
 
             non_blank_mask = non_blank_mask | ~limit_mask
 
@@ -108,7 +112,7 @@ class FrameReducer(nn.Module):
             )
             - out_lens
         )
-        max_pad_len = pad_lens_list.max()
+        max_pad_len = int(pad_lens_list.max().item())
 
         out = F.pad(x, (0, 0, 0, max_pad_len))
 
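For reference, the following is a minimal, self-contained sketch of the limit-mask logic touched by the first hunk (the variable renamed from T to _T). All tensor values and sizes here are toy numbers invented for illustration, not taken from the PR; the point is only to show how topk, remainder, gather, and scatter_ combine so that each utterance keeps at least y_lens frames (or at least one frame in eval mode):

    # Toy sketch (illustrative values, not from the PR) of the frame-limiting
    # logic: pick the most-blank frames as removal candidates, but cap the
    # number removed per utterance at limit_lens = T - y_lens.
    import math

    import torch

    N, T = 2, 6
    # Fake per-frame blank log-probabilities; larger means "more blank".
    blank_logp = torch.log(
        torch.tensor(
            [
                [0.99, 0.10, 0.95, 0.20, 0.97, 0.98],
                [0.30, 0.96, 0.99, 0.15, 0.25, 0.97],
            ]
        )
    )
    y_lens = torch.tensor([3, 4])  # pretend label lengths

    limit_lens = T - y_lens  # max frames each utterance may drop
    max_limit_len = int(limit_lens.max())
    # Column indices of the most-blank frames, i.e. the removal candidates.
    fake_limit_indexes = torch.topk(blank_logp, max_limit_len).indices
    _T = torch.arange(max_limit_len).expand_as(fake_limit_indexes)
    # Wrap positions so rows with a smaller limit_lens repeat their first
    # candidates instead of dropping more frames than they are allowed to.
    _T = torch.remainder(_T, limit_lens.unsqueeze(1))
    limit_indexes = torch.gather(fake_limit_indexes, 1, _T)
    # Integer scatter followed by a comparison, mirroring the
    # full_like(..., 0).scatter_(..., 1) == 1 form the hunk switches to.
    limit_mask = (
        torch.zeros_like(blank_logp, dtype=torch.int64).scatter_(1, limit_indexes, 1)
        == 1
    )

    non_blank_mask = blank_logp < math.log(0.9)
    kept = non_blank_mask | ~limit_mask  # frames outside limit_mask are force-kept
    print(kept.sum(dim=1))  # tensor([3, 4]) here, i.e. >= y_lens per utterance

Because the duplicated indices produced by the remainder wrap collapse in scatter_, each row of limit_mask marks exactly limit_lens frames, so ~limit_mask force-keeps the remaining T - limit_lens = y_lens frames; in eval mode with limit_lens = T - 1, at least one frame always survives, which matches the silent-audio safeguard the hunk adds.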