From e49fe2e7c439a1e4145035af7f2f8b4df935ba66 Mon Sep 17 00:00:00 2001 From: drawfish Date: Mon, 22 May 2023 16:46:15 +0800 Subject: [PATCH 1/5] Fixed the issue of errors in fully silent sentences during evaluation. --- .../pruned_transducer_stateless7_ctc_bs/frame_reducer.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7_ctc_bs/frame_reducer.py b/egs/librispeech/ASR/pruned_transducer_stateless7_ctc_bs/frame_reducer.py index 0841f7cf1..9798fd4fe 100644 --- a/egs/librispeech/ASR/pruned_transducer_stateless7_ctc_bs/frame_reducer.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless7_ctc_bs/frame_reducer.py @@ -74,9 +74,13 @@ class FrameReducer(nn.Module): padding_mask = make_pad_mask(x_lens) non_blank_mask = (ctc_output[:, :, blank_id] < math.log(0.9)) * (~padding_mask) - if y_lens is not None: + if y_lens is not None or self.training == False: # Limit the maximum number of reduced frames - limit_lens = T - y_lens + if y_lens is not None: + limit_lens = T - y_lens + else: + # In eval mode, ensure audio that is completely silent does not make any errors + limit_lens = torch.ones_like(x_lens) max_limit_len = limit_lens.max().int() fake_limit_indexes = torch.topk( ctc_output[:, :, blank_id], max_limit_len From 8f4bfc7efd880ca53989e79bb2df7b11e126ac2e Mon Sep 17 00:00:00 2001 From: Yifan Yang <64255737+yfyeung@users.noreply.github.com> Date: Mon, 22 May 2023 16:58:10 +0800 Subject: [PATCH 2/5] Update egs/librispeech/ASR/pruned_transducer_stateless7_ctc_bs/frame_reducer.py Co-authored-by: Fangjun Kuang --- .../ASR/pruned_transducer_stateless7_ctc_bs/frame_reducer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7_ctc_bs/frame_reducer.py b/egs/librispeech/ASR/pruned_transducer_stateless7_ctc_bs/frame_reducer.py index 9798fd4fe..d48887426 100644 --- a/egs/librispeech/ASR/pruned_transducer_stateless7_ctc_bs/frame_reducer.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless7_ctc_bs/frame_reducer.py @@ -74,7 +74,7 @@ class FrameReducer(nn.Module): padding_mask = make_pad_mask(x_lens) non_blank_mask = (ctc_output[:, :, blank_id] < math.log(0.9)) * (~padding_mask) - if y_lens is not None or self.training == False: + if y_lens is not None or self.training is False: # Limit the maximum number of reduced frames if y_lens is not None: limit_lens = T - y_lens From 63c7402297e344deab615f89f7ede8027b42af5e Mon Sep 17 00:00:00 2001 From: drawfish Date: Tue, 23 May 2023 11:51:41 +0800 Subject: [PATCH 3/5] Update frame_reducer.py Fix mistake --- .../ASR/pruned_transducer_stateless7_ctc_bs/frame_reducer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7_ctc_bs/frame_reducer.py b/egs/librispeech/ASR/pruned_transducer_stateless7_ctc_bs/frame_reducer.py index d48887426..671b7565f 100644 --- a/egs/librispeech/ASR/pruned_transducer_stateless7_ctc_bs/frame_reducer.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless7_ctc_bs/frame_reducer.py @@ -80,7 +80,7 @@ class FrameReducer(nn.Module): limit_lens = T - y_lens else: # In eval mode, ensure audio that is completely silent does not make any errors - limit_lens = torch.ones_like(x_lens) + limit_lens = T - torch.ones_like(x_lens) max_limit_len = limit_lens.max().int() fake_limit_indexes = torch.topk( ctc_output[:, :, blank_id], max_limit_len From d5ad908562f432df54ec4592f89e3218bab8940f Mon Sep 17 00:00:00 2001 From: drawfish Date: Mon, 29 May 2023 16:32:49 +0800 Subject: [PATCH 4/5] Add support for export.py --- .../frame_reducer.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7_ctc_bs/frame_reducer.py b/egs/librispeech/ASR/pruned_transducer_stateless7_ctc_bs/frame_reducer.py index 671b7565f..b33b712e4 100644 --- a/egs/librispeech/ASR/pruned_transducer_stateless7_ctc_bs/frame_reducer.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless7_ctc_bs/frame_reducer.py @@ -74,7 +74,7 @@ class FrameReducer(nn.Module): padding_mask = make_pad_mask(x_lens) non_blank_mask = (ctc_output[:, :, blank_id] < math.log(0.9)) * (~padding_mask) - if y_lens is not None or self.training is False: + if y_lens is not None or self.training == False: # Limit the maximum number of reduced frames if y_lens is not None: limit_lens = T - y_lens @@ -93,12 +93,12 @@ class FrameReducer(nn.Module): .to(device=x.device) ) T = torch.remainder(T, limit_lens.unsqueeze(1)) - limit_indexes = torch.gather(fake_limit_indexes, 1, T) - limit_mask = torch.full_like( + limit_indexes = torch.gather(fake_limit_indexes, 1, torch.tensor(T)) + limit_mask = (torch.full_like( non_blank_mask, - False, + 0, device=x.device, - ).scatter_(1, limit_indexes, True) + ).scatter_(1, limit_indexes, 1) == 1) non_blank_mask = non_blank_mask | ~limit_mask @@ -112,7 +112,7 @@ class FrameReducer(nn.Module): ) - out_lens ) - max_pad_len = pad_lens_list.max() + max_pad_len = int(pad_lens_list.max().item()) out = F.pad(x, (0, 0, 0, max_pad_len)) From 4f316e98d0a0c2ed939f748102a45b7803c8379d Mon Sep 17 00:00:00 2001 From: drawfish Date: Tue, 30 May 2023 12:45:27 +0800 Subject: [PATCH 5/5] Update frame_reducer.py to avoid warning on training mode. --- .../pruned_transducer_stateless7_ctc_bs/frame_reducer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7_ctc_bs/frame_reducer.py b/egs/librispeech/ASR/pruned_transducer_stateless7_ctc_bs/frame_reducer.py index b33b712e4..1517a494f 100644 --- a/egs/librispeech/ASR/pruned_transducer_stateless7_ctc_bs/frame_reducer.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless7_ctc_bs/frame_reducer.py @@ -74,7 +74,7 @@ class FrameReducer(nn.Module): padding_mask = make_pad_mask(x_lens) non_blank_mask = (ctc_output[:, :, blank_id] < math.log(0.9)) * (~padding_mask) - if y_lens is not None or self.training == False: + if y_lens is not None or self.training is False: # Limit the maximum number of reduced frames if y_lens is not None: limit_lens = T - y_lens @@ -85,15 +85,15 @@ class FrameReducer(nn.Module): fake_limit_indexes = torch.topk( ctc_output[:, :, blank_id], max_limit_len ).indices - T = ( + _T = ( torch.arange(max_limit_len) .expand_as( fake_limit_indexes, ) .to(device=x.device) ) - T = torch.remainder(T, limit_lens.unsqueeze(1)) - limit_indexes = torch.gather(fake_limit_indexes, 1, torch.tensor(T)) + _T = torch.remainder(_T, limit_lens.unsqueeze(1)) + limit_indexes = torch.gather(fake_limit_indexes, 1, _T) limit_mask = (torch.full_like( non_blank_mask, 0,