fixed formatting issues

2025-09-06 23:54:17 +00:00 · 2023-09-24 17:02:01 +08:00 · 2023-09-24 17:02:01 +08:00 · 39cf318ba8
commit 39cf318ba8
parent 78b2279969
8 changed files with 447 additions and 400 deletions
--- a/egs/librispeech/ASR/streaming_conformer_ctc/conformer.py
+++ b/egs/librispeech/ASR/streaming_conformer_ctc/conformer.py
@@ -230,7 +230,9 @@ class Conformer(Transformer):
                x, pos_emb, mask=mask, src_key_padding_mask=src_key_padding_mask
            )  # (T, B, F)
        else:
-            x = self.encoder(x, pos_emb, src_key_padding_mask=src_key_padding_mask)  # (T, B, F)
+            x = self.encoder(
+                x, pos_emb, src_key_padding_mask=src_key_padding_mask
+            )  # (T, B, F)

        if self.normalize_before:
            x = self.after_norm(x)
--- a/egs/librispeech/ASR/zipformer/decoder.py
+++ b/egs/librispeech/ASR/zipformer/decoder.py
@@ -61,10 +61,15 @@ class Decoder(nn.Module):
        )
        # the balancers are to avoid any drift in the magnitude of the
        # embeddings, which would interact badly with parameter averaging.
-        self.balancer = Balancer(decoder_dim, channel_dim=-1,
-                                 min_positive=0.0, max_positive=1.0,
-                                 min_abs=0.5, max_abs=1.0,
-                                 prob=0.05)
+        self.balancer = Balancer(
+            decoder_dim,
+            channel_dim=-1,
+            min_positive=0.0,
+            max_positive=1.0,
+            min_abs=0.5,
+            max_abs=1.0,
+            prob=0.05,
+        )

        self.blank_id = blank_id

@@ -81,10 +86,15 @@ class Decoder(nn.Module):
                groups=decoder_dim // 4,  # group size == 4
                bias=False,
            )
-            self.balancer2 = Balancer(decoder_dim, channel_dim=-1,
-                                      min_positive=0.0, max_positive=1.0,
-                                      min_abs=0.5, max_abs=1.0,
-                                      prob=0.05)
+            self.balancer2 = Balancer(
+                decoder_dim,
+                channel_dim=-1,
+                min_positive=0.0,
+                max_positive=1.0,
+                min_abs=0.5,
+                max_abs=1.0,
+                prob=0.05,
+            )

    def forward(self, y: torch.Tensor, need_pad: bool = True) -> torch.Tensor:
        """
@@ -107,9 +117,7 @@ class Decoder(nn.Module):
        if self.context_size > 1:
            embedding_out = embedding_out.permute(0, 2, 1)
            if need_pad is True:
-                embedding_out = F.pad(
-                    embedding_out, pad=(self.context_size - 1, 0)
-                )
+                embedding_out = F.pad(embedding_out, pad=(self.context_size - 1, 0))
            else:
                # During inference time, there is no need to do extra padding
                # as we only need one output
--- a/egs/librispeech/ASR/zipformer/joiner.py
+++ b/egs/librispeech/ASR/zipformer/joiner.py
@@ -52,12 +52,13 @@ class Joiner(nn.Module):
        Returns:
          Return a tensor of shape (N, T, s_range, C).
        """
-        assert encoder_out.ndim == decoder_out.ndim, (encoder_out.shape, decoder_out.shape)
+        assert encoder_out.ndim == decoder_out.ndim, (
+            encoder_out.shape,
+            decoder_out.shape,
+        )

        if project_input:
-            logit = self.encoder_proj(encoder_out) + self.decoder_proj(
-                decoder_out
-            )
+            logit = self.encoder_proj(encoder_out) + self.decoder_proj(decoder_out)
        else:
            logit = encoder_out + decoder_out

--- a/egs/librispeech/ASR/zipformer/onnx_decode.py
+++ b/egs/librispeech/ASR/zipformer/onnx_decode.py
@@ -303,7 +303,9 @@ def main():

    for test_set, test_dl in zip(test_sets, test_dl):
        start_time = time.time()
-        results, total_duration = decode_dataset(dl=test_dl, model=model, token_table=token_table)
+        results, total_duration = decode_dataset(
+            dl=test_dl, model=model, token_table=token_table
+        )
        end_time = time.time()
        elapsed_seconds = end_time - start_time
        rtf = elapsed_seconds / total_duration
--- a/egs/librispeech/ASR/zipformer/profile.py
+++ b/egs/librispeech/ASR/zipformer/profile.py
@@ -100,17 +100,13 @@ class Model(nn.Module):
        self.encoder_embed = encoder_embed
        self.encoder_proj = encoder_proj

-    def forward(
-        self, feature: Tensor, feature_lens: Tensor
-    ) -> Tuple[Tensor, Tensor]:
+    def forward(self, feature: Tensor, feature_lens: Tensor) -> Tuple[Tensor, Tensor]:
        x, x_lens = self.encoder_embed(feature, feature_lens)

        src_key_padding_mask = make_pad_mask(x_lens)
        x = x.permute(1, 0, 2)  # (N, T, C) -> (T, N, C)

-        encoder_out, encoder_out_lens = self.encoder(
-            x, x_lens, src_key_padding_mask
-        )
+        encoder_out, encoder_out_lens = self.encoder(x, x_lens, src_key_padding_mask)

        encoder_out = encoder_out.permute(1, 0, 2)  # (N, T, C) -> (T, N, C)
        logits = self.encoder_proj(encoder_out)
@@ -168,9 +164,7 @@ def main():


 if __name__ == "__main__":
-    formatter = (
-        "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
-    )
+    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    logging.basicConfig(format=formatter, level=logging.INFO)

    main()
--- a/egs/librispeech/ASR/zipformer/scaling.py
+++ b/egs/librispeech/ASR/zipformer/scaling.py
--- a/egs/librispeech/ASR/zipformer/streaming_decode.py
+++ b/egs/librispeech/ASR/zipformer/streaming_decode.py
@@ -282,9 +282,7 @@ def stack_states(state_list: List[List[torch.Tensor]]) -> List[torch.Tensor]:
    )
    batch_states.append(cached_embed_left_pad)

-    processed_lens = torch.cat(
-        [state_list[i][-1] for i in range(batch_size)], dim=0
-    )
+    processed_lens = torch.cat([state_list[i][-1] for i in range(batch_size)], dim=0)
    batch_states.append(processed_lens)

    return batch_states
@@ -322,9 +320,7 @@ def unstack_states(batch_states: List[Tensor]) -> List[List[Tensor]]:
    for layer in range(tot_num_layers):
        layer_offset = layer * 6
        # cached_key: (left_context_len, batch_size, key_dim)
-        cached_key_list = batch_states[layer_offset].chunk(
-            chunks=batch_size, dim=1
-        )
+        cached_key_list = batch_states[layer_offset].chunk(chunks=batch_size, dim=1)
        # cached_nonlin_attn: (num_heads, batch_size, left_context_len, head_dim)
        cached_nonlin_attn_list = batch_states[layer_offset + 1].chunk(
            chunks=batch_size, dim=1
@@ -355,9 +351,7 @@ def unstack_states(batch_states: List[Tensor]) -> List[List[Tensor]]:
                cached_conv2_list[i],
            ]

-    cached_embed_left_pad_list = batch_states[-2].chunk(
-        chunks=batch_size, dim=0
-    )
+    cached_embed_left_pad_list = batch_states[-2].chunk(chunks=batch_size, dim=0)
    for i in range(batch_size):
        state_list[i].append(cached_embed_left_pad_list[i])

@@ -380,11 +374,7 @@ def streaming_forward(
    Returns encoder outputs, output lengths, and updated states.
    """
    cached_embed_left_pad = states[-2]
-    (
-        x,
-        x_lens,
-        new_cached_embed_left_pad,
-    ) = model.encoder_embed.streaming_forward(
+    (x, x_lens, new_cached_embed_left_pad,) = model.encoder_embed.streaming_forward(
        x=features,
        x_lens=feature_lens,
        cached_left_pad=cached_embed_left_pad,
@@ -404,9 +394,7 @@ def streaming_forward(
    new_processed_lens = processed_lens + x_lens

    # (batch, left_context_size + chunk_size)
-    src_key_padding_mask = torch.cat(
-        [processed_mask, src_key_padding_mask], dim=1
-    )
+    src_key_padding_mask = torch.cat([processed_mask, src_key_padding_mask], dim=1)

    x = x.permute(1, 0, 2)  # (N, T, C) -> (T, N, C)
    encoder_states = states[:-2]
@@ -494,9 +482,7 @@ def decode_one_chunk(
    encoder_out = model.joiner.encoder_proj(encoder_out)

    if params.decoding_method == "greedy_search":
-        greedy_search(
-            model=model, encoder_out=encoder_out, streams=decode_streams
-        )
+        greedy_search(model=model, encoder_out=encoder_out, streams=decode_streams)
    elif params.decoding_method == "fast_beam_search":
        processed_lens = torch.tensor(processed_lens, device=device)
        processed_lens = processed_lens + encoder_out_lens
@@ -517,9 +503,7 @@ def decode_one_chunk(
            num_active_paths=params.num_active_paths,
        )
    else:
-        raise ValueError(
-            f"Unsupported decoding method: {params.decoding_method}"
-        )
+        raise ValueError(f"Unsupported decoding method: {params.decoding_method}")

    states = unstack_states(new_states)

@@ -577,9 +561,7 @@ def decode_dataset(
    decode_streams = []
    for num, cut in enumerate(cuts):
        # each utterance has a DecodeStream.
-        initial_states = get_init_states(
-            model=model, batch_size=1, device=device
-        )
+        initial_states = get_init_states(model=model, batch_size=1, device=device)
        decode_stream = DecodeStream(
            params=params,
            cut_id=cut.id,
@@ -649,9 +631,7 @@ def decode_dataset(
    elif params.decoding_method == "modified_beam_search":
        key = f"num_active_paths_{params.num_active_paths}"
    else:
-        raise ValueError(
-            f"Unsupported decoding method: {params.decoding_method}"
-        )
+        raise ValueError(f"Unsupported decoding method: {params.decoding_method}")
    return {key: decode_results}


@@ -684,8 +664,7 @@ def save_results(

    test_set_wers = sorted(test_set_wers.items(), key=lambda x: x[1])
    errs_info = (
-        params.res_dir
-        / f"wer-summary-{test_set_name}-{key}-{params.suffix}.txt"
+        params.res_dir / f"wer-summary-{test_set_name}-{key}-{params.suffix}.txt"
    )
    with open(errs_info, "w") as f:
        print("settings\tWER", file=f)
@@ -718,9 +697,7 @@ def main():
        params.suffix = f"epoch-{params.epoch}-avg-{params.avg}"

    assert params.causal, params.causal
-    assert (
-        "," not in params.chunk_size
-    ), "chunk_size should be one value in decoding."
+    assert "," not in params.chunk_size, "chunk_size should be one value in decoding."
    assert (
        "," not in params.left_context_frames
    ), "left_context_frames should be one value in decoding."
@@ -760,9 +737,9 @@ def main():

    if not params.use_averaged_model:
        if params.iter > 0:
-            filenames = find_checkpoints(
-                params.exp_dir, iteration=-params.iter
-            )[: params.avg]
+            filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
+                : params.avg
+            ]
            if len(filenames) == 0:
                raise ValueError(
                    f"No checkpoints found for"
@@ -789,9 +766,9 @@ def main():
            model.load_state_dict(average_checkpoints(filenames, device=device))
    else:
        if params.iter > 0:
-            filenames = find_checkpoints(
-                params.exp_dir, iteration=-params.iter
-            )[: params.avg + 1]
+            filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
+                : params.avg + 1
+            ]
            if len(filenames) == 0:
                raise ValueError(
                    f"No checkpoints found for"
--- a/egs/librispeech/ASR/zipformer/subsampling.py
+++ b/egs/librispeech/ASR/zipformer/subsampling.py
@@ -107,9 +107,7 @@ class ConvNeXt(nn.Module):
        if layerdrop_rate != 0.0:
            batch_size = x.shape[0]
            mask = (
-                torch.rand(
-                    (batch_size, 1, 1, 1), dtype=x.dtype, device=x.device
-                )
+                torch.rand((batch_size, 1, 1, 1), dtype=x.dtype, device=x.device)
                > layerdrop_rate
            )
        else:
@@ -278,9 +276,7 @@ class Conv2dSubsampling(nn.Module):
        # many copies of this extra gradient term.
        self.out_whiten = Whiten(
            num_groups=1,
-            whitening_limit=ScheduledFloat(
-                (0.0, 4.0), (20000.0, 8.0), default=4.0
-            ),
+            whitening_limit=ScheduledFloat((0.0, 4.0), (20000.0, 8.0), default=4.0),
            prob=(0.025, 0.25),
            grad_scale=0.02,
        )
@@ -403,8 +399,8 @@ class Conv2dSubsampling(nn.Module):
        left_pad = self.convnext.padding[0]
        freq = self.out_width
        channels = self.layer3_channels
-        cached_embed_left_pad = torch.zeros(
-            batch_size, channels, left_pad, freq
-        ).to(device)
+        cached_embed_left_pad = torch.zeros(batch_size, channels, left_pad, freq).to(
+            device
+        )

        return cached_embed_left_pad